From d02b0d86e25d700ce4f4cb0eccac1f743aea6986 Mon Sep 17 00:00:00 2001 From: sean-sn Date: Mon, 22 Jun 2020 11:31:22 -0400 Subject: [PATCH] initial commit --- .gitignore | 62 + LICENSE | 201 ++ README.md | 98 + bindings/blst.h | 332 ++ bindings/blst.swg | 105 + bindings/blst_aux.h | 48 + bindings/go/README.md | 55 + bindings/go/assembly.S | 53 + bindings/go/blst.go | 1387 ++++++++ bindings/go/blst.tgo | 135 + bindings/go/blst_minpk.tgo | 424 +++ bindings/go/blst_minpk_test.go | 349 ++ bindings/go/blst_minsig_test.go | 353 ++ bindings/go/blst_misc.tgo | 159 + bindings/go/blst_px.tgo | 122 + bindings/go/generate.py | 122 + bindings/go/server.c | 20 + bindings/rust/Cargo.toml | 30 + bindings/rust/README.md | 55 + bindings/rust/benches/blst_benches.rs | 426 +++ bindings/rust/build.rs | 65 + bindings/rust/rustfmt.toml | 1 + bindings/rust/src/lib.rs | 1378 ++++++++ blst_logo_small.png | Bin 0 -> 10176 bytes build.bat | 11 + build.sh | 64 + build/coff/add_mod_256-x86_64.s | 643 ++++ build/coff/add_mod_384-x86_64.s | 2154 ++++++++++++ build/coff/add_mod_384x384-x86_64.s | 326 ++ build/coff/inverse_mod_384-x86_64.s | 412 +++ build/coff/mulq_mont_256-x86_64.s | 872 +++++ build/coff/mulq_mont_384-x86_64.s | 4205 +++++++++++++++++++++++ build/coff/mulx_mont_256-x86_64.s | 784 +++++ build/coff/mulx_mont_384-x86_64.s | 3560 ++++++++++++++++++++ build/coff/sha256-x86_64.s | 1560 +++++++++ build/elf/add_mod_256-x86_64.s | 404 +++ build/elf/add_mod_384-x86_64.s | 1485 +++++++++ build/elf/add_mod_384x384-x86_64.s | 252 ++ build/elf/inverse_mod_384-x86_64.s | 378 +++ build/elf/mulq_mont_256-x86_64.s | 714 ++++ build/elf/mulq_mont_384-x86_64.s | 3619 ++++++++++++++++++++ build/elf/mulx_mont_256-x86_64.s | 627 ++++ build/elf/mulx_mont_384-x86_64.s | 2969 +++++++++++++++++ build/elf/sha256-x86_64.s | 1446 ++++++++ build/mach-o/add_mod_256-x86_64.s | 396 +++ build/mach-o/add_mod_384-x86_64.s | 1477 +++++++++ build/mach-o/add_mod_384x384-x86_64.s | 244 ++ build/mach-o/inverse_mod_384-x86_64.s | 370 +++ build/mach-o/mulq_mont_256-x86_64.s | 706 ++++ build/mach-o/mulq_mont_384-x86_64.s | 3611 ++++++++++++++++++++ build/mach-o/mulx_mont_256-x86_64.s | 619 ++++ build/mach-o/mulx_mont_384-x86_64.s | 2961 +++++++++++++++++ build/mach-o/sha256-x86_64.s | 1438 ++++++++ build/refresh.sh | 13 + build/win64/add_mod_256-x86_64.asm | 660 ++++ build/win64/add_mod_384-x86_64.asm | 2191 ++++++++++++ build/win64/add_mod_384x384-x86_64.asm | 334 ++ build/win64/inverse_mod_384-x86_64.asm | 419 +++ build/win64/mulq_mont_256-x86_64.asm | 884 +++++ build/win64/mulq_mont_384-x86_64.asm | 4232 ++++++++++++++++++++++++ build/win64/mulx_mont_256-x86_64.asm | 796 +++++ build/win64/mulx_mont_384-x86_64.asm | 3587 ++++++++++++++++++++ build/win64/sha256-x86_64.asm | 1570 +++++++++ src/aggregate.c | 435 +++ src/asm/add_mod_256-x86_64.pl | 392 +++ src/asm/add_mod_384-x86_64.pl | 1420 ++++++++ src/asm/add_mod_384x384-x86_64.pl | 260 ++ src/asm/inverse_mod_384-x86_64.pl | 411 +++ src/asm/mulq_mont_256-x86_64.pl | 517 +++ src/asm/mulq_mont_384-x86_64.pl | 2674 +++++++++++++++ src/asm/mulx_mont_256-x86_64.pl | 472 +++ src/asm/mulx_mont_384-x86_64.pl | 2385 +++++++++++++ src/asm/sha256-x86_64.pl | 788 +++++ src/asm/x86_64-xlate.pl | 1779 ++++++++++ src/client_min_pk.c | 16 + src/client_min_sig.c | 16 + src/consts.c | 36 + src/consts.h | 28 + src/e1.c | 390 +++ src/e2.c | 434 +++ src/ec_mult.h | 287 ++ src/ec_ops.h | 462 +++ src/errors.h | 13 + src/exp.c | 145 + src/exp2.c | 179 + src/exports.c | 452 +++ src/fields.h | 96 + src/fp12_tower.c | 785 
+++++ src/hash_to_field.c | 136 + src/keygen.c | 161 + src/map_to_g1.c | 533 +++ src/map_to_g2.c | 496 +++ src/pairing.c | 407 +++ src/point.h | 66 + src/recip-addchain.h | 489 +++ src/server.c | 20 + src/sha256.h | 130 + src/sqrt-addchain.h | 489 +++ src/sqrt2-addchain.h | 922 ++++++ src/vect.c | 128 + src/vect.h | 371 +++ 101 files changed, 78193 insertions(+) create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 README.md create mode 100644 bindings/blst.h create mode 100644 bindings/blst.swg create mode 100644 bindings/blst_aux.h create mode 100644 bindings/go/README.md create mode 100644 bindings/go/assembly.S create mode 100644 bindings/go/blst.go create mode 100644 bindings/go/blst.tgo create mode 100644 bindings/go/blst_minpk.tgo create mode 100644 bindings/go/blst_minpk_test.go create mode 100644 bindings/go/blst_minsig_test.go create mode 100644 bindings/go/blst_misc.tgo create mode 100644 bindings/go/blst_px.tgo create mode 100755 bindings/go/generate.py create mode 100644 bindings/go/server.c create mode 100644 bindings/rust/Cargo.toml create mode 100644 bindings/rust/README.md create mode 100644 bindings/rust/benches/blst_benches.rs create mode 100644 bindings/rust/build.rs create mode 100644 bindings/rust/rustfmt.toml create mode 100644 bindings/rust/src/lib.rs create mode 100644 blst_logo_small.png create mode 100644 build.bat create mode 100755 build.sh create mode 100644 build/coff/add_mod_256-x86_64.s create mode 100644 build/coff/add_mod_384-x86_64.s create mode 100644 build/coff/add_mod_384x384-x86_64.s create mode 100644 build/coff/inverse_mod_384-x86_64.s create mode 100644 build/coff/mulq_mont_256-x86_64.s create mode 100644 build/coff/mulq_mont_384-x86_64.s create mode 100644 build/coff/mulx_mont_256-x86_64.s create mode 100644 build/coff/mulx_mont_384-x86_64.s create mode 100644 build/coff/sha256-x86_64.s create mode 100644 build/elf/add_mod_256-x86_64.s create mode 100644 build/elf/add_mod_384-x86_64.s create mode 100644 build/elf/add_mod_384x384-x86_64.s create mode 100644 build/elf/inverse_mod_384-x86_64.s create mode 100644 build/elf/mulq_mont_256-x86_64.s create mode 100644 build/elf/mulq_mont_384-x86_64.s create mode 100644 build/elf/mulx_mont_256-x86_64.s create mode 100644 build/elf/mulx_mont_384-x86_64.s create mode 100644 build/elf/sha256-x86_64.s create mode 100644 build/mach-o/add_mod_256-x86_64.s create mode 100644 build/mach-o/add_mod_384-x86_64.s create mode 100644 build/mach-o/add_mod_384x384-x86_64.s create mode 100644 build/mach-o/inverse_mod_384-x86_64.s create mode 100644 build/mach-o/mulq_mont_256-x86_64.s create mode 100644 build/mach-o/mulq_mont_384-x86_64.s create mode 100644 build/mach-o/mulx_mont_256-x86_64.s create mode 100644 build/mach-o/mulx_mont_384-x86_64.s create mode 100644 build/mach-o/sha256-x86_64.s create mode 100755 build/refresh.sh create mode 100644 build/win64/add_mod_256-x86_64.asm create mode 100644 build/win64/add_mod_384-x86_64.asm create mode 100644 build/win64/add_mod_384x384-x86_64.asm create mode 100644 build/win64/inverse_mod_384-x86_64.asm create mode 100644 build/win64/mulq_mont_256-x86_64.asm create mode 100644 build/win64/mulq_mont_384-x86_64.asm create mode 100644 build/win64/mulx_mont_256-x86_64.asm create mode 100644 build/win64/mulx_mont_384-x86_64.asm create mode 100644 build/win64/sha256-x86_64.asm create mode 100644 src/aggregate.c create mode 100755 src/asm/add_mod_256-x86_64.pl create mode 100755 src/asm/add_mod_384-x86_64.pl create mode 100755 src/asm/add_mod_384x384-x86_64.pl 
create mode 100755 src/asm/inverse_mod_384-x86_64.pl create mode 100755 src/asm/mulq_mont_256-x86_64.pl create mode 100755 src/asm/mulq_mont_384-x86_64.pl create mode 100755 src/asm/mulx_mont_256-x86_64.pl create mode 100755 src/asm/mulx_mont_384-x86_64.pl create mode 100755 src/asm/sha256-x86_64.pl create mode 100755 src/asm/x86_64-xlate.pl create mode 100644 src/client_min_pk.c create mode 100644 src/client_min_sig.c create mode 100644 src/consts.c create mode 100644 src/consts.h create mode 100644 src/e1.c create mode 100644 src/e2.c create mode 100644 src/ec_mult.h create mode 100644 src/ec_ops.h create mode 100644 src/errors.h create mode 100644 src/exp.c create mode 100644 src/exp2.c create mode 100644 src/exports.c create mode 100644 src/fields.h create mode 100644 src/fp12_tower.c create mode 100644 src/hash_to_field.c create mode 100644 src/keygen.c create mode 100644 src/map_to_g1.c create mode 100644 src/map_to_g2.c create mode 100644 src/pairing.c create mode 100644 src/point.h create mode 100644 src/recip-addchain.h create mode 100644 src/server.c create mode 100644 src/sha256.h create mode 100644 src/sqrt-addchain.h create mode 100644 src/sqrt2-addchain.h create mode 100644 src/vect.c create mode 100644 src/vect.h diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..f50c8b3b --- /dev/null +++ b/.gitignore @@ -0,0 +1,62 @@ +# Prerequisites +*.d + +# Object files +*.o +*.ko +*.obj +*.elf + +# Linker output +*.ilk +*.map +*.exp + +# Precompiled Headers +*.gch +*.pch + +# Libraries +*.lib +*.a +*.la +*.lo + +# Shared objects (inc. Windows DLLs) +*.dll +*.so +*.so.* +*.dylib + +# Executables +*.exe +*.out +*.app +*.i*86 +*.x86_64 +*.hex + +# Debug files +*.dSYM/ +*.su +*.idb +*.pdb + +# Kernel Module Compile Results +*.mod* +*.cmd +.tmp_versions/ +modules.order +Module.symvers +Mkfile.old +dkms.conf + +# Open swap files +*.swp + +# Emacs backup files +*~ + +# Rust build +Cargo.lock +bindings/rust/target diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..261eeb9e --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md new file mode 100644 index 00000000..4e3d95f2 --- /dev/null +++ b/README.md @@ -0,0 +1,98 @@ +
+ +
+
+# blst (pronounced 'blast')
+A BLS12-381 signature library written in C and assembly, focused on performance and security.
+
+## Status
+**This library has not yet been audited. Use at your own risk.**
+
+Compliant with IETF draft specifications:
+- [IETF BLS Signature V2](https://tools.ietf.org/html/draft-irtf-cfrg-bls-signature-02)
+- [IETF Hash-to-Curve V8](https://tools.ietf.org/html/draft-irtf-cfrg-hash-to-curve-08)
+
+Support for x86_64
+
+Support for Linux, Mac, and Windows
+- Limited testing on Mac and Windows
+
+Explicit bindings for other languages
+- Go
+- Rust
+
+Bindings for other languages provided using [swig](http://swig.org)
+- tested with Python
+
+Support for ARM64 is coming soon
+
+Formal verification will be rolling into various components of the library over the coming months, utilizing [cryptol](https://www.cryptol.net) and [coq](https://coq.inria.fr/)
+- Field, curve, and bulk signature operations
+
+## API
+The blst API is defined in the C header [bindings/blst.h](https://github.com/supranational/blst/blob/master/bindings/blst.h). The API can be categorized as follows, with some example operations:
+- Field (add, sub, mul, neg, inv, to/from Montgomery)
+- Curve (add, double, mul, to/from affine, group check)
+- Intermediate (hash to curve, pairing, serdes)
+- BLS12-381 signature core (sign, verify, aggregate)
+
+Note there is also an auxiliary header file, [bindings/blst_aux.h](https://github.com/supranational/blst/blob/master/bindings/blst_aux.h), that is used as a staging area for experimental interfaces that may or may not get promoted to blst.h.
+
+## Build
+The build process is very simple and only requires a C compiler and Perl, although the pre-built assembly files that are provided can remove the need for Perl.
+
+### C static library
+A static library called libblst.a can be built in a working directory of the user's choice.
+
+Linux, Mac, and Windows (in MinGW or Cygwin environments)
+```
+/some/where/build.sh
+```
+
+Windows (Visual C)
+```
+\some\where\build.bat
+```
+
+## Bindings
+Bindings to other languages that implement the minimal-signature-size and minimal-pubkey-size variants of the BLS signature specification are provided as follows:
+
+### Go [src](https://github.com/supranational/blst/tree/master/bindings/go)
+TODO - basic details
+
+For more details see the Go binding [readme](https://github.com/supranational/blst/tree/master/bindings/go/README.md).
+
+### Rust [src](https://github.com/supranational/blst/tree/master/bindings/rust)
+TODO - publish crate
+
+`blst` is the Rust binding crate.
+
+To use the min-pk version:
+```
+use blst::min_pk::*;
+```
+
+To use the min-sig version:
+```
+use blst::min_sig::*;
+```
+
+For more details see the Rust binding [readme](https://github.com/supranational/blst/tree/master/bindings/rust/README.md).
+
+### Others
+TODO - example swig build/usage
+
+## General notes on implementation
+The goal of the blst library is to provide a foundational component for applications and other libraries that require high performance and formally verified BLS12-381 operations. With that in mind, some decisions are made to maximize the public good beyond BLS12-381. For example, the field operations are optimized for general 384-bit usage, as opposed to being tuned specifically for the 381-bit BLS12-381 curve parameters. With the formal verification of these foundational components, we believe they can provide a reliable building block for other curves that would like high performance and an extra element of security.
+
+The library deliberately abstains from dealing with memory management and multi-threading, with the rationale that these ultimately belong in language-/run-time-specific bindings.
+
+The assembly code is written in Perl scripts which can output an assembly file based on the [ABI](https://en.wikipedia.org/wiki/Application_binary_interface) and operating system. For example, in the build directory there are pre-built assembly files for elf, mingw64, masm, and macosx. See [build.sh](https://github.com/supranational/blst/blob/master/build.sh) or [refresh.sh](https://github.com/supranational/blst/blob/master/build/refresh.sh) for usage. This method allows for simple reuse of optimized assembly across various platforms with minimal effort.
+
+Serialization formatting is implemented according to [Appendix A. BLS12-381](https://tools.ietf.org/html/draft-irtf-cfrg-bls-signature-02#appendix-A) of the IETF spec, which calls for using the [Zcash definition](https://github.com/zkcrypto/pairing/blob/master/src/bls12_381/README.md#serialization).
+
+## Performance
+Currently both the Go and Rust bindings provide benchmarks for a variety of signature-related operations.
+
+## License
+The blst library is licensed under the [Apache License Version 2.0](LICENSE) software license.
diff --git a/bindings/blst.h b/bindings/blst.h
new file mode 100644
index 00000000..bb62b28f
--- /dev/null
+++ b/bindings/blst.h
@@ -0,0 +1,332 @@
+/*
+ * Copyright Supranational LLC
+ * Licensed under the Apache License, Version 2.0, see LICENSE for details.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef __BLST_H__
+#define __BLST_H__
+
+#ifdef __SIZE_TYPE__
+typedef __SIZE_TYPE__ size_t;
+#else
+#include <stddef.h>
+#endif
+
+#if defined(__UINT8_TYPE__) && defined(__UINT32_TYPE__) \
+ && defined(__UINT64_TYPE__)
+typedef __UINT8_TYPE__ uint8_t;
+typedef __UINT32_TYPE__ uint32_t;
+typedef __UINT64_TYPE__ uint64_t;
+#else
+#include <stdint.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#elif defined(__BLST_CGO__)
+typedef _Bool bool; /* it's assumed that cgo calls a modern enough compiler */
+#elif defined(__STDC_VERSION__) && __STDC_VERSION__>=199901
+# define bool _Bool
+#else
+# define bool int
+#endif
+
+#ifdef SWIG
+# define DEFNULL =NULL
+#else
+# define DEFNULL
+#endif
+
+typedef enum {
+ BLST_SUCCESS = 0,
+ BLST_BAD_ENCODING,
+ BLST_POINT_NOT_ON_CURVE,
+ BLST_POINT_NOT_IN_GROUP,
+ BLST_AGGR_TYPE_MISMATCH,
+ BLST_VERIFY_FAIL,
+} BLST_ERROR;
+
+typedef uint8_t byte;
+typedef uint64_t limb_t;
+
+typedef struct { limb_t l[256/8/sizeof(limb_t)]; } blst_scalar;
+typedef struct { limb_t l[256/8/sizeof(limb_t)]; } blst_fr;
+typedef struct { limb_t l[384/8/sizeof(limb_t)]; } blst_fp;
+/* 0 is "real" part, 1 is "imaginary" */
+typedef struct { blst_fp fp[2]; } blst_fp2;
+typedef struct { blst_fp2 fp2[3]; } blst_fp6;
+typedef struct { blst_fp6 fp6[2]; } blst_fp12;
+
+#ifndef SWIG
+void blst_scalar_from_uint32(blst_scalar *ret, const uint32_t a[8]);
+void blst_uint32_from_scalar(uint32_t ret[8], const blst_scalar *a);
+void blst_scalar_from_uint64(blst_scalar *ret, const uint64_t a[4]);
+void blst_uint64_from_scalar(uint64_t ret[4], const blst_scalar *a);
+void blst_scalar_from_bendian(blst_scalar *ret, const byte a[32]);
+void blst_bendian_from_scalar(byte ret[32], const blst_scalar *a);
+void blst_scalar_from_lendian(blst_scalar *ret, const byte a[32]);
+void blst_lendian_from_scalar(byte ret[32], const blst_scalar *a);
+bool blst_scalar_fr_check(const blst_scalar *a);
+
+/*
+ * BLS12-381-specific Fr operations.
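+ * (Fr is the scalar field of the BLS12-381 groups, i.e. integers modulo the
+ * prime group order r; blst_fr_to and blst_fr_from convert to and from the
+ * Montgomery representation used by the arithmetic below.)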
+ */ +void blst_fr_add(blst_fr *ret, const blst_fr *a, const blst_fr *b); +void blst_fr_sub(blst_fr *ret, const blst_fr *a, const blst_fr *b); +void blst_fr_mul_by_3(blst_fr *ret, const blst_fr *a); +void blst_fr_lshift(blst_fr *ret, const blst_fr *a, size_t count); +void blst_fr_rshift(blst_fr *ret, const blst_fr *a, size_t count); +void blst_fr_mul(blst_fr *ret, const blst_fr *a, const blst_fr *b); +void blst_fr_sqr(blst_fr *ret, const blst_fr *a); +void blst_fr_cneg(blst_fr *ret, const blst_fr *a, size_t flag); +void blst_fr_to(blst_fr *ret, const blst_fr *a); +void blst_fr_from(blst_fr *ret, const blst_fr *a); + +/* + * BLS12-381-specifc Fp operations. + */ +void blst_fp_add(blst_fp *ret, const blst_fp *a, const blst_fp *b); +void blst_fp_sub(blst_fp *ret, const blst_fp *a, const blst_fp *b); +void blst_fp_mul_by_3(blst_fp *ret, const blst_fp *a); +void blst_fp_mul_by_8(blst_fp *ret, const blst_fp *a); +void blst_fp_lshift(blst_fp *ret, const blst_fp *a, size_t count); +void blst_fp_mul(blst_fp *ret, const blst_fp *a, const blst_fp *b); +void blst_fp_sqr(blst_fp *ret, const blst_fp *a); +void blst_fp_cneg(blst_fp *ret, const blst_fp *a, size_t flag); +void blst_fp_eucl_inverse(blst_fp *ret, const blst_fp *a); +void blst_fp_to(blst_fp *ret, const blst_fp *a); +void blst_fp_from(blst_fp *ret, const blst_fp *a); + +void blst_fp_from_uint32(blst_fp *ret, const uint32_t a[12]); +void blst_uint32_from_fp(uint32_t ret[12], const blst_fp *a); +void blst_fp_from_uint64(blst_fp *ret, const uint64_t a[6]); +void blst_uint64_from_fp(uint64_t ret[6], const blst_fp *a); +void blst_fp_from_bendian(blst_fp *ret, const byte a[48]); +void blst_bendian_from_fp(byte ret[48], const blst_fp *a); +void blst_fp_from_lendian(blst_fp *ret, const byte a[48]); +void blst_lendian_from_fp(byte ret[48], const blst_fp *a); + +/* + * BLS12-381-specifc Fp2 operations. + */ +void blst_fp2_add(blst_fp2 *ret, const blst_fp2 *a, const blst_fp2 *b); +void blst_fp2_sub(blst_fp2 *ret, const blst_fp2 *a, const blst_fp2 *b); +void blst_fp2_mul_by_3(blst_fp2 *ret, const blst_fp2 *a); +void blst_fp2_mul_by_8(blst_fp2 *ret, const blst_fp2 *a); +void blst_fp2_lshift(blst_fp2 *ret, const blst_fp2 *a, size_t count); +void blst_fp2_mul(blst_fp2 *ret, const blst_fp2 *a, const blst_fp2 *b); +void blst_fp2_sqr(blst_fp2 *ret, const blst_fp2 *a); +void blst_fp2_cneg(blst_fp2 *ret, const blst_fp2 *a, size_t flag); + +/* + * BLS12-381-specifc Fp12 operations. + */ +void blst_fp12_sqr(blst_fp12 *ret, const blst_fp12 *a); +void blst_fp12_cyclotomic_sqr(blst_fp12 *ret, const blst_fp12 *a); +void blst_fp12_mul(blst_fp12 *ret, const blst_fp12 *a, const blst_fp12 *b); +void blst_fp12_mul_by_xy00z0(blst_fp12 *ret, const blst_fp12 *a, + const blst_fp6 *xy00z0); +void blst_fp12_conjugate(blst_fp12 *a); +void blst_fp12_inverse(blst_fp12 *ret, const blst_fp12 *a); +/* caveat lector! |n| has to be non-zero and not more than 3! */ +void blst_fp12_frobenius_map(blst_fp12 *ret, const blst_fp12 *a, size_t n); +bool blst_fp12_is_equal(const blst_fp12 *a, const blst_fp12 *b); +bool blst_fp12_is_one(const blst_fp12 *a); +#endif // SWIG + +/* + * BLS12-381-specifc point operations. 
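+ * (blst_p1/blst_p2 below carry a third, projective Z coordinate, while the
+ * *_affine variants hold plain (x, y); the *_affine_in_g1/_in_g2 calls are
+ * the subgroup membership checks, and *_affine_on_curve only checks the
+ * curve equation.)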
+ */ +typedef struct { blst_fp x, y, z; } blst_p1; +typedef struct { blst_fp x, y; } blst_p1_affine; + +void blst_p1_add(blst_p1 *out, const blst_p1 *a, const blst_p1 *b); +void blst_p1_add_or_double(blst_p1 *out, const blst_p1 *a, const blst_p1 *b); +void blst_p1_add_affine(blst_p1 *out, const blst_p1 *a, + const blst_p1_affine *b); +void blst_p1_add_or_double_affine(blst_p1 *out, const blst_p1 *a, + const blst_p1_affine *b); +void blst_p1_double(blst_p1 *out, const blst_p1 *a); +void blst_p1_mult_w5(blst_p1 *out, const blst_p1 *p, + const blst_scalar *scalar, size_t nbits); +void blst_p1_cneg(blst_p1 *p, size_t cbit); +void blst_p1_to_affine(blst_p1_affine *out, const blst_p1 *in); +void blst_p1_from_affine(blst_p1 *out, const blst_p1_affine *in); +bool blst_p1_affine_on_curve(const blst_p1_affine *p); +bool blst_p1_affine_in_g1(const blst_p1_affine *p); +bool blst_p1_affine_is_equal(const blst_p1_affine *a, const blst_p1_affine *b); + +typedef struct { blst_fp2 x, y, z; } blst_p2; +typedef struct { blst_fp2 x, y; } blst_p2_affine; + +void blst_p2_add(blst_p2 *out, const blst_p2 *a, const blst_p2 *b); +void blst_p2_add_or_double(blst_p2 *out, const blst_p2 *a, const blst_p2 *b); +void blst_p2_add_affine(blst_p2 *out, const blst_p2 *a, + const blst_p2_affine *b); +void blst_p2_add_or_double_affine(blst_p2 *out, const blst_p2 *a, + const blst_p2_affine *b); +void blst_p2_double(blst_p2 *out, const blst_p2 *a); +void blst_p2_mult_w5(blst_p2 *out, const blst_p2 *p, + const blst_scalar *scalar, size_t nbits); +void blst_p2_cneg(blst_p2 *p, size_t cbit); +void blst_p2_to_affine(blst_p2_affine *out, const blst_p2 *in); +void blst_p2_from_affine(blst_p2 *out, const blst_p2_affine *in); +bool blst_p2_affine_on_curve(const blst_p2_affine *p); +bool blst_p2_affine_in_g2(const blst_p2_affine *p); +bool blst_p2_affine_is_equal(const blst_p2_affine *a, const blst_p2_affine *b); + +/* + * Hash-to-curve operations. + */ +#ifndef SWIG +void blst_map_to_g1(blst_p1 *out, const blst_fp *u, const blst_fp *v DEFNULL); +void blst_map_to_g2(blst_p2 *out, const blst_fp2 *u, const blst_fp2 *v DEFNULL); +#endif + +void blst_encode_to_g1(blst_p1 *out, + const byte *msg, size_t msg_len, + const byte *DST DEFNULL, size_t DST_len DEFNULL, + const byte *aug DEFNULL, size_t aug_len DEFNULL); +void blst_hash_to_g1(blst_p1 *out, + const byte *msg, size_t msg_len, + const byte *DST DEFNULL, size_t DST_len DEFNULL, + const byte *aug DEFNULL, size_t aug_len DEFNULL); + +void blst_encode_to_g2(blst_p2 *out, + const byte *msg, size_t msg_len, + const byte *DST DEFNULL, size_t DST_len DEFNULL, + const byte *aug DEFNULL, size_t aug_len DEFNULL); +void blst_hash_to_g2(blst_p2 *out, + const byte *msg, size_t msg_len, + const byte *DST DEFNULL, size_t DST_len DEFNULL, + const byte *aug DEFNULL, size_t aug_len DEFNULL); + +/* + * Zcash-compatible serialization/deserialization. 
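+ * (Encodings follow the Zcash format referenced in the README: uncompressed
+ * G1/G2 points are 96/192 bytes, compressed ones 48/96 bytes, and the most
+ * significant bit of the first byte flags a compressed encoding.)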
+ */ +void blst_p1_serialize(byte out[96], const blst_p1 *in); +void blst_p1_compress(byte out[48], const blst_p1 *in); +void blst_p1_affine_serialize(byte out[96], const blst_p1_affine *in); +void blst_p1_affine_compress(byte out[48], const blst_p1_affine *in); +BLST_ERROR blst_p1_uncompress(blst_p1_affine *out, const byte in[48]); +BLST_ERROR blst_p1_deserialize(blst_p1_affine *out, const byte in[96]); + +void blst_p2_serialize(byte out[192], const blst_p2 *in); +void blst_p2_compress(byte out[96], const blst_p2 *in); +void blst_p2_affine_serialize(byte out[192], const blst_p2_affine *in); +void blst_p2_affine_compress(byte out[96], const blst_p2_affine *in); +BLST_ERROR blst_p2_uncompress(blst_p2_affine *out, const byte in[96]); +BLST_ERROR blst_p2_deserialize(blst_p2_affine *out, const byte in[192]); + +/* + * Specification defines two variants, 'minimal-signature-size' and + * 'minimal-pubkey-size'. To unify appearance we choose to distinguish + * them by suffix referring to the public key type, more specifically + * _pk_in_g1 corresponds to 'minimal-pubkey-size' and _pk_in_g2 - to + * 'minimal-signature-size'. It might appear a bit counterintuitive + * in sign call, but no matter how you twist it, something is bound to + * turn a little odd. + */ +/* + * Secret-key operations. + */ +void blst_keygen(blst_scalar *out_SK, const byte *IKM, size_t IKM_len, + const byte *info DEFNULL, size_t info_len DEFNULL); +void blst_sk_to_pk_in_g1(blst_p1 *out_pk, const blst_scalar *SK); +void blst_sign_pk_in_g1(blst_p2 *out_sig, const blst_p2 *hash, + const blst_scalar *SK); +void blst_sk_to_pk_in_g2(blst_p2 *out_pk, const blst_scalar *SK); +void blst_sign_pk_in_g2(blst_p1 *out_sig, const blst_p1 *hash, + const blst_scalar *SK); + +/* + * Pairing interface. + */ +#ifndef SWIG +void blst_miller_loop(blst_fp12 *ret, const blst_p2_affine *Q, + const blst_p1_affine *P); +void blst_final_exp(blst_fp12 *ret, const blst_fp12 *f); +void blst_precompute_lines(blst_fp6 Qlines[68], const blst_p2_affine *Q); +void blst_miller_loop_lines(blst_fp12 *ret, const blst_fp6 Qlines[68], + const blst_p1_affine *P); +#endif + +#ifdef __BLST_CGO__ +typedef limb_t blst_pairing; +#else +typedef struct {} blst_pairing; +#endif + +size_t blst_pairing_sizeof(); +void blst_pairing_init(blst_pairing *new_ctx); +void blst_pairing_commit(blst_pairing *ctx); +BLST_ERROR blst_pairing_aggregate_pk_in_g2(blst_pairing *ctx, + const blst_p2_affine *PK, + const blst_p1_affine *signature, + bool hash_or_encode, + const byte *msg, size_t msg_len, + const byte *DST DEFNULL, + size_t DST_len DEFNULL, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_aggregate_pk_in_g1(blst_pairing *ctx, + const blst_p1_affine *PK, + const blst_p2_affine *signature, + bool hash_or_encode, + const byte *msg, size_t msg_len, + const byte *DST DEFNULL, + size_t DST_len DEFNULL, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_merge(blst_pairing *ctx, const blst_pairing *ctx1); +bool blst_pairing_finalverify(const blst_pairing *ctx, + const blst_fp12 *gtsig DEFNULL); + + +/* + * Customarily applications aggregate signatures separately. + * In which case application would have to pass NULLs for |signature| + * to blst_pairing_aggregate calls and pass aggregated signature + * collected with these calls to blst_pairing_finalverify. Inputs are + * Zcash-compatible "straight-from-wire" byte vectors, compressed or + * not. 
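+ * (blst_aggregate_in_g1/_in_g2 below decode the wire-encoded point |zwire|
+ * and add it to the running aggregate |in|, writing the result to |out|;
+ * blst_aggregated_in_g1/_in_g2 convert a finished aggregate signature into
+ * the blst_fp12 value that blst_pairing_finalverify expects as |gtsig|.)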
+ */ +BLST_ERROR blst_aggregate_in_g1(blst_p1 *out, const blst_p1 *in, + const byte *zwire); +BLST_ERROR blst_aggregate_in_g2(blst_p2 *out, const blst_p2 *in, + const byte *zwire); + +void blst_aggregated_in_g1(blst_fp12 *out, const blst_p1_affine *signature); +void blst_aggregated_in_g2(blst_fp12 *out, const blst_p2_affine *signature); + +/* + * "One-shot" CoreVerify entry points. + */ +BLST_ERROR blst_core_verify_pk_in_g1(const blst_p1_affine *pk, + const blst_p2_affine *signature, + bool hash_or_encode, + const byte *msg, size_t msg_len, + const byte *DST DEFNULL, + size_t DST_len DEFNULL, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_core_verify_pk_in_g2(const blst_p2_affine *pk, + const blst_p1_affine *signature, + bool hash_or_encode, + const byte *msg, size_t msg_len, + const byte *DST DEFNULL, + size_t DST_len DEFNULL, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); + +extern blst_p1_affine BLS12_381_G1; +extern blst_p1_affine BLS12_381_NEG_G1; +extern blst_p2_affine BLS12_381_G2; +extern blst_p2_affine BLS12_381_NEG_G2; + +#include "blst_aux.h" + +#ifdef __cplusplus +} +#endif +#endif diff --git a/bindings/blst.swg b/bindings/blst.swg new file mode 100644 index 00000000..c1de9a78 --- /dev/null +++ b/bindings/blst.swg @@ -0,0 +1,105 @@ +// Copyright Supranational LLC +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +%module blst +%rename("%(strip:[blst_])s") ""; // prefix is redundant in named module + +#if defined(SWIGPYTHON) || defined(SWIGPERL) +# if defined(SWIGPYTHON) +%begin %{ +#define SWIG_PYTHON_STRICT_BYTE_CHAR +%} +# endif + +%include "exception.i" + +// some sorcery to allow assignments as output, e.g. +// P = blst.encode_to_g1(b"foo") + +# if defined(SWIGPYTHON) +%typemap(out) void " $result = NULL; "; +%typemap(ret) void " if ($result == NULL) $result = SWIG_Py_Void(); "; +# endif + +%typemap(in, numinputs=0) OBJECT *OUTPUT($1_basetype temp) { $1 = &temp; }; +%typemap(argout) OBJECT *OUTPUT { +# ifdef SWIGPYTHON + PyObject *obj = SWIG_NewPointerObj(memcpy(malloc(sizeof($1_basetype)), + $1,sizeof($1_basetype)), + $descriptor, SWIG_POINTER_NEW); + $result = ($result==NULL) ? obj + : SWIG_Python_AppendOutput($result, obj); +# else // TODO: figure out more language-specific ways to return multi-values... + if ($result == NULL) + $result = SWIG_NewPointerObj(memcpy(malloc(sizeof($1_basetype)), + $1,sizeof($1_basetype)), + $descriptor, SWIG_POINTER_NEW); +# endif +}; +%apply OBJECT *OUTPUT { blst_p1 *out, blst_p1 *out_pk, blst_p1 *out_sig }; +%apply OBJECT *OUTPUT { blst_p1_affine *out, blst_p1_affine *out_pk, + blst_p1_affine *out_sig }; +%apply OBJECT *OUTPUT { blst_p2 *out, blst_p2 *out_pk, blst_p2 *out_sig }; +%apply OBJECT *OUTPUT { blst_p2_affine *out, blst_p2_affine *out_pk, + blst_p2_affine *out_sig }; +%apply OBJECT *OUTPUT { blst_scalar *out, blst_scalar *out_SK, blst_fp12 *out }; + +%typemap(out) BLST_ERROR { + if ($1 != BLST_SUCCESS) { + SWIG_exception(SWIG_ValueError, BLST_ERROR_str[$1]); + SWIG_fail; + } + resultobj = SWIG_From_int($1); +}; + +%typemap(in, numinputs=0) byte out[ANY](byte temp[$1_dim0]) { + $1 = temp; +}; +%typemap(argout) byte out[ANY] { +# ifdef SWIGPYTHON + PyObject *obj = SWIG_FromCharPtrAndSize((char *)$1, $1_dim0); + $result = ($result==NULL) ? obj + : SWIG_Python_AppendOutput($result, obj); +# else // TODO: figure out more language-specific ways to return multi-values... 
+ if ($result == NULL)
+ $result = SWIG_FromCharPtrAndSize((char *)$1, $1_dim0);
+# endif
+};
+
+%typemap(in, numinputs=0) blst_pairing *new_ctx {
+ $1 = calloc(1, blst_pairing_sizeof());
+};
+%typemap(argout) blst_pairing *new_ctx {
+ $result = SWIG_NewPointerObj($1, $descriptor, SWIG_POINTER_NEW);
+};
+#endif // defined(SWIGPYTHON) || defined(SWIGPERL)
+
+%apply (char *STRING, size_t LENGTH) { (const byte *msg, size_t msg_len) };
+%apply (char *STRING, size_t LENGTH) { (const byte *DST, size_t DST_len) };
+%apply (char *STRING, size_t LENGTH) { (const byte *aug, size_t aug_len) };
+%apply (char *STRING, size_t LENGTH) { (const byte *IKM, size_t IKM_len) };
+%apply (char *STRING, size_t LENGTH) { (const byte *info, size_t info_len) };
+%apply const char * { const byte in[ANY] };
+
+%include "blst.h"
+%include "blst_aux.h"
+%extend blst_pairing {
+ blst_pairing() { return calloc(1, blst_pairing_sizeof()); }
+ ~blst_pairing() { free($self); }
+};
+
+%begin %{
+#include "blst.h"
+
+static const char *const BLST_ERROR_str [] = {
+ "success",
+ "bad point encoding",
+ "point is not on curve",
+ "point is not in group",
+ "context type mismatch",
+ "verify failed",
+};
+%}
+
+%include "cdata.i"
diff --git a/bindings/blst_aux.h b/bindings/blst_aux.h
new file mode 100644
index 00000000..55d97f09
--- /dev/null
+++ b/bindings/blst_aux.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright Supranational LLC
+ * Licensed under the Apache License, Version 2.0, see LICENSE for details.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef __BLST_AUX_H__
+#define __BLST_AUX_H__
+/*
+ * This file lists interfaces that might be promoted to blst.h or removed,
+ * depending on their proven/unproven worthiness.
+ */
+
+bool blst_p1_on_curve(const blst_p1 *p);
+bool blst_p2_on_curve(const blst_p2 *p);
+void blst_p1_from_jacobian(blst_p1 *out, const blst_p1 *in);
+void blst_p2_from_jacobian(blst_p2 *out, const blst_p2 *in);
+bool blst_p1_affine_is_equal(const blst_p1_affine *a, const blst_p1_affine *b);
+bool blst_p2_affine_is_equal(const blst_p2_affine *a, const blst_p2_affine *b);
+
+/*
+ * The functions below produce both point and deserialized outcome of
+ * SkToPk and Sign. However, deserialized outputs are pre-decorated
+ * with sign and infinity bits. This means that you have to bring the
+ * output into compliance prior to returning it to the application. If you
+ * want the compressed point value, then do [the equivalent of]
+ *
+ * byte temp[96];
+ * blst_sk_to_pk2_in_g1(temp, out_pk, SK);
+ * temp[0] |= 0x80;
+ * memcpy(out, temp, 48);
+ *
+ * Otherwise do
+ *
+ * blst_sk_to_pk2_in_g1(out, out_pk, SK);
+ * out[0] &= ~0x20;
+ *
+ * Either |out| or |out_| can be NULL.
+ */
+void blst_sk_to_pk2_in_g1(byte out[96], blst_p1_affine *out_pk,
+ const blst_scalar *SK);
+void blst_sign_pk2_in_g1(byte out[192], blst_p2_affine *out_sig,
+ const blst_p2 *hash, const blst_scalar *SK);
+void blst_sk_to_pk2_in_g2(byte out[192], blst_p2_affine *out_pk,
+ const blst_scalar *SK);
+void blst_sign_pk2_in_g2(byte out[96], blst_p1_affine *out_sig,
+ const blst_p1 *hash, const blst_scalar *SK);
+
+#endif
diff --git a/bindings/go/README.md b/bindings/go/README.md
new file mode 100644
index 00000000..ea0d00af
--- /dev/null
+++ b/bindings/go/README.md
@@ -0,0 +1,55 @@
+# blst
+
+The `blst` package provides a Go interface to the blst BLS12-381 signature library.
+
+## Build
+The build process consists of two steps: code generation followed by compilation.
+
+```
+./generate.py # Optional - only required if making code changes
+go build
+go test
+```
+
+The generate.py script is used to generate both the min-pk and min-sig variants of the binding from a common code base. It consumes the `*.tgo` files along with `blst_minpk_test.go` and produces `blst.go` and `blst_minsig_test.go`. The .tgo files can be treated as if they were .go files, including the use of gofmt and goimports. The generate script will filter out extra imports while processing and automatically run goimports on the final blst.go file.
+
+After running generate.py, `go build` and `go test` can be run as usual. Cgo will compile `server.c`, which includes the required C implementation files, and `assembly.S`, which includes the appropriate pre-generated assembly code for the platform. To compile on Windows one has to have MinGW gcc on the %PATH%.
+
+## Usage
+There are two primary modes of operation that can be chosen based on type definitions in the application.
+
+For minimal-pubkey-size operations:
+```
+type PublicKey = P1Affine
+type Signature = P2Affine
+type AggregateSignature = P2Aggregate
+type AggregatePublicKey = P1Aggregate
+```
+
+For minimal-signature-size operations:
+```
+type PublicKey = P2Affine
+type Signature = P1Affine
+type AggregateSignature = P1Aggregate
+type AggregatePublicKey = P2Aggregate
+```
+
+TODO - structures and possibly methods
+
+A simple example for generating a key, signing a message, and verifying the message:
+```
+var ikm [32]byte
+_, _ = rand.Read(ikm[:])
+sk := KeyGen(ikm[:])
+pk := new(PublicKey).From(sk)
+
+var dst = []byte("BLS_SIG_BLS12381G2_XMD:SHA-256_SSWU_RO_NUL_")
+msg := []byte("hello foo")
+sig := new(Signature).Sign(sk, msg, dst)
+
+if !sig.Verify(pk, msg, dst) {
+ panic("verify sig0")
+}
+```
+
+See the tests for further examples of usage, and the aggregation sketch below.
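+
+A sketch of aggregate signing and verification over multiple messages, using the same min-pk aliases (method names as defined in this commit's blst.go; imports and error handling are elided as in the example above):
+```
+dst := []byte("BLS_SIG_BLS12381G2_XMD:SHA-256_SSWU_RO_NUL_")
+msgs := []Message{[]byte("msg one"), []byte("msg two")}
+
+pks := make([]*PublicKey, len(msgs))
+sigs := make([]*Signature, len(msgs))
+for i := range msgs {
+    // One key pair and one signature per message
+    var ikm [32]byte
+    _, _ = rand.Read(ikm[:])
+    sk := KeyGen(ikm[:])
+    pks[i] = new(PublicKey).From(sk)
+    sigs[i] = new(Signature).Sign(sk, msgs[i], dst)
+}
+
+// Collapse the individual signatures into a single group element...
+aggSig := new(AggregateSignature).Aggregate(sigs).ToAffine()
+
+// ...and verify it against all (public key, message) pairs at once.
+if !aggSig.AggregateVerify(pks, msgs, dst) {
+    panic("aggregate verify failed")
+}
+```
+
+AggregateVerify spreads the pairing computations across GOMAXPROCS goroutines internally, so the caller does not need to add any concurrency of its own.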
diff --git a/bindings/go/assembly.S b/bindings/go/assembly.S new file mode 100644 index 00000000..5993b30c --- /dev/null +++ b/bindings/go/assembly.S @@ -0,0 +1,53 @@ +#if defined(__x86_64) || defined(__x86_64__) +# if defined(__ELF__) +# include "elf/sha256-x86_64.s" +# include "elf/inverse_mod_384-x86_64.s" +# include "elf/add_mod_384-x86_64.s" +# include "elf/add_mod_384x384-x86_64.s" +# define __add_mod_384 __add_mont_384 +# define __sub_mod_384 __sub_mont_384 +# define __sub_mod_384x384 __sub_mont_384x384 +# ifdef __ADX__ +# include "elf/mulx_mont_384-x86_64.s" +# include "elf/mulx_mont_256-x86_64.s" +# else +# include "elf/mulq_mont_384-x86_64.s" +# include "elf/mulq_mont_256-x86_64.s" +# endif +# include "elf/add_mod_256-x86_64.s" +# elif defined(_WIN64) || defined(__CYGWIN__) +# include "coff/sha256-x86_64.s" +# include "coff/inverse_mod_384-x86_64.s" +# include "coff/add_mod_384-x86_64.s" +# include "coff/add_mod_384x384-x86_64.s" +# define __add_mod_384 __add_mont_384 +# define __sub_mod_384 __sub_mont_384 +# define __sub_mod_384x384 __sub_mont_384x384 +# ifdef __ADX__ +# include "coff/mulx_mont_384-x86_64.s" +# include "coff/mulx_mont_256-x86_64.s" +# else +# include "coff/mulq_mont_384-x86_64.s" +# include "coff/mulq_mont_256-x86_64.s" +# endif +# include "coff/add_mod_256-x86_64.s" +# elif defined(__APPLE__) +# include "mach-o/sha256-x86_64.s" +# include "mach-o/inverse_mod_384-x86_64.s" +# include "mach-o/add_mod_384-x86_64.s" +# include "mach-o/add_mod_384x384-x86_64.s" +# define __add_mod_384 __add_mont_384 +# define __sub_mod_384 __sub_mont_384 +# define __sub_mod_384x384 __sub_mont_384x384 +# ifdef __ADX__ +# include "mach-o/mulx_mont_384-x86_64.s" +# include "mach-o/mulx_mont_256-x86_64.s" +# else +# include "mach-o/mulq_mont_384-x86_64.s" +# include "mach-o/mulq_mont_256-x86_64.s" +# endif +# include "mach-o/add_mod_256-x86_64.s" +# endif +#else +# error "unsupported platform" +#endif diff --git a/bindings/go/blst.go b/bindings/go/blst.go new file mode 100644 index 00000000..daf450d2 --- /dev/null +++ b/bindings/go/blst.go @@ -0,0 +1,1387 @@ +//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +// DO NOT EDIT THIS FILE!! +// The file is generated from *.tgo by generate.py +//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +package blst + +// #cgo CFLAGS: -I${SRCDIR}/.. 
-I${SRCDIR}/../../build -I${SRCDIR}/../../src -D__BLST_CGO__ -march=native -mno-avx +// #include "blst.h" +import "C" +import ( + "fmt" + "runtime" + "sync/atomic" +) + +const BLST_SCALAR_BYTES = 256 / 8 +const BLST_SCALAR_LIMBS = 256 / 64 +const BLST_FP_BYTES = 384 / 8 +const BLST_FP_LIMBS = 384 / 64 +const BLST_P1_COMPRESS_BYTES = BLST_FP_BYTES +const BLST_P1_SERIALIZE_BYTES = BLST_FP_BYTES * 2 +const BLST_P2_COMPRESS_BYTES = BLST_FP_BYTES * 2 +const BLST_P2_SERIALIZE_BYTES = BLST_FP_BYTES * 4 + +type Scalar = C.blst_scalar +type Fp = C.blst_fp +type Fp2 = C.blst_fp2 +type Fp6 = C.blst_fp6 +type Fp12 = C.blst_fp12 +type P1 = C.blst_p1 +type P2 = C.blst_p2 +type P1Affine = C.blst_p1_affine +type P2Affine = C.blst_p2_affine +type Message = []byte +type Pairing = []uint64 +type SecretKey = Scalar + +// +// Secret key +// +func KeyGen(ikm []byte, optional ...[]byte) *SecretKey { + var sk SecretKey + var info []byte + var infoP *C.byte + if len(optional) > 0 { + info = optional[0] + infoP = (*C.byte)(&info[0]) + } + if len(ikm) < 32 { + return nil + } + C.blst_keygen(&sk, (*C.byte)(&ikm[0]), C.size_t(len(ikm)), + infoP, C.size_t(len(info))) + return &sk +} + +// +// Pairing +// +func PairingCtx() Pairing { + ctx := make([]uint64, C.blst_pairing_sizeof()/8) + C.blst_pairing_init((*C.blst_pairing)(&ctx[0])) + return ctx +} + +func PairingAggregatePkInG1(ctx Pairing, PK *P1Affine, sig *P2Affine, + hash_or_encode bool, msg []byte, optional ...[]byte) int { + var DST []byte + var uDST *C.byte + if len(optional) > 0 { + DST = optional[0] + uDST = (*C.byte)(&DST[0]) + } + var aug []byte + var uaug *C.byte + if len(optional) > 1 { + aug = optional[1] + if aug != nil { + uaug = (*C.byte)(&aug[0]) + } + } + var umsg *C.byte + if msg != nil { + umsg = (*C.byte)(&msg[0]) + } + + r := C.blst_pairing_aggregate_pk_in_g1((*C.blst_pairing)(&ctx[0]), + PK, sig, C.bool(hash_or_encode), + umsg, C.size_t(len(msg)), + uDST, C.size_t(len(DST)), + uaug, C.size_t(len(aug))) + + return int(r) +} + +func PairingAggregatePkInG2(ctx Pairing, PK *P2Affine, sig *P1Affine, + hash_or_encode bool, msg []byte, optional ...[]byte) int { + var DST []byte + var uDST *C.byte + if len(optional) > 0 { + DST = optional[0] + uDST = (*C.byte)(&DST[0]) + } + var aug []byte + var uaug *C.byte + if len(optional) > 1 { + aug = optional[1] + if aug != nil { + uaug = (*C.byte)(&aug[0]) + } + } + + r := C.blst_pairing_aggregate_pk_in_g2((*C.blst_pairing)(&ctx[0]), + PK, sig, C.bool(hash_or_encode), + (*C.byte)(&msg[0]), C.size_t(len(msg)), + uDST, C.size_t(len(DST)), + uaug, C.size_t(len(aug))) + + return int(r) +} + +func PairingCommit(ctx Pairing) { + C.blst_pairing_commit((*C.blst_pairing)(&ctx[0])) +} + +func PairingMerge(ctx Pairing, ctx1 Pairing) int { + r := C.blst_pairing_merge((*C.blst_pairing)(&ctx[0]), + (*C.blst_pairing)(&ctx1[0])) + return int(r) +} + +func PairingFinalVerify(ctx Pairing, optional ...*Fp12) bool { + var gtsig *Fp12 = nil + if len(optional) > 0 { + gtsig = optional[0] + } + return bool(C.blst_pairing_finalverify((*C.blst_pairing)(&ctx[0]), gtsig)) +} + +// +// MIN-PK +// + +// +// PublicKey +// + +func (pk *P1Affine) From(s *Scalar) *P1Affine { + C.blst_sk_to_pk2_in_g1(nil, pk, s) + return pk +} + +// +// Sign +// + +func (sig *P2Affine) Sign(sk *SecretKey, msg []byte, dst []byte, + optional ...interface{}) *P2Affine { + augSingle, aug, useHash, ok := parseOpts(optional...) 
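+ // Only the useHash flag and a single []byte augmentation are meaningful for Sign;
+ // a [][]byte augmentation list (as used by AggregateVerify) is rejected below.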
+ if !ok || len(aug) != 0 { + return nil + } + + var q *P2 + if useHash { + q = HashToG2(msg, dst, augSingle) + } else { + q = EncodeToG2(msg, dst, augSingle) + } + C.blst_sign_pk2_in_g1(nil, sig, q, sk) + return sig +} + +// +// Signature +// + +// Functions to return a signature and public key+augmentation tuple. +// This enables point decompression (if needed) to happen in parallel. +type sigGetterP2 func() *P2Affine +type pkGetterP1 func(i uint32, temp *P1Affine) (*P1Affine, []byte) + +// Single verify with decompressed pk +func (sig *P2Affine) Verify(pk *P1Affine, msg Message, dst []byte, + optional ...interface{}) bool { // useHash bool, aug []byte + + // CLEANUP!! + // Check for infinities (eth spec) + var zeroSig P2Affine + var zeroPk P1Affine + if pk.Equals(&zeroPk) && sig.Equals(&zeroSig) { + return true + } + // CLEANUP!! + + aug, _, useHash, ok := parseOpts(optional...) + if !ok { + return false + } + return sig.AggregateVerify([]*P1Affine{pk}, []Message{msg}, dst, + useHash, [][]byte{aug}) +} + +// Single verify with compressed pk +// Uses a dummy signature to get the correct type +func (dummy *P2Affine) VerifyCompressed(sig []byte, pk []byte, + msg Message, dst []byte, + optional ...bool) bool { // useHash bool, usePksAsAugs bool + + return dummy.AggregateVerifyCompressed(sig, [][]byte{pk}, + []Message{msg}, dst, optional...) +} + +// Aggregate verify with uncompressed signature and public keys +func (sig *P2Affine) AggregateVerify(pks []*P1Affine, msgs []Message, + dst []byte, + optional ...interface{}) bool { // useHash bool, augs [][]byte + + // sanity checks and argument parsing + if len(pks) != len(msgs) { + return false + } + _, augs, useHash, ok := parseOpts(optional...) + useAugs := len(augs) != 0 + if !ok || (useAugs && len(augs) != len(msgs)) { + return false + } + + sigFn := func() *P2Affine { + return sig + } + + pkFn := func(i uint32, _ *P1Affine) (*P1Affine, []byte) { + if useAugs { + return pks[i], augs[i] + } else { + return pks[i], nil + } + } + + return coreAggregateVerifyPkInG1(sigFn, pkFn, msgs, dst, useHash) +} + +// Aggregate verify with compressed signature and public keys +// Uses a dummy signature to get the correct type +func (dummy *P2Affine) AggregateVerifyCompressed(sig []byte, pks [][]byte, + msgs []Message, dst []byte, + optional ...bool) bool { // useHash bool, usePksAsAugs bool + + // sanity checks and argument parsing + if len(pks) != len(msgs) { + return false + } + useHash := true + if len(optional) > 0 { + useHash = optional[0] + } + usePksAsAugs := false + if len(optional) > 1 { + usePksAsAugs = optional[1] + } + + sigFn := func() *P2Affine { + sigP := new(P2Affine) + if sig[0]&0x80 == 0 { + // Not compressed + if sigP.Deserialize(sig) == nil { + return nil + } + } else { + if sigP.Uncompress(sig) == nil { + return nil + } + } + return sigP + } + pkFn := func(i uint32, pk *P1Affine) (*P1Affine, []byte) { + bytes := pks[i] + if len(bytes) == 0 { + return nil, nil + } + if bytes[0]&0x80 == 0 { + // Not compressed + if pk.Deserialize(bytes) == nil { + return nil, nil + } + } else { + if pk.Uncompress(bytes) == nil { + return nil, nil + } + } + if usePksAsAugs { + return pk, bytes + } + return pk, nil + } + return coreAggregateVerifyPkInG1(sigFn, pkFn, msgs, dst, useHash) +} + +// TODO: check message uniqueness +func coreAggregateVerifyPkInG1(sigFn sigGetterP2, pkFn pkGetterP1, + msgs []Message, dst []byte, + optional ...bool) bool { // useHash + + n := len(msgs) + if n == 0 { + return true + } + + useHash := true + if len(optional) > 0 { + 
useHash = optional[0] + } + + numThreads := runtime.GOMAXPROCS(0) + if numThreads > n { + numThreads = n + } + // Each thread will determine next message to process by atomically + // incrementing curItem, process corresponding pk,msg[,aug] tuple and + // repeat until n is exceeded. The resulting accumulations will be + // fed into the msgsCh channel. + msgsCh := make(chan Pairing, numThreads) + valid := int32(1) + curItem := uint32(0) + for tid := 0; tid < numThreads; tid++ { + go func() { + pairing := PairingCtx() + var temp P1Affine + for atomic.LoadInt32(&valid) > 0 { + // Get a work item + work := atomic.AddUint32(&curItem, 1) - 1 + if work >= uint32(n) { + break + } + + // pull Public Key and augmentation blob + curPk, aug := pkFn(work, &temp) + if curPk == nil { + atomic.StoreInt32(&valid, 0) + break + } + + // Pairing and accumulate + // TODO: delay subgroup check until miller loop by default + PairingAggregatePkInG1(pairing, curPk, nil, + useHash, msgs[work], dst, aug) + + // application might have some async work to do + runtime.Gosched() + } + if atomic.LoadInt32(&valid) > 0 { + PairingCommit(pairing) + msgsCh <- pairing + } else { + msgsCh <- nil + } + }() + } + + // Uncompress and check signature + var gtsig Fp12 + sig := sigFn() + if sig == nil { + atomic.StoreInt32(&valid, 0) + } else { + C.blst_aggregated_in_g2(>sig, sig) + } + + // Accumulate the thread results + var pairings Pairing + for i := 0; i < numThreads; i++ { + msg := <-msgsCh + if msg != nil { + if pairings == nil { + pairings = msg + } else { + PairingMerge(pairings, msg) + } + } + } + if atomic.LoadInt32(&valid) == 0 || pairings == nil { + return false + } + + return PairingFinalVerify(pairings, >sig) +} + +func (sig *P2Affine) FastAggregateVerify(pks []*P1Affine, msg Message, + dst []byte, optional ...interface{}) bool { + n := len(pks) + + // TODO: return value for length zero? + if n == 0 { + return false + } + + aggregator := new(P1Aggregate).Aggregate(pks) + if aggregator == nil { + return false + } + pkAff := aggregator.ToAffine() + + // Verify + return sig.Verify(pkAff, msg, dst, optional...) 
+} + +// +// Aggregate P2 +// + +type aggGetterP2 func(i uint32, temp *P2Affine) *P2Affine +type P2Aggregate struct { + v *P2 +} + +// Aggregate uncompressed elements +func (agg *P2Aggregate) Aggregate(elmts []*P2Affine) *P2Aggregate { + if len(elmts) == 0 { + return agg + } + getter := func(i uint32, _ *P2Affine) *P2Affine { return elmts[i] } + if !agg.aggregate(getter, len(elmts)) { + return nil + } + return agg +} + +// Aggregate compressed elements +func (agg *P2Aggregate) AggregateCompressed(elmts [][]byte) *P2Aggregate { + if len(elmts) == 0 { + return agg + } + getter := func(i uint32, p *P2Affine) *P2Affine { + bytes := elmts[i] + if len(bytes) == 0 { + return nil + } + if bytes[0]&0x80 == 0 { + // Not compressed + if p.Deserialize(bytes) == nil { + return nil + } + } else { + if p.Uncompress(bytes) == nil { + return nil + } + } + return p + } + if !agg.aggregate(getter, len(elmts)) { + return nil + } + return agg +} + +func (agg *P2Aggregate) AddAggregate(other *P2Aggregate) *P2Aggregate { + if other.v == nil { + // do nothing + } else if agg.v == nil { + agg.v = other.v + } else { + C.blst_p2_add(agg.v, agg.v, other.v) + } + return agg +} + +func (agg *P2Aggregate) Add(elmt *P2Affine) *P2Aggregate { + if agg.v == nil { + agg.v = new(P2) + C.blst_p2_from_affine(agg.v, elmt) + } else { + C.blst_p2_add_or_double_affine(agg.v, agg.v, elmt) + } + return agg +} + +func (agg *P2Aggregate) ToAffine() *P2Affine { + if agg.v == nil { + return nil + } + return agg.v.ToAffine() +} + +func (agg *P2Aggregate) aggregate(getter aggGetterP2, n int) bool { + if n == 0 { + return true + } + numThreads := runtime.GOMAXPROCS(0) + if numThreads > n { + numThreads = n + } + + valid := int32(1) + type result struct { + agg *P2 + empty bool + } + msgs := make(chan result, numThreads) + curItem := uint32(0) + for tid := 0; tid < numThreads; tid++ { + go func() { + first := true + var agg P2 + var temp P2Affine + for atomic.LoadInt32(&valid) > 0 { + // Get a work item + work := atomic.AddUint32(&curItem, 1) - 1 + if work >= uint32(n) { + break + } + + // Signature validate + curElmt := getter(work, &temp) + if curElmt == nil { + atomic.StoreInt32(&valid, 0) + break + } + if first { + C.blst_p2_from_affine(&agg, curElmt) + first = false + } else { + C.blst_p2_add_or_double_affine(&agg, &agg, curElmt) + } + } + if first { + msgs <- result{nil, true} + } else if atomic.LoadInt32(&valid) > 0 { + msgs <- result{&agg, false} + } else { + msgs <- result{nil, false} + } + }() + } + + // Accumulate the thread results + first := agg.v == nil + validLocal := true + for i := 0; i < numThreads; i++ { + msg := <-msgs + if !validLocal || msg.empty { + // do nothing + } else if msg.agg == nil { + validLocal = false + // This should be unnecessary but seems safer + atomic.StoreInt32(&valid, 0) + } else { + if first { + agg.v = msg.agg + first = false + } else { + C.blst_p2_add(agg.v, agg.v, msg.agg) + } + } + } + if atomic.LoadInt32(&valid) == 0 { + agg.v = nil + return false + } + return true +} + +// +// MIN-SIG +// + +// +// PublicKey +// + +func (pk *P2Affine) From(s *Scalar) *P2Affine { + C.blst_sk_to_pk2_in_g2(nil, pk, s) + return pk +} + +// +// Sign +// + +func (sig *P1Affine) Sign(sk *SecretKey, msg []byte, dst []byte, + optional ...interface{}) *P1Affine { + augSingle, aug, useHash, ok := parseOpts(optional...) 
+ if !ok || len(aug) != 0 { + return nil + } + + var q *P1 + if useHash { + q = HashToG1(msg, dst, augSingle) + } else { + q = EncodeToG1(msg, dst, augSingle) + } + C.blst_sign_pk2_in_g2(nil, sig, q, sk) + return sig +} + +// +// Signature +// + +// Functions to return a signature and public key+augmentation tuple. +// This enables point decompression (if needed) to happen in parallel. +type sigGetterP1 func() *P1Affine +type pkGetterP2 func(i uint32, temp *P2Affine) (*P2Affine, []byte) + +// Single verify with decompressed pk +func (sig *P1Affine) Verify(pk *P2Affine, msg Message, dst []byte, + optional ...interface{}) bool { // useHash bool, aug []byte + + // CLEANUP!! + // Check for infinities (eth spec) + var zeroSig P1Affine + var zeroPk P2Affine + if pk.Equals(&zeroPk) && sig.Equals(&zeroSig) { + return true + } + // CLEANUP!! + + aug, _, useHash, ok := parseOpts(optional...) + if !ok { + return false + } + return sig.AggregateVerify([]*P2Affine{pk}, []Message{msg}, dst, + useHash, [][]byte{aug}) +} + +// Single verify with compressed pk +// Uses a dummy signature to get the correct type +func (dummy *P1Affine) VerifyCompressed(sig []byte, pk []byte, + msg Message, dst []byte, + optional ...bool) bool { // useHash bool, usePksAsAugs bool + + return dummy.AggregateVerifyCompressed(sig, [][]byte{pk}, + []Message{msg}, dst, optional...) +} + +// Aggregate verify with uncompressed signature and public keys +func (sig *P1Affine) AggregateVerify(pks []*P2Affine, msgs []Message, + dst []byte, + optional ...interface{}) bool { // useHash bool, augs [][]byte + + // sanity checks and argument parsing + if len(pks) != len(msgs) { + return false + } + _, augs, useHash, ok := parseOpts(optional...) + useAugs := len(augs) != 0 + if !ok || (useAugs && len(augs) != len(msgs)) { + return false + } + + sigFn := func() *P1Affine { + return sig + } + + pkFn := func(i uint32, _ *P2Affine) (*P2Affine, []byte) { + if useAugs { + return pks[i], augs[i] + } else { + return pks[i], nil + } + } + + return coreAggregateVerifyPkInG2(sigFn, pkFn, msgs, dst, useHash) +} + +// Aggregate verify with compressed signature and public keys +// Uses a dummy signature to get the correct type +func (dummy *P1Affine) AggregateVerifyCompressed(sig []byte, pks [][]byte, + msgs []Message, dst []byte, + optional ...bool) bool { // useHash bool, usePksAsAugs bool + + // sanity checks and argument parsing + if len(pks) != len(msgs) { + return false + } + useHash := true + if len(optional) > 0 { + useHash = optional[0] + } + usePksAsAugs := false + if len(optional) > 1 { + usePksAsAugs = optional[1] + } + + sigFn := func() *P1Affine { + sigP := new(P1Affine) + if sig[0]&0x80 == 0 { + // Not compressed + if sigP.Deserialize(sig) == nil { + return nil + } + } else { + if sigP.Uncompress(sig) == nil { + return nil + } + } + return sigP + } + pkFn := func(i uint32, pk *P2Affine) (*P2Affine, []byte) { + bytes := pks[i] + if len(bytes) == 0 { + return nil, nil + } + if bytes[0]&0x80 == 0 { + // Not compressed + if pk.Deserialize(bytes) == nil { + return nil, nil + } + } else { + if pk.Uncompress(bytes) == nil { + return nil, nil + } + } + if usePksAsAugs { + return pk, bytes + } + return pk, nil + } + return coreAggregateVerifyPkInG2(sigFn, pkFn, msgs, dst, useHash) +} + +// TODO: check message uniqueness +func coreAggregateVerifyPkInG2(sigFn sigGetterP1, pkFn pkGetterP2, + msgs []Message, dst []byte, + optional ...bool) bool { // useHash + + n := len(msgs) + if n == 0 { + return true + } + + useHash := true + if len(optional) > 0 { + 
useHash = optional[0] + } + + numThreads := runtime.GOMAXPROCS(0) + if numThreads > n { + numThreads = n + } + // Each thread will determine next message to process by atomically + // incrementing curItem, process corresponding pk,msg[,aug] tuple and + // repeat until n is exceeded. The resulting accumulations will be + // fed into the msgsCh channel. + msgsCh := make(chan Pairing, numThreads) + valid := int32(1) + curItem := uint32(0) + for tid := 0; tid < numThreads; tid++ { + go func() { + pairing := PairingCtx() + var temp P2Affine + for atomic.LoadInt32(&valid) > 0 { + // Get a work item + work := atomic.AddUint32(&curItem, 1) - 1 + if work >= uint32(n) { + break + } + + // pull Public Key and augmentation blob + curPk, aug := pkFn(work, &temp) + if curPk == nil { + atomic.StoreInt32(&valid, 0) + break + } + + // Pairing and accumulate + // TODO: delay subgroup check until miller loop by default + PairingAggregatePkInG2(pairing, curPk, nil, + useHash, msgs[work], dst, aug) + + // application might have some async work to do + runtime.Gosched() + } + if atomic.LoadInt32(&valid) > 0 { + PairingCommit(pairing) + msgsCh <- pairing + } else { + msgsCh <- nil + } + }() + } + + // Uncompress and check signature + var gtsig Fp12 + sig := sigFn() + if sig == nil { + atomic.StoreInt32(&valid, 0) + } else { + C.blst_aggregated_in_g1(&gtsig, sig) + } + + // Accumulate the thread results + var pairings Pairing + for i := 0; i < numThreads; i++ { + msg := <-msgsCh + if msg != nil { + if pairings == nil { + pairings = msg + } else { + PairingMerge(pairings, msg) + } + } + } + if atomic.LoadInt32(&valid) == 0 || pairings == nil { + return false + } + + return PairingFinalVerify(pairings, &gtsig) +} + +func (sig *P1Affine) FastAggregateVerify(pks []*P2Affine, msg Message, + dst []byte, optional ...interface{}) bool { + n := len(pks) + + // TODO: return value for length zero? + if n == 0 { + return false + } + + aggregator := new(P2Aggregate).Aggregate(pks) + if aggregator == nil { + return false + } + pkAff := aggregator.ToAffine() + + // Verify + return sig.Verify(pkAff, msg, dst, optional...)
+} + +// +// Aggregate P1 +// + +type aggGetterP1 func(i uint32, temp *P1Affine) *P1Affine +type P1Aggregate struct { + v *P1 +} + +// Aggregate uncompressed elements +func (agg *P1Aggregate) Aggregate(elmts []*P1Affine) *P1Aggregate { + if len(elmts) == 0 { + return agg + } + getter := func(i uint32, _ *P1Affine) *P1Affine { return elmts[i] } + if !agg.aggregate(getter, len(elmts)) { + return nil + } + return agg +} + +// Aggregate compressed elements +func (agg *P1Aggregate) AggregateCompressed(elmts [][]byte) *P1Aggregate { + if len(elmts) == 0 { + return agg + } + getter := func(i uint32, p *P1Affine) *P1Affine { + bytes := elmts[i] + if len(bytes) == 0 { + return nil + } + if bytes[0]&0x80 == 0 { + // Not compressed + if p.Deserialize(bytes) == nil { + return nil + } + } else { + if p.Uncompress(bytes) == nil { + return nil + } + } + return p + } + if !agg.aggregate(getter, len(elmts)) { + return nil + } + return agg +} + +func (agg *P1Aggregate) AddAggregate(other *P1Aggregate) *P1Aggregate { + if other.v == nil { + // do nothing + } else if agg.v == nil { + agg.v = other.v + } else { + C.blst_p1_add(agg.v, agg.v, other.v) + } + return agg +} + +func (agg *P1Aggregate) Add(elmt *P1Affine) *P1Aggregate { + if agg.v == nil { + agg.v = new(P1) + C.blst_p1_from_affine(agg.v, elmt) + } else { + C.blst_p1_add_or_double_affine(agg.v, agg.v, elmt) + } + return agg +} + +func (agg *P1Aggregate) ToAffine() *P1Affine { + if agg.v == nil { + return nil + } + return agg.v.ToAffine() +} + +func (agg *P1Aggregate) aggregate(getter aggGetterP1, n int) bool { + if n == 0 { + return true + } + numThreads := runtime.GOMAXPROCS(0) + if numThreads > n { + numThreads = n + } + + valid := int32(1) + type result struct { + agg *P1 + empty bool + } + msgs := make(chan result, numThreads) + curItem := uint32(0) + for tid := 0; tid < numThreads; tid++ { + go func() { + first := true + var agg P1 + var temp P1Affine + for atomic.LoadInt32(&valid) > 0 { + // Get a work item + work := atomic.AddUint32(&curItem, 1) - 1 + if work >= uint32(n) { + break + } + + // Signature validate + curElmt := getter(work, &temp) + if curElmt == nil { + atomic.StoreInt32(&valid, 0) + break + } + if first { + C.blst_p1_from_affine(&agg, curElmt) + first = false + } else { + C.blst_p1_add_or_double_affine(&agg, &agg, curElmt) + } + } + if first { + msgs <- result{nil, true} + } else if atomic.LoadInt32(&valid) > 0 { + msgs <- result{&agg, false} + } else { + msgs <- result{nil, false} + } + }() + } + + // Accumulate the thread results + first := agg.v == nil + validLocal := true + for i := 0; i < numThreads; i++ { + msg := <-msgs + if !validLocal || msg.empty { + // do nothing + } else if msg.agg == nil { + validLocal = false + // This should be unnecessary but seems safer + atomic.StoreInt32(&valid, 0) + } else { + if first { + agg.v = msg.agg + first = false + } else { + C.blst_p1_add(agg.v, agg.v, msg.agg) + } + } + } + if atomic.LoadInt32(&valid) == 0 { + agg.v = nil + return false + } + return true +} + +// +// Serialization/Deserialization. +// + +// P1 Serdes +func (p1 *P1Affine) Serialize() []byte { + var out [BLST_P1_SERIALIZE_BYTES]byte + C.blst_p1_affine_serialize((*C.byte)(&out[0]), p1) + return out[:] +} + +func (p1 *P1Affine) Deserialize(in []byte) *P1Affine { + if len(in) != BLST_P1_SERIALIZE_BYTES { + return nil + } + if C.blst_p1_deserialize(p1, + (*C.byte)(&in[0])) != C.BLST_SUCCESS { + return nil + } + // CLEANUP!! 
+ // Check for infinities (eth spec) + var zero P1Affine + if p1.Equals(&zero) { + return p1 + } + // CLEANUP!! + + if !bool(C.blst_p1_affine_in_g1(p1)) { + return nil + } + return p1 +} +func (p1 *P1Affine) Compress() []byte { + var out [BLST_P1_COMPRESS_BYTES]byte + C.blst_p1_affine_compress((*C.byte)(&out[0]), p1) + return out[:] +} + +func (p1 *P1Affine) Uncompress(in []byte) *P1Affine { + if len(in) != BLST_P1_COMPRESS_BYTES { + return nil + } + if C.blst_p1_uncompress(p1, + (*C.byte)(&in[0])) != C.BLST_SUCCESS { + return nil + } + // CLEANUP!! + // Check for infinities (eth spec) + var zero P1Affine + if p1.Equals(&zero) { + return p1 + } + // CLEANUP!! + + if !bool(C.blst_p1_affine_in_g1(p1)) { + return nil + } + return p1 +} +func (p1 *P1) Serialize() []byte { + var out [BLST_P1_SERIALIZE_BYTES]byte + C.blst_p1_serialize((*C.byte)(&out[0]), p1) + return out[:] +} +func (p1 *P1) Compress() []byte { + var out [BLST_P1_COMPRESS_BYTES]byte + C.blst_p1_compress((*C.byte)(&out[0]), p1) + return out[:] +} + +// +// Affine +// + +func (p *P1) ToAffine() *P1Affine { + var pa P1Affine + C.blst_p1_to_affine(&pa, p) + return &pa +} + +// +// Hash +// +func HashToG1(msg []byte, dst []byte, optional ...[]byte) *P1 { + var q P1 + + var aug []byte + var uaug *C.byte + if len(optional) > 0 { + aug = optional[0] + if len(aug) > 0 { + uaug = (*C.byte)(&aug[0]) + } + } + + C.blst_hash_to_g1(&q, + (*C.byte)(&msg[0]), C.size_t(len(msg)), + (*C.byte)(&dst[0]), C.size_t(len(dst)), + uaug, C.size_t(len(aug))) + return &q +} + +func EncodeToG1(msg []byte, dst []byte, optional ...[]byte) *P1 { + var q P1 + + var aug []byte + var uaug *C.byte + if len(optional) > 0 { + aug = optional[0] + if len(aug) > 0 { + uaug = (*C.byte)(&aug[0]) + } + } + + C.blst_encode_to_g1(&q, + (*C.byte)(&msg[0]), C.size_t(len(msg)), + (*C.byte)(&dst[0]), C.size_t(len(dst)), + uaug, C.size_t(len(aug))) + return &q +} + +// +// Serialization/Deserialization. +// + +// P2 Serdes +func (p2 *P2Affine) Serialize() []byte { + var out [BLST_P2_SERIALIZE_BYTES]byte + C.blst_p2_affine_serialize((*C.byte)(&out[0]), p2) + return out[:] +} + +func (p2 *P2Affine) Deserialize(in []byte) *P2Affine { + if len(in) != BLST_P2_SERIALIZE_BYTES { + return nil + } + if C.blst_p2_deserialize(p2, + (*C.byte)(&in[0])) != C.BLST_SUCCESS { + return nil + } + // CLEANUP!! + // Check for infinities (eth spec) + var zero P2Affine + if p2.Equals(&zero) { + return p2 + } + // CLEANUP!! + + if !bool(C.blst_p2_affine_in_g2(p2)) { + return nil + } + return p2 +} +func (p2 *P2Affine) Compress() []byte { + var out [BLST_P2_COMPRESS_BYTES]byte + C.blst_p2_affine_compress((*C.byte)(&out[0]), p2) + return out[:] +} + +func (p2 *P2Affine) Uncompress(in []byte) *P2Affine { + if len(in) != BLST_P2_COMPRESS_BYTES { + return nil + } + if C.blst_p2_uncompress(p2, + (*C.byte)(&in[0])) != C.BLST_SUCCESS { + return nil + } + // CLEANUP!! + // Check for infinities (eth spec) + var zero P2Affine + if p2.Equals(&zero) { + return p2 + } + // CLEANUP!! 
+ + if !bool(C.blst_p2_affine_in_g2(p2)) { + return nil + } + return p2 +} +func (p2 *P2) Serialize() []byte { + var out [BLST_P2_SERIALIZE_BYTES]byte + C.blst_p2_serialize((*C.byte)(&out[0]), p2) + return out[:] +} +func (p2 *P2) Compress() []byte { + var out [BLST_P2_COMPRESS_BYTES]byte + C.blst_p2_compress((*C.byte)(&out[0]), p2) + return out[:] +} + +// +// Affine +// + +func (p *P2) ToAffine() *P2Affine { + var pa P2Affine + C.blst_p2_to_affine(&pa, p) + return &pa +} + +// +// Hash +// +func HashToG2(msg []byte, dst []byte, optional ...[]byte) *P2 { + var q P2 + + var aug []byte + var uaug *C.byte + if len(optional) > 0 { + aug = optional[0] + if len(aug) > 0 { + uaug = (*C.byte)(&aug[0]) + } + } + + C.blst_hash_to_g2(&q, + (*C.byte)(&msg[0]), C.size_t(len(msg)), + (*C.byte)(&dst[0]), C.size_t(len(dst)), + uaug, C.size_t(len(aug))) + return &q +} + +func EncodeToG2(msg []byte, dst []byte, optional ...[]byte) *P2 { + var q P2 + + var aug []byte + var uaug *C.byte + if len(optional) > 0 { + aug = optional[0] + if len(aug) > 0 { + uaug = (*C.byte)(&aug[0]) + } + } + + C.blst_encode_to_g2(&q, + (*C.byte)(&msg[0]), C.size_t(len(msg)), + (*C.byte)(&dst[0]), C.size_t(len(dst)), + uaug, C.size_t(len(aug))) + return &q +} + +func parseOpts(optional ...interface{}) ([]byte, [][]byte, bool, bool) { + var aug [][]byte // For aggregate verify + var augSingle []byte // For signing + useHash := true // hash (true), encode (false) + + for _, arg := range optional { + switch v := arg.(type) { + case []byte: + augSingle = v + case [][]byte: + aug = v + case bool: + useHash = v + default: + return nil, nil, useHash, false + } + } + return augSingle, aug, useHash, true +} + +// +// Serialization/Deserialization. +// + +// Scalar serdes +func (s *Scalar) Serialize() []byte { + var out [BLST_SCALAR_BYTES]byte + C.blst_bendian_from_scalar((*C.byte)(&out[0]), s) + return out[:] +} + +func (s *Scalar) Deserialize(in []byte) *Scalar { + if len(in) != BLST_SCALAR_BYTES { + return nil + } + C.blst_scalar_from_bendian(s, (*C.byte)(&in[0])) + if !C.blst_scalar_fr_check(s) { + return nil + } + return s +} + +// +// LEndian +// + +func (fr *Scalar) ToLEndian() []byte { + var arr [BLST_SCALAR_BYTES]byte + C.blst_lendian_from_scalar((*C.byte)(&arr[0]), fr) + return arr[:] +} + +func (fp *Fp) ToLEndian() []byte { + var arr [BLST_FP_BYTES]byte + C.blst_lendian_from_fp((*C.byte)(&arr[0]), fp) + return arr[:] +} + +// +// BEndian +// + +func (fr *Scalar) ToBEndian() []byte { + var arr [BLST_SCALAR_BYTES]byte + C.blst_bendian_from_scalar((*C.byte)(&arr[0]), fr) + return arr[:] +} + +func (fp *Fp) ToBEndian() []byte { + var arr [BLST_FP_BYTES]byte + C.blst_bendian_from_fp((*C.byte)(&arr[0]), fp) + return arr[:] +} + +// +// Printing +// + +func PrintBytes(val []byte, name string) { + fmt.Printf("%s = %02x\n", name, val) +} + +func (s *Scalar) Print(name string) { + arr := s.ToBEndian() + PrintBytes(arr[:], name) +} + +func (p *P1) Print(name string) { + fmt.Printf("%s:\n", name) + aff := p.ToAffine() + arr := aff.x.ToBEndian() + PrintBytes(arr, " x") + arr = aff.y.ToBEndian() + PrintBytes(arr, " y") +} + +func (f *Fp2) Print(name string) { + fmt.Printf("%s:\n", name) + arr := f.fp[0].ToBEndian() + PrintBytes(arr, " 0") + arr = f.fp[1].ToBEndian() + PrintBytes(arr, " 1") +} + +func (p *P2) Print(name string) { + fmt.Printf("%s:\n", name) + aff := p.ToAffine() + aff.x.Print(" x") + aff.y.Print(" y") +} + +// +// Equality +// + +// TODO: replace with C functions + +func (s1 *Scalar) Equals(s2 *Scalar) bool { + equal := true + 
for i := 0; i < BLST_SCALAR_LIMBS; i++ { + if s1.l[i] != s2.l[i] { + equal = false + } + } + return equal +} + +func (e1 *Fp) Equals(e2 *Fp) bool { + equal := true + for i := 0; i < BLST_FP_LIMBS; i++ { + if e1.l[i] != e2.l[i] { + equal = false + } + } + return equal +} + +func (e1 *Fp2) Equals(e2 *Fp2) bool { + return (&(e1.fp[0])).Equals(&e2.fp[0]) && (&(e1.fp[1])).Equals(&e2.fp[1]) +} + +func (e1 *Fp6) Equals(e2 *Fp6) bool { + return (&(e1.fp2[0])).Equals(&e2.fp2[0]) && + (&(e1.fp2[1])).Equals(&e2.fp2[1]) && + (&(e1.fp2[2])).Equals(&e2.fp2[2]) +} + +func (e1 *P1Affine) Equals(e2 *P1Affine) bool { + return bool(C.blst_p1_affine_is_equal(e1, e2)) +} + +func (e1 *P2Affine) Equals(e2 *P2Affine) bool { + return bool(C.blst_p2_affine_is_equal(e1, e2)) +} diff --git a/bindings/go/blst.tgo b/bindings/go/blst.tgo new file mode 100644 index 00000000..c3233627 --- /dev/null +++ b/bindings/go/blst.tgo @@ -0,0 +1,135 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +package blst + +// #cgo CFLAGS: -I${SRCDIR}/.. -I${SRCDIR}/../../build -I${SRCDIR}/../../src -D__BLST_CGO__ -march=native -mno-avx +// #include "blst.h" +import "C" + +const BLST_SCALAR_BYTES = 256 / 8 +const BLST_SCALAR_LIMBS = 256 / 64 +const BLST_FP_BYTES = 384 / 8 +const BLST_FP_LIMBS = 384 / 64 +const BLST_P1_COMPRESS_BYTES = BLST_FP_BYTES +const BLST_P1_SERIALIZE_BYTES = BLST_FP_BYTES * 2 +const BLST_P2_COMPRESS_BYTES = BLST_FP_BYTES * 2 +const BLST_P2_SERIALIZE_BYTES = BLST_FP_BYTES * 4 + +type Scalar = C.blst_scalar +type Fp = C.blst_fp +type Fp2 = C.blst_fp2 +type Fp6 = C.blst_fp6 +type Fp12 = C.blst_fp12 +type P1 = C.blst_p1 +type P2 = C.blst_p2 +type P1Affine = C.blst_p1_affine +type P2Affine = C.blst_p2_affine +type Message = []byte +type Pairing = []uint64 +type SecretKey = Scalar + +// +// Secret key +// +func KeyGen(ikm []byte, optional ...[]byte) *SecretKey { + var sk SecretKey + var info []byte + var infoP *C.byte + if len(optional) > 0 { + info = optional[0] + infoP = (*C.byte)(&info[0]) + } + if len(ikm) < 32 { + return nil + } + C.blst_keygen(&sk, (*C.byte)(&ikm[0]), C.size_t(len(ikm)), + infoP, C.size_t(len(info))) + return &sk +} + +// +// Pairing +// +func PairingCtx() Pairing { + ctx := make([]uint64, C.blst_pairing_sizeof()/8) + C.blst_pairing_init((*C.blst_pairing)(&ctx[0])) + return ctx +} + +func PairingAggregatePkInG1(ctx Pairing, PK *P1Affine, sig *P2Affine, + hash_or_encode bool, msg []byte, optional ...[]byte) int { + var DST []byte + var uDST *C.byte + if len(optional) > 0 { + DST = optional[0] + uDST = (*C.byte)(&DST[0]) + } + var aug []byte + var uaug *C.byte + if len(optional) > 1 { + aug = optional[1] + if aug != nil { + uaug = (*C.byte)(&aug[0]) + } + } + var umsg *C.byte + if msg != nil { + umsg = (*C.byte)(&msg[0]) + } + + r := C.blst_pairing_aggregate_pk_in_g1((*C.blst_pairing)(&ctx[0]), + PK, sig, C.bool(hash_or_encode), + umsg, C.size_t(len(msg)), + uDST, C.size_t(len(DST)), + uaug, C.size_t(len(aug))) + + return int(r) +} + +func PairingAggregatePkInG2(ctx Pairing, PK *P2Affine, sig *P1Affine, + hash_or_encode bool, msg []byte, optional ...[]byte) int { + var DST []byte + var uDST *C.byte + if len(optional) > 0 { + DST = optional[0] + uDST = (*C.byte)(&DST[0]) + } + var aug []byte + var uaug *C.byte + if len(optional) > 1 { + aug = optional[1] + if aug != nil { + uaug = (*C.byte)(&aug[0]) + } + } + + r := C.blst_pairing_aggregate_pk_in_g2((*C.blst_pairing)(&ctx[0]), + PK, sig, 
C.bool(hash_or_encode), + (*C.byte)(&msg[0]), C.size_t(len(msg)), + uDST, C.size_t(len(DST)), + uaug, C.size_t(len(aug))) + + return int(r) +} + +func PairingCommit(ctx Pairing) { + C.blst_pairing_commit((*C.blst_pairing)(&ctx[0])) +} + +func PairingMerge(ctx Pairing, ctx1 Pairing) int { + r := C.blst_pairing_merge((*C.blst_pairing)(&ctx[0]), + (*C.blst_pairing)(&ctx1[0])) + return int(r) +} + +func PairingFinalVerify(ctx Pairing, optional ...*Fp12) bool { + var gtsig *Fp12 = nil + if len(optional) > 0 { + gtsig = optional[0] + } + return bool(C.blst_pairing_finalverify((*C.blst_pairing)(&ctx[0]), gtsig)) +} diff --git a/bindings/go/blst_minpk.tgo b/bindings/go/blst_minpk.tgo new file mode 100644 index 00000000..06207f71 --- /dev/null +++ b/bindings/go/blst_minpk.tgo @@ -0,0 +1,424 @@ + +import ( + "runtime" + "sync/atomic" +) + +// +// PublicKey +// + +func (pk *P1Affine) From(s *Scalar) *P1Affine { + C.blst_sk_to_pk2_in_g1(nil, pk, s) + return pk +} + +// +// Sign +// + +func (sig *P2Affine) Sign(sk *SecretKey, msg []byte, dst []byte, + optional ...interface{}) *P2Affine { + augSingle, aug, useHash, ok := parseOpts(optional...) + if !ok || len(aug) != 0 { + return nil + } + + var q *P2 + if useHash { + q = HashToG2(msg, dst, augSingle) + } else { + q = EncodeToG2(msg, dst, augSingle) + } + C.blst_sign_pk2_in_g1(nil, sig, q, sk) + return sig +} + +// +// Signature +// + +// Functions to return a signature and public key+augmentation tuple. +// This enables point decompression (if needed) to happen in parallel. +type sigGetterP2 func() *P2Affine +type pkGetterP1 func(i uint32, temp *P1Affine) (*P1Affine, []byte) + +// Single verify with decompressed pk +func (sig *P2Affine) Verify(pk *P1Affine, msg Message, dst []byte, + optional ...interface{}) bool { // useHash bool, aug []byte + + // CLEANUP!! + // Check for infinities (eth spec) + var zeroSig P2Affine + var zeroPk P1Affine + if pk.Equals(&zeroPk) && sig.Equals(&zeroSig) { + return true + } + // CLEANUP!! + + aug, _, useHash, ok := parseOpts(optional...) + if !ok { + return false + } + return sig.AggregateVerify([]*P1Affine{pk}, []Message{msg}, dst, + useHash, [][]byte{aug}) +} + +// Single verify with compressed pk +// Uses a dummy signature to get the correct type +func (dummy *P2Affine) VerifyCompressed(sig []byte, pk []byte, + msg Message, dst []byte, + optional ...bool) bool { // useHash bool, usePksAsAugs bool + + return dummy.AggregateVerifyCompressed(sig, [][]byte{pk}, + []Message{msg}, dst, optional...) +} + +// Aggregate verify with uncompressed signature and public keys +func (sig *P2Affine) AggregateVerify(pks []*P1Affine, msgs []Message, + dst []byte, + optional ...interface{}) bool { // useHash bool, augs [][]byte + + // sanity checks and argument parsing + if len(pks) != len(msgs) { + return false + } + _, augs, useHash, ok := parseOpts(optional...) 
+ useAugs := len(augs) != 0 + if !ok || (useAugs && len(augs) != len(msgs)) { + return false + } + + sigFn := func() *P2Affine { + return sig + } + + pkFn := func(i uint32, _ *P1Affine) (*P1Affine, []byte) { + if useAugs { + return pks[i], augs[i] + } else { + return pks[i], nil + } + } + + return coreAggregateVerifyPkInG1(sigFn, pkFn, msgs, dst, useHash) +} + +// Aggregate verify with compressed signature and public keys +// Uses a dummy signature to get the correct type +func (dummy *P2Affine) AggregateVerifyCompressed(sig []byte, pks [][]byte, + msgs []Message, dst []byte, + optional ...bool) bool { // useHash bool, usePksAsAugs bool + + // sanity checks and argument parsing + if len(pks) != len(msgs) { + return false + } + useHash := true + if len(optional) > 0 { + useHash = optional[0] + } + usePksAsAugs := false + if len(optional) > 1 { + usePksAsAugs = optional[1] + } + + sigFn := func() *P2Affine { + sigP := new(P2Affine) + if sig[0]&0x80 == 0 { + // Not compressed + if sigP.Deserialize(sig) == nil { + return nil + } + } else { + if sigP.Uncompress(sig) == nil { + return nil + } + } + return sigP + } + pkFn := func(i uint32, pk *P1Affine) (*P1Affine, []byte) { + bytes := pks[i] + if len(bytes) == 0 { + return nil, nil + } + if bytes[0]&0x80 == 0 { + // Not compressed + if pk.Deserialize(bytes) == nil { + return nil, nil + } + } else { + if pk.Uncompress(bytes) == nil { + return nil, nil + } + } + if usePksAsAugs { + return pk, bytes + } + return pk, nil + } + return coreAggregateVerifyPkInG1(sigFn, pkFn, msgs, dst, useHash) +} + +// TODO: check message uniqueness +func coreAggregateVerifyPkInG1(sigFn sigGetterP2, pkFn pkGetterP1, + msgs []Message, dst []byte, + optional ...bool) bool { // useHash + + n := len(msgs) + if n == 0 { + return true + } + + useHash := true + if len(optional) > 0 { + useHash = optional[0] + } + + numThreads := runtime.GOMAXPROCS(0) + if numThreads > n { + numThreads = n + } + // Each thread will determine next message to process by atomically + // incrementing curItem, process corresponding pk,msg[,aug] tuple and + // repeat until n is exceeded. The resulting accumulations will be + // fed into the msgsCh channel. 
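+ // Note: every goroutine below sends exactly one value on msgsCh (its + // committed pairing on success, or nil once valid has been cleared), so + // the collection loop that follows always receives numThreads results.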
+ msgsCh := make(chan Pairing, numThreads) + valid := int32(1) + curItem := uint32(0) + for tid := 0; tid < numThreads; tid++ { + go func() { + pairing := PairingCtx() + var temp P1Affine + for atomic.LoadInt32(&valid) > 0 { + // Get a work item + work := atomic.AddUint32(&curItem, 1) - 1 + if work >= uint32(n) { + break + } + + // pull Public Key and augmentation blob + curPk, aug := pkFn(work, &temp) + if curPk == nil { + atomic.StoreInt32(&valid, 0) + break + } + + // Pairing and accumulate + // TODO: delay subgroup check until miller loop by default + PairingAggregatePkInG1(pairing, curPk, nil, + useHash, msgs[work], dst, aug) + + // application might have some async work to do + runtime.Gosched() + } + if atomic.LoadInt32(&valid) > 0 { + PairingCommit(pairing) + msgsCh <- pairing + } else { + msgsCh <- nil + } + }() + } + + // Uncompress and check signature + var gtsig Fp12 + sig := sigFn() + if sig == nil { + atomic.StoreInt32(&valid, 0) + } else { + C.blst_aggregated_in_g2(&gtsig, sig) + } + + // Accumulate the thread results + var pairings Pairing + for i := 0; i < numThreads; i++ { + msg := <-msgsCh + if msg != nil { + if pairings == nil { + pairings = msg + } else { + PairingMerge(pairings, msg) + } + } + } + if atomic.LoadInt32(&valid) == 0 || pairings == nil { + return false + } + + return PairingFinalVerify(pairings, &gtsig) +} + +func (sig *P2Affine) FastAggregateVerify(pks []*P1Affine, msg Message, + dst []byte, optional ...interface{}) bool { + n := len(pks) + + // TODO: return value for length zero? + if n == 0 { + return false + } + + aggregator := new(P1Aggregate).Aggregate(pks) + if aggregator == nil { + return false + } + pkAff := aggregator.ToAffine() + + // Verify + return sig.Verify(pkAff, msg, dst, optional...) +} + +// +// Aggregate P2 +// + +type aggGetterP2 func(i uint32, temp *P2Affine) *P2Affine +type P2Aggregate struct { + v *P2 +} + +// Aggregate uncompressed elements +func (agg *P2Aggregate) Aggregate(elmts []*P2Affine) *P2Aggregate { + if len(elmts) == 0 { + return agg + } + getter := func(i uint32, _ *P2Affine) *P2Affine { return elmts[i] } + if !agg.aggregate(getter, len(elmts)) { + return nil + } + return agg +} + +// Aggregate compressed elements +func (agg *P2Aggregate) AggregateCompressed(elmts [][]byte) *P2Aggregate { + if len(elmts) == 0 { + return agg + } + getter := func(i uint32, p *P2Affine) *P2Affine { + bytes := elmts[i] + if len(bytes) == 0 { + return nil + } + if bytes[0]&0x80 == 0 { + // Not compressed + if p.Deserialize(bytes) == nil { + return nil + } + } else { + if p.Uncompress(bytes) == nil { + return nil + } + } + return p + } + if !agg.aggregate(getter, len(elmts)) { + return nil + } + return agg +} + +func (agg *P2Aggregate) AddAggregate(other *P2Aggregate) *P2Aggregate { + if other.v == nil { + // do nothing + } else if agg.v == nil { + agg.v = other.v + } else { + C.blst_p2_add(agg.v, agg.v, other.v) + } + return agg +} + +func (agg *P2Aggregate) Add(elmt *P2Affine) *P2Aggregate { + if agg.v == nil { + agg.v = new(P2) + C.blst_p2_from_affine(agg.v, elmt) + } else { + C.blst_p2_add_or_double_affine(agg.v, agg.v, elmt) + } + return agg +} + +func (agg *P2Aggregate) ToAffine() *P2Affine { + if agg.v == nil { + return nil + } + return agg.v.ToAffine() +} + +func (agg *P2Aggregate) aggregate(getter aggGetterP2, n int) bool { + if n == 0 { + return true + } + numThreads := runtime.GOMAXPROCS(0) + if numThreads > n { + numThreads = n + } + + valid := int32(1) + type result struct { + agg *P2 + empty bool + } + msgs := make(chan result,
numThreads) + curItem := uint32(0) + for tid := 0; tid < numThreads; tid++ { + go func() { + first := true + var agg P2 + var temp P2Affine + for atomic.LoadInt32(&valid) > 0 { + // Get a work item + work := atomic.AddUint32(&curItem, 1) - 1 + if work >= uint32(n) { + break + } + + // Signature validate + curElmt := getter(work, &temp) + if curElmt == nil { + atomic.StoreInt32(&valid, 0) + break + } + if first { + C.blst_p2_from_affine(&agg, curElmt) + first = false + } else { + C.blst_p2_add_or_double_affine(&agg, &agg, curElmt) + } + } + if first { + msgs <- result{nil, true} + } else if atomic.LoadInt32(&valid) > 0 { + msgs <- result{&agg, false} + } else { + msgs <- result{nil, false} + } + }() + } + + // Accumulate the thread results + first := agg.v == nil + validLocal := true + for i := 0; i < numThreads; i++ { + msg := <-msgs + if !validLocal || msg.empty { + // do nothing + } else if msg.agg == nil { + validLocal = false + // This should be unnecessary but seems safer + atomic.StoreInt32(&valid, 0) + } else { + if first { + agg.v = msg.agg + first = false + } else { + C.blst_p2_add(agg.v, agg.v, msg.agg) + } + } + } + if atomic.LoadInt32(&valid) == 0 { + agg.v = nil + return false + } + return true +} diff --git a/bindings/go/blst_minpk_test.go b/bindings/go/blst_minpk_test.go new file mode 100644 index 00000000..c532658b --- /dev/null +++ b/bindings/go/blst_minpk_test.go @@ -0,0 +1,349 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +package blst + +import ( + "crypto/rand" + "fmt" + "testing" +) + +// Min PK +type PublicKeyMinPk = P1Affine +type SignatureMinPk = P2Affine +type AggregateSignatureMinPk = P2Aggregate + +// Names in this file must be unique to support min-sig so we can't use 'dst' +// here. 
+var dstMinPk = []byte("BLS_SIG_BLS12381G2_XMD:SHA-256_SSWU_RO_NUL_") + +func TestInfinityMinPk(t *testing.T) { + var infComp [48]byte + infComp[0] |= 0xc0 + new(PublicKeyMinPk).Uncompress(infComp[:]) +} + +func TestSerdesMinPk(t *testing.T) { + var ikm = [...]byte{ + 0x93, 0xad, 0x7e, 0x65, 0xde, 0xad, 0x05, 0x2a, + 0x08, 0x3a, 0x91, 0x0c, 0x8b, 0x72, 0x85, 0x91, + 0x46, 0x4c, 0xca, 0x56, 0x60, 0x5b, 0xb0, 0x56, + 0xed, 0xfe, 0x2b, 0x60, 0xa6, 0x3c, 0x48, 0x99} + + sk := KeyGen(ikm[:]) + + // Serialize/deserialize sk + sk2 := new(SecretKey).Deserialize(sk.Serialize()) + if !sk.Equals(sk2) { + t.Errorf("sk2 != sk") + } + + // Negative test equals + sk.l[0] = sk.l[0] + 1 + if sk.Equals(sk2) { + t.Errorf("sk2 == sk") + } + + // pk + pk := new(PublicKeyMinPk).From(sk) + + // Compress/decompress sk + pk2 := new(PublicKeyMinPk).Uncompress(pk.Compress()) + if !pk.Equals(pk2) { + t.Errorf("pk2 != pk") + } + + // Serialize/deserialize sk + pk3 := new(PublicKeyMinPk).Deserialize(pk.Serialize()) + if !pk.Equals(pk3) { + t.Errorf("pk3 != pk") + } + + // Negative test equals + // pk.x.l[0] = pk.x.l[0] + 1 + // if pk.Equals(pk2) { + // t.Errorf("pk2 == pk") + // } +} + +func TestSignVerifyMinPk(t *testing.T) { + var ikm = [...]byte{ + 0x93, 0xad, 0x7e, 0x65, 0xde, 0xad, 0x05, 0x2a, + 0x08, 0x3a, 0x91, 0x0c, 0x8b, 0x72, 0x85, 0x91, + 0x46, 0x4c, 0xca, 0x56, 0x60, 0x5b, 0xb0, 0x56, + 0xed, 0xfe, 0x2b, 0x60, 0xa6, 0x3c, 0x48, 0x99} + + sk0 := KeyGen(ikm[:]) + ikm[0] = ikm[0] + 1 + sk1 := KeyGen(ikm[:]) + + // pk + pk0 := new(PublicKeyMinPk).From(sk0) + pk1 := new(PublicKeyMinPk).From(sk1) + + // Sign + msg0 := []byte("hello foo") + msg1 := []byte("hello bar!") + sig0 := new(SignatureMinPk).Sign(sk0, msg0, dstMinPk) + sig1 := new(SignatureMinPk).Sign(sk1, msg1, dstMinPk) + + // Verify + if !sig0.Verify(pk0, msg0, dstMinPk) { + t.Errorf("verify sig0") + } + if !sig1.Verify(pk1, msg1, dstMinPk) { + t.Errorf("verify sig1") + } + if !new(SignatureMinPk).VerifyCompressed(sig1.Compress(), pk1.Compress(), + msg1, dstMinPk) { + t.Errorf("verify sig1") + } + // Batch verify + if !sig0.AggregateVerify([]*PublicKeyMinPk{pk0}, []Message{msg0}, dstMinPk) { + t.Errorf("aggregate verify sig0") + } + // Verify compressed inputs + if !new(SignatureMinPk).AggregateVerifyCompressed(sig0.Compress(), + [][]byte{pk0.Compress()}, []Message{msg0}, dstMinPk) { + t.Errorf("aggregate verify sig0 compressed") + } + + // Verify serialized inputs + if !new(SignatureMinPk).AggregateVerifyCompressed(sig0.Serialize(), + [][]byte{pk0.Serialize()}, []Message{msg0}, dstMinPk) { + t.Errorf("aggregate verify sig0 serialized") + } + + // Compressed with empty pk + var emptyPk []byte + if new(SignatureMinPk).VerifyCompressed(sig0.Compress(), emptyPk, msg0, dstMinPk) { + t.Errorf("verify sig compressed inputs") + } + // Wrong message + if sig0.Verify(pk0, msg1, dstMinPk) { + t.Errorf("Expected Verify to return false") + } + // Wrong key + if sig0.Verify(pk1, msg0, dstMinPk) { + t.Errorf("Expected Verify to return false") + } + // Wrong sig + if sig1.Verify(pk0, msg0, dstMinPk) { + t.Errorf("Expected Verify to return false") + } +} + +func TestSignVerifyAugMinPk(t *testing.T) { + sk := genRandomKeyMinPk() + pk := new(PublicKeyMinPk).From(sk) + msg := []byte("hello foo") + aug := []byte("augmentation") + sig := new(SignatureMinPk).Sign(sk, msg, dstMinPk, aug) + if !sig.Verify(pk, msg, dstMinPk, aug) { + t.Errorf("verify sig") + } + aug2 := []byte("augmentation2") + if sig.Verify(pk, msg, dstMinPk, aug2) { + t.Errorf("verify sig, wrong 
augmentation") + } + if sig.Verify(pk, msg, dstMinPk) { + t.Errorf("verify sig, no augmentation") + } + // TODO: augmentation with aggregate verify +} + +func TestSignVerifyEncodeMinPk(t *testing.T) { + sk := genRandomKeyMinPk() + pk := new(PublicKeyMinPk).From(sk) + msg := []byte("hello foo") + sig := new(SignatureMinPk).Sign(sk, msg, dstMinPk, false) + if !sig.Verify(pk, msg, dstMinPk, false) { + t.Errorf("verify sig") + } + if sig.Verify(pk, msg, dstMinPk) { + t.Errorf("verify sig expected fail, wrong hashing engine") + } + if sig.Verify(pk, msg, dstMinPk, 0) { + t.Errorf("verify sig expected fail, illegal argument") + } +} + +func TestSignVerifyAggregateMinPk(t *testing.T) { + for size := 1; size < 20; size++ { + sks, msgs, _, pubks, _ := generateBatchTestDataUncompressedMinPk(size) + + // All signers sign the same message + sigs := make([]*SignatureMinPk, 0) + for i := 0; i < size; i++ { + sigs = append(sigs, new(SignatureMinPk).Sign(sks[i], msgs[0], + dstMinPk)) + } + agSig := new(AggregateSignatureMinPk).Aggregate(sigs).ToAffine() + + if !agSig.FastAggregateVerify(pubks, msgs[0], dstMinPk) { + t.Errorf("failed to verify size %d", size) + } + + // Test compressed/serialized signature aggregation + compSigs := make([][]byte, size) + for i := 0; i < size; i++ { + if (i % 2) == 0 { + compSigs[i] = sigs[i].Compress() + } else { + compSigs[i] = sigs[i].Serialize() + } + } + agSig = new(AggregateSignatureMinPk).AggregateCompressed(compSigs). + ToAffine() + if !agSig.FastAggregateVerify(pubks, msgs[0], dstMinPk) { + t.Errorf("failed to verify size %d", size) + } + + } +} + +func BenchmarkCoreSignMinPk(b *testing.B) { + var ikm = [...]byte{ + 0x93, 0xad, 0x7e, 0x65, 0xde, 0xad, 0x05, 0x2a, + 0x08, 0x3a, 0x91, 0x0c, 0x8b, 0x72, 0x85, 0x91, + 0x46, 0x4c, 0xca, 0x56, 0x60, 0x5b, 0xb0, 0x56, + 0xed, 0xfe, 0x2b, 0x60, 0xa6, 0x3c, 0x48, 0x99} + + sk := KeyGen(ikm[:]) + msg := []byte("hello foo") + for i := 0; i < b.N; i++ { + new(SignatureMinPk).Sign(sk, msg, dstMinPk) + } +} + +func BenchmarkCoreVerifyMinPk(b *testing.B) { + var ikm = [...]byte{ + 0x93, 0xad, 0x7e, 0x65, 0xde, 0xad, 0x05, 0x2a, + 0x08, 0x3a, 0x91, 0x0c, 0x8b, 0x72, 0x85, 0x91, + 0x46, 0x4c, 0xca, 0x56, 0x60, 0x5b, 0xb0, 0x56, + 0xed, 0xfe, 0x2b, 0x60, 0xa6, 0x3c, 0x48, 0x99} + + sk := KeyGen(ikm[:]) + pk := new(PublicKeyMinPk).From(sk) + msg := []byte("hello foo") + sig := new(SignatureMinPk).Sign(sk, msg, dstMinPk) + + // Verify + for i := 0; i < b.N; i++ { + if !sig.Verify(pk, msg, dstMinPk) { + b.Fatal("verify sig") + } + } +} + +func BenchmarkCoreVerifyAggregateMinPk(b *testing.B) { + run := func(size int) func(b *testing.B) { + return func(b *testing.B) { + msgs, _, pubks, agsig := generateBatchTestDataMinPk(size) + b.ResetTimer() + for i := 0; i < b.N; i++ { + if !new(SignatureMinPk).AggregateVerifyCompressed(agsig, pubks, + msgs, dstMinPk) { + b.Fatal("failed to verify") + } + } + } + } + + b.Run("1", run(1)) + b.Run("10", run(10)) + b.Run("50", run(50)) + b.Run("100", run(100)) + b.Run("300", run(300)) + b.Run("1000", run(1000)) + b.Run("4000", run(4000)) +} + +func BenchmarkVerifyAggregateUncompressedMinPk(b *testing.B) { + run := func(size int) func(b *testing.B) { + return func(b *testing.B) { + _, msgs, _, pubks, agsig := + generateBatchTestDataUncompressedMinPk(size) + b.ResetTimer() + for i := 0; i < b.N; i++ { + if !agsig.AggregateVerify(pubks, msgs, dstMinPk) { + b.Fatal("failed to verify") + } + } + } + } + + b.Run("1", run(1)) + b.Run("10", run(10)) + b.Run("50", run(50)) + b.Run("100", run(100)) + b.Run("300", 
run(300)) + b.Run("1000", run(1000)) + b.Run("4000", run(4000)) +} + +func BenchmarkCoreAggregateMinPk(b *testing.B) { + run := func(size int) func(b *testing.B) { + return func(b *testing.B) { + _, sigs, _, _ := generateBatchTestDataMinPk(size) + b.ResetTimer() + for i := 0; i < b.N; i++ { + var agg AggregateSignatureMinPk + agg.AggregateCompressed(sigs) + } + } + } + + b.Run("1", run(1)) + b.Run("10", run(10)) + b.Run("50", run(50)) + b.Run("100", run(100)) + b.Run("300", run(300)) + b.Run("1000", run(1000)) + b.Run("4000", run(4000)) +} + +func genRandomKeyMinPk() *SecretKey { + // Generate 32 bytes of randomness + var ikm [32]byte + _, err := rand.Read(ikm[:]) + + if err != nil { + return nil + } + return KeyGen(ikm[:]) +} + +func generateBatchTestDataMinPk(size int) (msgs []Message, + sigs [][]byte, pubks [][]byte, agsig []byte) { + for i := 0; i < size; i++ { + msg := Message(fmt.Sprintf("blst is a blast!! %d", i)) + msgs = append(msgs, msg) + priv := genRandomKeyMinPk() + sigs = append(sigs, new(SignatureMinPk).Sign(priv, msg, dstMinPk). + Compress()) + pubks = append(pubks, new(PublicKeyMinPk).From(priv).Compress()) + } + agsig = new(AggregateSignatureMinPk).AggregateCompressed(sigs).ToAffine(). + Compress() + return +} + +func generateBatchTestDataUncompressedMinPk(size int) (sks []*SecretKey, + msgs []Message, sigs []*SignatureMinPk, pubks []*PublicKeyMinPk, + agsig *SignatureMinPk) { + for i := 0; i < size; i++ { + msg := Message(fmt.Sprintf("blst is a blast!! %d", i)) + msgs = append(msgs, msg) + priv := genRandomKeyMinPk() + sks = append(sks, priv) + sigs = append(sigs, new(SignatureMinPk).Sign(priv, msg, dstMinPk)) + pubks = append(pubks, new(PublicKeyMinPk).From(priv)) + } + agsig = new(AggregateSignatureMinPk).Aggregate(sigs).ToAffine() + return +} diff --git a/bindings/go/blst_minsig_test.go b/bindings/go/blst_minsig_test.go new file mode 100644 index 00000000..df79b226 --- /dev/null +++ b/bindings/go/blst_minsig_test.go @@ -0,0 +1,353 @@ +//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +// DO NOT EDIT THIS FILE!! +// The file is generated from blst_minpk_test.go by generate.py +//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +package blst + +import ( + "crypto/rand" + "fmt" + "testing" +) + +// Min PK +type PublicKeyMinSig = P2Affine +type SignatureMinSig = P1Affine +type AggregateSignatureMinSig = P1Aggregate + +// Names in this file must be unique to support min-sig so we can't use 'dst' +// here. 
+var dstMinSig = []byte("BLS_SIG_BLS12381G1_XMD:SHA-256_SSWU_RO_NUL_") + +func TestInfinityMinSig(t *testing.T) { + var infComp [48]byte + infComp[0] |= 0xc0 + new(PublicKeyMinSig).Uncompress(infComp[:]) +} + +func TestSerdesMinSig(t *testing.T) { + var ikm = [...]byte{ + 0x93, 0xad, 0x7e, 0x65, 0xde, 0xad, 0x05, 0x2a, + 0x08, 0x3a, 0x91, 0x0c, 0x8b, 0x72, 0x85, 0x91, + 0x46, 0x4c, 0xca, 0x56, 0x60, 0x5b, 0xb0, 0x56, + 0xed, 0xfe, 0x2b, 0x60, 0xa6, 0x3c, 0x48, 0x99} + + sk := KeyGen(ikm[:]) + + // Serialize/deserialize sk + sk2 := new(SecretKey).Deserialize(sk.Serialize()) + if !sk.Equals(sk2) { + t.Errorf("sk2 != sk") + } + + // Negative test equals + sk.l[0] = sk.l[0] + 1 + if sk.Equals(sk2) { + t.Errorf("sk2 == sk") + } + + // pk + pk := new(PublicKeyMinSig).From(sk) + + // Compress/decompress sk + pk2 := new(PublicKeyMinSig).Uncompress(pk.Compress()) + if !pk.Equals(pk2) { + t.Errorf("pk2 != pk") + } + + // Serialize/deserialize sk + pk3 := new(PublicKeyMinSig).Deserialize(pk.Serialize()) + if !pk.Equals(pk3) { + t.Errorf("pk3 != pk") + } + + // Negative test equals + // pk.x.l[0] = pk.x.l[0] + 1 + // if pk.Equals(pk2) { + // t.Errorf("pk2 == pk") + // } +} + +func TestSignVerifyMinSig(t *testing.T) { + var ikm = [...]byte{ + 0x93, 0xad, 0x7e, 0x65, 0xde, 0xad, 0x05, 0x2a, + 0x08, 0x3a, 0x91, 0x0c, 0x8b, 0x72, 0x85, 0x91, + 0x46, 0x4c, 0xca, 0x56, 0x60, 0x5b, 0xb0, 0x56, + 0xed, 0xfe, 0x2b, 0x60, 0xa6, 0x3c, 0x48, 0x99} + + sk0 := KeyGen(ikm[:]) + ikm[0] = ikm[0] + 1 + sk1 := KeyGen(ikm[:]) + + // pk + pk0 := new(PublicKeyMinSig).From(sk0) + pk1 := new(PublicKeyMinSig).From(sk1) + + // Sign + msg0 := []byte("hello foo") + msg2 := []byte("hello bar!") + sig0 := new(SignatureMinSig).Sign(sk0, msg0, dstMinSig) + sig2 := new(SignatureMinSig).Sign(sk1, msg2, dstMinSig) + + // Verify + if !sig0.Verify(pk0, msg0, dstMinSig) { + t.Errorf("verify sig0") + } + if !sig2.Verify(pk1, msg2, dstMinSig) { + t.Errorf("verify sig2") + } + if !new(SignatureMinSig).VerifyCompressed(sig2.Compress(), pk1.Compress(), + msg2, dstMinSig) { + t.Errorf("verify sig2") + } + // Batch verify + if !sig0.AggregateVerify([]*PublicKeyMinSig{pk0}, []Message{msg0}, dstMinSig) { + t.Errorf("aggregate verify sig0") + } + // Verify compressed inputs + if !new(SignatureMinSig).AggregateVerifyCompressed(sig0.Compress(), + [][]byte{pk0.Compress()}, []Message{msg0}, dstMinSig) { + t.Errorf("aggregate verify sig0 compressed") + } + + // Verify serialized inputs + if !new(SignatureMinSig).AggregateVerifyCompressed(sig0.Serialize(), + [][]byte{pk0.Serialize()}, []Message{msg0}, dstMinSig) { + t.Errorf("aggregate verify sig0 serialized") + } + + // Compressed with empty pk + var emptyPk []byte + if new(SignatureMinSig).VerifyCompressed(sig0.Compress(), emptyPk, msg0, dstMinSig) { + t.Errorf("verify sig compressed inputs") + } + // Wrong message + if sig0.Verify(pk0, msg2, dstMinSig) { + t.Errorf("Expected Verify to return false") + } + // Wrong key + if sig0.Verify(pk1, msg0, dstMinSig) { + t.Errorf("Expected Verify to return false") + } + // Wrong sig + if sig2.Verify(pk0, msg0, dstMinSig) { + t.Errorf("Expected Verify to return false") + } +} + +func TestSignVerifyAugMinSig(t *testing.T) { + sk := genRandomKeyMinSig() + pk := new(PublicKeyMinSig).From(sk) + msg := []byte("hello foo") + aug := []byte("augmentation") + sig := new(SignatureMinSig).Sign(sk, msg, dstMinSig, aug) + if !sig.Verify(pk, msg, dstMinSig, aug) { + t.Errorf("verify sig") + } + aug1 := []byte("augmentation2") + if sig.Verify(pk, msg, dstMinSig, aug1) { + 
t.Errorf("verify sig, wrong augmentation") + } + if sig.Verify(pk, msg, dstMinSig) { + t.Errorf("verify sig, no augmentation") + } + // TODO: augmentation with aggregate verify +} + +func TestSignVerifyEncodeMinSig(t *testing.T) { + sk := genRandomKeyMinSig() + pk := new(PublicKeyMinSig).From(sk) + msg := []byte("hello foo") + sig := new(SignatureMinSig).Sign(sk, msg, dstMinSig, false) + if !sig.Verify(pk, msg, dstMinSig, false) { + t.Errorf("verify sig") + } + if sig.Verify(pk, msg, dstMinSig) { + t.Errorf("verify sig expected fail, wrong hashing engine") + } + if sig.Verify(pk, msg, dstMinSig, 0) { + t.Errorf("verify sig expected fail, illegal argument") + } +} + +func TestSignVerifyAggregateMinSig(t *testing.T) { + for size := 1; size < 20; size++ { + sks, msgs, _, pubks, _ := generateBatchTestDataUncompressedMinSig(size) + + // All signers sign the same message + sigs := make([]*SignatureMinSig, 0) + for i := 0; i < size; i++ { + sigs = append(sigs, new(SignatureMinSig).Sign(sks[i], msgs[0], + dstMinSig)) + } + agSig := new(AggregateSignatureMinSig).Aggregate(sigs).ToAffine() + + if !agSig.FastAggregateVerify(pubks, msgs[0], dstMinSig) { + t.Errorf("failed to verify size %d", size) + } + + // Test compressed/serialized signature aggregation + compSigs := make([][]byte, size) + for i := 0; i < size; i++ { + if (i % 2) == 0 { + compSigs[i] = sigs[i].Compress() + } else { + compSigs[i] = sigs[i].Serialize() + } + } + agSig = new(AggregateSignatureMinSig).AggregateCompressed(compSigs). + ToAffine() + if !agSig.FastAggregateVerify(pubks, msgs[0], dstMinSig) { + t.Errorf("failed to verify size %d", size) + } + + } +} + +func BenchmarkCoreSignMinSig(b *testing.B) { + var ikm = [...]byte{ + 0x93, 0xad, 0x7e, 0x65, 0xde, 0xad, 0x05, 0x2a, + 0x08, 0x3a, 0x91, 0x0c, 0x8b, 0x72, 0x85, 0x91, + 0x46, 0x4c, 0xca, 0x56, 0x60, 0x5b, 0xb0, 0x56, + 0xed, 0xfe, 0x2b, 0x60, 0xa6, 0x3c, 0x48, 0x99} + + sk := KeyGen(ikm[:]) + msg := []byte("hello foo") + for i := 0; i < b.N; i++ { + new(SignatureMinSig).Sign(sk, msg, dstMinSig) + } +} + +func BenchmarkCoreVerifyMinSig(b *testing.B) { + var ikm = [...]byte{ + 0x93, 0xad, 0x7e, 0x65, 0xde, 0xad, 0x05, 0x2a, + 0x08, 0x3a, 0x91, 0x0c, 0x8b, 0x72, 0x85, 0x91, + 0x46, 0x4c, 0xca, 0x56, 0x60, 0x5b, 0xb0, 0x56, + 0xed, 0xfe, 0x2b, 0x60, 0xa6, 0x3c, 0x48, 0x99} + + sk := KeyGen(ikm[:]) + pk := new(PublicKeyMinSig).From(sk) + msg := []byte("hello foo") + sig := new(SignatureMinSig).Sign(sk, msg, dstMinSig) + + // Verify + for i := 0; i < b.N; i++ { + if !sig.Verify(pk, msg, dstMinSig) { + b.Fatal("verify sig") + } + } +} + +func BenchmarkCoreVerifyAggregateMinSig(b *testing.B) { + run := func(size int) func(b *testing.B) { + return func(b *testing.B) { + msgs, _, pubks, agsig := generateBatchTestDataMinSig(size) + b.ResetTimer() + for i := 0; i < b.N; i++ { + if !new(SignatureMinSig).AggregateVerifyCompressed(agsig, pubks, + msgs, dstMinSig) { + b.Fatal("failed to verify") + } + } + } + } + + b.Run("1", run(1)) + b.Run("10", run(10)) + b.Run("50", run(50)) + b.Run("100", run(100)) + b.Run("300", run(300)) + b.Run("1000", run(1000)) + b.Run("4000", run(4000)) +} + +func BenchmarkVerifyAggregateUncompressedMinSig(b *testing.B) { + run := func(size int) func(b *testing.B) { + return func(b *testing.B) { + _, msgs, _, pubks, agsig := + generateBatchTestDataUncompressedMinSig(size) + b.ResetTimer() + for i := 0; i < b.N; i++ { + if !agsig.AggregateVerify(pubks, msgs, dstMinSig) { + b.Fatal("failed to verify") + } + } + } + } + + b.Run("1", run(1)) + b.Run("10", run(10)) + 
b.Run("50", run(50)) + b.Run("100", run(100)) + b.Run("300", run(300)) + b.Run("1000", run(1000)) + b.Run("4000", run(4000)) +} + +func BenchmarkCoreAggregateMinSig(b *testing.B) { + run := func(size int) func(b *testing.B) { + return func(b *testing.B) { + _, sigs, _, _ := generateBatchTestDataMinSig(size) + b.ResetTimer() + for i := 0; i < b.N; i++ { + var agg AggregateSignatureMinSig + agg.AggregateCompressed(sigs) + } + } + } + + b.Run("1", run(1)) + b.Run("10", run(10)) + b.Run("50", run(50)) + b.Run("100", run(100)) + b.Run("300", run(300)) + b.Run("1000", run(1000)) + b.Run("4000", run(4000)) +} + +func genRandomKeyMinSig() *SecretKey { + // Generate 32 bytes of randomness + var ikm [32]byte + _, err := rand.Read(ikm[:]) + + if err != nil { + return nil + } + return KeyGen(ikm[:]) +} + +func generateBatchTestDataMinSig(size int) (msgs []Message, + sigs [][]byte, pubks [][]byte, agsig []byte) { + for i := 0; i < size; i++ { + msg := Message(fmt.Sprintf("blst is a blast!! %d", i)) + msgs = append(msgs, msg) + priv := genRandomKeyMinSig() + sigs = append(sigs, new(SignatureMinSig).Sign(priv, msg, dstMinSig). + Compress()) + pubks = append(pubks, new(PublicKeyMinSig).From(priv).Compress()) + } + agsig = new(AggregateSignatureMinSig).AggregateCompressed(sigs).ToAffine(). + Compress() + return +} + +func generateBatchTestDataUncompressedMinSig(size int) (sks []*SecretKey, + msgs []Message, sigs []*SignatureMinSig, pubks []*PublicKeyMinSig, + agsig *SignatureMinSig) { + for i := 0; i < size; i++ { + msg := Message(fmt.Sprintf("blst is a blast!! %d", i)) + msgs = append(msgs, msg) + priv := genRandomKeyMinSig() + sks = append(sks, priv) + sigs = append(sigs, new(SignatureMinSig).Sign(priv, msg, dstMinSig)) + pubks = append(pubks, new(PublicKeyMinSig).From(priv)) + } + agsig = new(AggregateSignatureMinSig).Aggregate(sigs).ToAffine() + return +} diff --git a/bindings/go/blst_misc.tgo b/bindings/go/blst_misc.tgo new file mode 100644 index 00000000..48f06435 --- /dev/null +++ b/bindings/go/blst_misc.tgo @@ -0,0 +1,159 @@ + +import "fmt" + +// Parse out optional arguments for sign and verify. +// aug []byte - augmentation bytes (default: nil) +func parseOpts(optional ...interface{}) ([]byte, [][]byte, bool, bool) { + var aug [][]byte // For aggregate verify + var augSingle []byte // For signing + useHash := true // hash (true), encode (false) + + for _, arg := range optional { + switch v := arg.(type) { + case []byte: + augSingle = v + case [][]byte: + aug = v + case bool: + useHash = v + default: + return nil, nil, useHash, false + } + } + return augSingle, aug, useHash, true +} + +// +// Serialization/Deserialization. 
+// + +// Scalar serdes +func (s *Scalar) Serialize() []byte { + var out [BLST_SCALAR_BYTES]byte + C.blst_bendian_from_scalar((*C.byte)(&out[0]), s) + return out[:] +} + +func (s *Scalar) Deserialize(in []byte) *Scalar { + if len(in) != BLST_SCALAR_BYTES { + return nil + } + C.blst_scalar_from_bendian(s, (*C.byte)(&in[0])) + if !C.blst_scalar_fr_check(s) { + return nil + } + return s +} + +// +// LEndian +// + +func (fr *Scalar) ToLEndian() []byte { + var arr [BLST_SCALAR_BYTES]byte + C.blst_lendian_from_scalar((*C.byte)(&arr[0]), fr) + return arr[:] +} + +func (fp *Fp) ToLEndian() []byte { + var arr [BLST_FP_BYTES]byte + C.blst_lendian_from_fp((*C.byte)(&arr[0]), fp) + return arr[:] +} + +// +// BEndian +// + +func (fr *Scalar) ToBEndian() []byte { + var arr [BLST_SCALAR_BYTES]byte + C.blst_bendian_from_scalar((*C.byte)(&arr[0]), fr) + return arr[:] +} + +func (fp *Fp) ToBEndian() []byte { + var arr [BLST_FP_BYTES]byte + C.blst_bendian_from_fp((*C.byte)(&arr[0]), fp) + return arr[:] +} + +// +// Printing +// + +func PrintBytes(val []byte, name string) { + fmt.Printf("%s = %02x\n", name, val) +} + +func (s *Scalar) Print(name string) { + arr := s.ToBEndian() + PrintBytes(arr[:], name) +} + +func (p *P1) Print(name string) { + fmt.Printf("%s:\n", name) + aff := p.ToAffine() + arr := aff.x.ToBEndian() + PrintBytes(arr, " x") + arr = aff.y.ToBEndian() + PrintBytes(arr, " y") +} + +func (f *Fp2) Print(name string) { + fmt.Printf("%s:\n", name) + arr := f.fp[0].ToBEndian() + PrintBytes(arr, " 0") + arr = f.fp[1].ToBEndian() + PrintBytes(arr, " 1") +} + +func (p *P2) Print(name string) { + fmt.Printf("%s:\n", name) + aff := p.ToAffine() + aff.x.Print(" x") + aff.y.Print(" y") +} + +// +// Equality +// + +// TODO: replace with C functions + +func (s1 *Scalar) Equals(s2 *Scalar) bool { + equal := true + for i := 0; i < BLST_SCALAR_LIMBS; i++ { + if s1.l[i] != s2.l[i] { + equal = false + } + } + return equal +} + +func (e1 *Fp) Equals(e2 *Fp) bool { + equal := true + for i := 0; i < BLST_FP_LIMBS; i++ { + if e1.l[i] != e2.l[i] { + equal = false + } + } + return equal +} + +func (e1 *Fp2) Equals(e2 *Fp2) bool { + return (&(e1.fp[0])).Equals(&e2.fp[0]) && (&(e1.fp[1])).Equals(&e2.fp[1]) +} + +func (e1 *Fp6) Equals(e2 *Fp6) bool { + return (&(e1.fp2[0])).Equals(&e2.fp2[0]) && + (&(e1.fp2[1])).Equals(&e2.fp2[1]) && + (&(e1.fp2[2])).Equals(&e2.fp2[2]) +} + +func (e1 *P1Affine) Equals(e2 *P1Affine) bool { + return bool(C.blst_p1_affine_is_equal(e1, e2)) +} + +func (e1 *P2Affine) Equals(e2 *P2Affine) bool { + return bool(C.blst_p2_affine_is_equal(e1, e2)) +} diff --git a/bindings/go/blst_px.tgo b/bindings/go/blst_px.tgo new file mode 100644 index 00000000..435eb023 --- /dev/null +++ b/bindings/go/blst_px.tgo @@ -0,0 +1,122 @@ + +// +// Serialization/Deserialization. +// + +// P1 Serdes +func (p1 *P1Affine) Serialize() []byte { + var out [BLST_P1_SERIALIZE_BYTES]byte + C.blst_p1_affine_serialize((*C.byte)(&out[0]), p1) + return out[:] +} + +func (p1 *P1Affine) Deserialize(in []byte) *P1Affine { + if len(in) != BLST_P1_SERIALIZE_BYTES { + return nil + } + if C.blst_p1_deserialize(p1, + (*C.byte)(&in[0])) != C.BLST_SUCCESS { + return nil + } + // CLEANUP!! + // Check for infinities (eth spec) + var zero P1Affine + if p1.Equals(&zero) { + return p1 + } + // CLEANUP!! 
+ + if !bool(C.blst_p1_affine_in_g1(p1)) { + return nil + } + return p1 +} +func (p1 *P1Affine) Compress() []byte { + var out [BLST_P1_COMPRESS_BYTES]byte + C.blst_p1_affine_compress((*C.byte)(&out[0]), p1) + return out[:] +} + +func (p1 *P1Affine) Uncompress(in []byte) *P1Affine { + if len(in) != BLST_P1_COMPRESS_BYTES { + return nil + } + if C.blst_p1_uncompress(p1, + (*C.byte)(&in[0])) != C.BLST_SUCCESS { + return nil + } + // CLEANUP!! + // Check for infinities (eth spec) + var zero P1Affine + if p1.Equals(&zero) { + return p1 + } + // CLEANUP!! + + if !bool(C.blst_p1_affine_in_g1(p1)) { + return nil + } + return p1 +} +func (p1 *P1) Serialize() []byte { + var out [BLST_P1_SERIALIZE_BYTES]byte + C.blst_p1_serialize((*C.byte)(&out[0]), p1) + return out[:] +} +func (p1 *P1) Compress() []byte { + var out [BLST_P1_COMPRESS_BYTES]byte + C.blst_p1_compress((*C.byte)(&out[0]), p1) + return out[:] +} + +// +// Affine +// + +func (p *P1) ToAffine() *P1Affine { + var pa P1Affine + C.blst_p1_to_affine(&pa, p) + return &pa +} + +// +// Hash +// +func HashToG1(msg []byte, dst []byte, optional ...[]byte) *P1 { + var q P1 + + var aug []byte + var uaug *C.byte + if len(optional) > 0 { + aug = optional[0] + if len(aug) > 0 { + uaug = (*C.byte)(&aug[0]) + } + } + + C.blst_hash_to_g1(&q, + (*C.byte)(&msg[0]), C.size_t(len(msg)), + (*C.byte)(&dst[0]), C.size_t(len(dst)), + uaug, C.size_t(len(aug))) + return &q +} + +func EncodeToG1(msg []byte, dst []byte, optional ...[]byte) *P1 { + var q P1 + + var aug []byte + var uaug *C.byte + if len(optional) > 0 { + aug = optional[0] + if len(aug) > 0 { + uaug = (*C.byte)(&aug[0]) + } + } + + C.blst_encode_to_g1(&q, + (*C.byte)(&msg[0]), C.size_t(len(msg)), + (*C.byte)(&dst[0]), C.size_t(len(dst)), + uaug, C.size_t(len(aug))) + return &q +} + diff --git a/bindings/go/generate.py b/bindings/go/generate.py new file mode 100755 index 00000000..1d645bbc --- /dev/null +++ b/bindings/go/generate.py @@ -0,0 +1,122 @@ +#!/usr/bin/env python3 + +import os +import sys +import re + +here = re.split(r'/(?=[^/]*$)',sys.argv[0]) +if len(here) > 1: + os.chdir(here[0]) + +for dir in re.split(r':', os.getenv("GOPATH")): + goimports = dir + "/bin/goimports" + if os.path.isfile(goimports) and os.access(goimports, os.X_OK): + break + goimports = None + +if goimports == None: + print("goimports is not found on $GOPATH", file=sys.stderr) + print("install with 'go get golang.org/x/tools/cmd/goimports'", file=sys.stderr) + sys.exit(1) + +outFile = 'blst.go' + +def concatFile(fout, fin, removeImports): + for line in fin: + if removeImports and 'import' in line: + while ')' not in line: + line = fin.readline() + continue + print(line, file=fout, end='') + +def remap(fout, fin, mapping, dont_touch, removeImports): + for line in fin: + if removeImports and 'import' in line: + while ')' not in line: + line = fin.readline() + continue + for (a, b) in dont_touch: + line = line.replace(a, b) + + for (a, b) in mapping: + line = line.replace(a, a+"_tmp") + line = line.replace(b, b+"_tmp") + line = line.replace(a+"_tmp", b) + line = line.replace(b+"_tmp", a) + + for (a, b) in dont_touch: + line = line.replace(b, a) + print(line, file=fout, end='') + + +fout = open(outFile, "w") + +print("//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!", file=fout) +print("// DO NOT EDIT THIS FILE!!", file=fout) +print("// The file is generated from *.tgo by " + here[-1], file=fout) +print("//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!", file=fout) + +fin = open('blst.tgo', "r") +concatFile(fout, fin, False) 
+fin.close() + +# min-pk +print("//", file=fout) +print("// MIN-PK", file=fout) +print("//", file=fout) + +fin = open('blst_minpk.tgo', "r") +concatFile(fout, fin, True) +fin.close() + +# These are strings that overlap with the mapping names but we don't +# actually want to change. The second value should be a unique string. +dont_touch = (('Fp12', 'foo1234'),) + +# We're going to swap these names to get from min-pk to min-sig +mapping = [('P1', 'P2'), + ('p1', 'p2'), + ('G1', 'G2'), + ('g1', 'g2') + ] + +# min-sig +print("//", file=fout) +print("// MIN-SIG", file=fout) +print("//", file=fout) + +with open('blst_minpk.tgo', "r") as fin: + remap(fout, fin, mapping, dont_touch, True) + +# serdes and other functions +fin = open('blst_px.tgo', "r") +concatFile(fout, fin, True) +fin.close() + +with open('blst_px.tgo', "r") as fin: + remap(fout, fin, mapping, dont_touch, True) + +# final code +fin = open('blst_misc.tgo', "r") +concatFile(fout, fin, True) +fin.close() + +fout.close() + +# Use goimports to generate the import list +os.system(goimports + " -w blst.go") + + +# Generate min-sig tests +fout = open('blst_minsig_test.go', "w") +print("//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!", file=fout) +print("// DO NOT EDIT THIS FILE!!", file=fout) +print("// The file is generated from blst_minpk_test.go by " + here[-1], file=fout) +print("//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!", file=fout) + +mapping.append(('MinPk', 'MinSig')) + +with open('blst_minpk_test.go', "r") as fin: + remap(fout, fin, mapping, dont_touch, False) +fout.close() + diff --git a/bindings/go/server.c b/bindings/go/server.c new file mode 100644 index 00000000..73938dc7 --- /dev/null +++ b/bindings/go/server.c @@ -0,0 +1,20 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "keygen.c" +#include "hash_to_field.c" +#include "e1.c" +#include "exp.c" +#include "map_to_g1.c" +#include "e2.c" +#include "exp2.c" +#include "map_to_g2.c" +#include "fp12_tower.c" +#include "pairing.c" +#include "aggregate.c" +#include "consts.c" +#include "vect.c" +#include "exports.c" diff --git a/bindings/rust/Cargo.toml b/bindings/rust/Cargo.toml new file mode 100644 index 00000000..8bb74d2f --- /dev/null +++ b/bindings/rust/Cargo.toml @@ -0,0 +1,30 @@ +[package] +name = "blst" +version = "0.1.0" +authors = ["sean-sn "] +edition = "2018" +license = "Apache-2.0" +description = "Bindings for blst BLS12-381 library" +repository = "https://github.com/supranational/blst" +readme = "README.md" + +[build-dependencies] +#cc = "1.0.54" +cc = "1.0" +bindgen = "0.54.0" + +[dependencies] +rayon = "1.3" +threadpool = "^1.8.1" + +[dev-dependencies] +rand = "0.7" +rand_chacha = "0.2" +criterion = "0.3" + +[[bench]] +name = "blst_benches" +harness = false + +[profile.release] +#opt-level = 3 diff --git a/bindings/rust/README.md b/bindings/rust/README.md new file mode 100644 index 00000000..ea7b0cab --- /dev/null +++ b/bindings/rust/README.md @@ -0,0 +1,55 @@ +# blst + +The `blst` crate provides a rust interface to the blst BLS12-381 signature library. + +## Build +The build process uses [bindgen](https://github.com/rust-lang/rust-bindgen) on the blst.h C header file to automatically create the FFI bindings to blst. Currently [build.rs](https://github.com/supranational/blst/blob/master/bindings/rust/build.rs) also runs the assembly generation scripts and compiles everything into libblst.a within the rust target build area. 
Alternatively this can be modified to either call the appropriate build script in blst base directory or simply link to a prebuilt libblst.a. As more platforms are tested and feedback collected, this process may change. + +Everything can be built and run with the typical cargo commands: + +``` +cargo test +cargo bench +``` + +**Note this has primarily been tested on Ubuntu and may require further work for other operating systems.** + +## Usage +There are two primary modes of operation that can be chosen based on declaration path: + +For minimal-pubkey-size operations: +``` +use blst::min_pk::* +``` + +For minimal-signature-size operations: +``` +use blst::min_sig::* +``` + +There are five structs with inherent implementations that provide the BLS12-381 signature functionality. +``` +SecretKey +PublicKey +AggregatePublicKey +Signature +AggregateSignature +``` + +A simple example for generating a key, signing a message, and verifying the message: +``` +let mut ikm = [0u8; 32]; +rng.fill_bytes(&mut ikm); + +let sk = SecretKey::key_gen(&ikm, &[]).unwrap(); +let pk = sk.sk_to_pk(); + +let dst = b"BLS_SIG_BLS12381G2_XMD:SHA-256_SSWU_RO_NUL_"; +let msg = b"blst is such a blast"; +let sig = sk.sign(msg, dst, &[]); + +let err = sig.verify(msg, dst, &[], &pk); +assert_eq!(err, BLST_ERROR::BLST_SUCCESS); +``` + +See the tests in src/lib.rs and benchmarks in benches/blst_benches.rs for further examples of usage. diff --git a/bindings/rust/benches/blst_benches.rs b/bindings/rust/benches/blst_benches.rs new file mode 100644 index 00000000..3c2a5ea9 --- /dev/null +++ b/bindings/rust/benches/blst_benches.rs @@ -0,0 +1,426 @@ +// Copyright Supranational LLC +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +use blst::*; + +// Benchmark min_pk +use blst::min_pk::*; + +use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion}; +use rand::{RngCore, SeedableRng}; +use rand_chacha::ChaCha20Rng; + +struct BenchData { + sk: SecretKey, + pk: PublicKey, + msg: Vec, + dst: Vec, + sig: Signature, +} + +fn gen_bench_data(rng: &mut rand_chacha::ChaCha20Rng) -> BenchData { + let mut ikm = [0u8; 32]; + rng.fill_bytes(&mut ikm); + + let sk = SecretKey::key_gen(&ikm, &[]).unwrap(); + let pk = sk.sk_to_pk(); + let dst = "BLS_SIG_BLS12381G2_XMD:SHA-256_SSWU_RO_NUL_" + .as_bytes() + .to_owned(); + let msg_len = (rng.next_u64() & 0x3F) + 1; + let mut msg = vec![0u8; msg_len as usize]; + rng.fill_bytes(&mut msg); + + let sig = sk.sign(&msg, &dst, &[]); + + let bd = BenchData { + sk, + pk, + dst, + msg, + sig, + }; + bd +} + +fn bench_verify_multi_aggregate(c: &mut Criterion) { + let mut group = c.benchmark_group("verify_multi_aggregate"); + + let dst = b"BLS_SIG_BLS12381G2_XMD:SHA-256_SSWU_RO_POP_"; + let mut ikm = [0u8; 32]; + + let seed = [0u8; 32]; + let mut rng = ChaCha20Rng::from_seed(seed); + + let num_sigs = vec![8, 16, 32, 64, 128]; + let pks_per_sig = 3; + + for n in num_sigs.iter() { + let mut msgs: Vec> = vec![vec![]; *n]; + let mut sigs: Vec = Vec::with_capacity(*n); + let mut pks: Vec = Vec::with_capacity(*n); + let mut rands: Vec = Vec::with_capacity(*n); + + for i in 0..*n { + // Create public keys + rng.fill_bytes(&mut ikm); + let sks_i: Vec<_> = + (0..pks_per_sig) + .map(|_| { ikm[0] += 1; SecretKey::key_gen(&ikm, &[]).unwrap()}) + .collect(); + let pks_i = sks_i + .iter() + .map(|sk| sk.sk_to_pk()) + .collect::>(); + let pks_refs_i: Vec<&PublicKey> = + pks_i.iter().map(|pk| pk).collect(); + + // Create random message for pks to 
all sign + let msg_len = (rng.next_u64() & 0x3F) + 1; + msgs[i] = vec![0u8; msg_len as usize]; + rng.fill_bytes(&mut msgs[i]); + + // Generate signature for each key pair + let sigs_i = sks_i + .iter() + .map(|sk| sk.sign(&msgs[i], dst, &[])) + .collect::>(); + + // Aggregate signature + let sig_refs_i = + sigs_i.iter().map(|s| s).collect::>(); + let agg_i = AggregateSignature::aggregate(&sig_refs_i); + sigs.push(agg_i.to_signature()); + + // aggregate public keys and push into vec + let agg_pk_i = AggregatePublicKey::aggregate(&pks_refs_i); + pks.push(agg_pk_i.to_public_key()); + + // create random values + let mut vals = [0u64; 4]; + vals[0] = rng.next_u64(); + let mut rand_i = std::mem::MaybeUninit::::uninit(); + unsafe { + blst_scalar_from_uint64( + rand_i.as_mut_ptr(), + vals.as_ptr(), + ); + rands.push(rand_i.assume_init()); + } + } + + let msgs_refs: Vec<&[u8]> = + msgs.iter().map(|m| m.as_slice()).collect(); + let sig_refs = + sigs.iter().map(|s| s).collect::>(); + let pks_refs: Vec<&PublicKey> = + pks.iter().map(|pk| pk).collect(); + + let agg_ver = (sig_refs, pks_refs, msgs_refs, dst, rands); + + group.bench_with_input( + BenchmarkId::new("verify_multi_aggregate", n), + &agg_ver, + |b, (s, p, m, d, r)| { + b.iter(|| Signature::verify_multiple_aggregate_signatures(&m, *d, &p, &s, &r, 64)); + }, + ); + } + + group.finish(); +} + +fn bench_fast_aggregate_verify(c: &mut Criterion) { + let mut group = c.benchmark_group("fast_aggregate_verify"); + + let seed = [0u8; 32]; + let mut rng = ChaCha20Rng::from_seed(seed); + + let sizes = vec![8, 16, 32, 64, 128]; + + let bds: Vec<_> = (0..sizes[sizes.len() - 1]) + .map(|_| gen_bench_data(&mut rng)) + .collect(); + + for size in sizes.iter() { + let pks_refs = bds + .iter() + .take(*size) + .map(|s| &s.pk) + .collect::>(); + + let sig_refs = bds + .iter() + .take(*size) + .map(|s| &s.sig) + .collect::>(); + + let agg = AggregateSignature::aggregate(&sig_refs); + let agg_sig = agg.to_signature(); + + let agg_pks = AggregatePublicKey::aggregate(&pks_refs); + let agg_pk = agg_pks.to_public_key(); + + let agg_ver = (agg_sig, pks_refs, &bds[0].msg, &bds[0].dst); + let agg_pre_ver = (agg_sig, agg_pk, &bds[0].msg, &bds[0].dst); + + group.bench_with_input( + BenchmarkId::new("fast_aggregate_verify", size), + &agg_ver, + |b, (a, p, m, d)| { + b.iter(|| a.fast_aggregate_verify(&m, &d, &p)); + }, + ); + + group.bench_with_input( + BenchmarkId::new("fast_aggregate_verify_preagg", size), + &agg_pre_ver, + |b, (a, p, m, d)| { + b.iter(|| a.fast_aggregate_verify_pre_aggregated(&m, &d, &p)); + }, + ); + } + + group.finish(); +} + +fn bench_aggregate_verify(c: &mut Criterion) { + let mut group = c.benchmark_group("aggregate_verify"); + + let seed = [0u8; 32]; + let mut rng = ChaCha20Rng::from_seed(seed); + + let sizes = vec![8, 16, 32, 64, 128]; + // [10, 50, 100, 300, 1000, 4000]; + + let bds: Vec<_> = (0..sizes[sizes.len() - 1]) + .map(|_| gen_bench_data(&mut rng)) + .collect(); + + for size in sizes.iter() { + let msgs_refs = bds + .iter() + .take(*size) + .map(|s| s.msg.as_slice()) + .collect::>(); + + let pks_refs = bds + .iter() + .take(*size) + .map(|s| &s.pk) + .collect::>(); + + let sig_refs = bds + .iter() + .take(*size) + .map(|s| &s.sig) + .collect::>(); + + let agg = AggregateSignature::aggregate(&sig_refs); + let agg_sig = agg.to_signature(); + let agg_ver = (agg_sig, pks_refs, msgs_refs, &bds[0].dst); + + group.bench_with_input( + BenchmarkId::new("aggregate_verify", size), + &agg_ver, + |b, (a, p, m, d)| { + b.iter(|| a.aggregate_verify(&m, 
&d, &p)); + }, + ); + } + + group.finish(); +} + +fn bench_aggregate(c: &mut Criterion) { + let mut group = c.benchmark_group("aggregate"); + + let seed = [0u8; 32]; + let mut rng = ChaCha20Rng::from_seed(seed); + + let sizes: [usize; 6] = [10, 50, 100, 300, 1000, 4000]; + + let bds: Vec<_> = (0..4000).map(|_| gen_bench_data(&mut rng)).collect(); + + for size in sizes.iter() { + let sig_refs = bds + .iter() + .take(*size) + .map(|s| &s.sig) + .collect::>(); + + group.bench_with_input( + BenchmarkId::new("aggregate_signature", size), + &sig_refs, + |b, s| { + b.iter(|| AggregateSignature::aggregate(&s)); + }, + ); + + let pks_refs = bds + .iter() + .take(*size) + .map(|s| &s.pk) + .collect::>(); + + group.bench_with_input( + BenchmarkId::new("aggregate_public_key", size), + &pks_refs, + |b, p| { + b.iter(|| AggregatePublicKey::aggregate(&p)); + }, + ); + } + + group.finish(); +} + +fn bench_single_message(c: &mut Criterion) { + let mut group = c.benchmark_group("single_message"); + + let seed = [0u8; 32]; + let mut rng = ChaCha20Rng::from_seed(seed); + let bd = gen_bench_data(&mut rng); + + group.bench_function("sign", |b| { + b.iter(|| bd.sk.sign(&bd.msg, &bd.dst, &[])) + }); + + group.bench_function("verify", |b| { + b.iter(|| bd.sig.verify(&bd.msg, &bd.dst, &[], &bd.pk)) + }); + + group.finish(); +} + +fn bench_serdes(c: &mut Criterion) { + let mut group = c.benchmark_group("serdes"); + + let seed = [0u8; 32]; + let mut rng = ChaCha20Rng::from_seed(seed); + let bd = gen_bench_data(&mut rng); + + let sk = bd.sk; + let sk_ser = sk.serialize(); + + let pk = bd.pk; + let pk_comp = pk.compress(); + let pk_ser = pk.serialize(); + + let sig = bd.sig; + let sig_comp = sig.compress(); + let sig_ser = sig.serialize(); + + let mut pk_jac = std::mem::MaybeUninit::::uninit(); + let mut sig_jac = std::mem::MaybeUninit::::uninit(); + + let mut p1_comp = [0; 48]; + let mut p2_comp = [0; 96]; + let mut p1_ser = [0; 96]; + let mut p2_ser = [0; 192]; + + unsafe { + blst_p1_from_affine(pk_jac.as_mut_ptr(), &pk.point); + blst_p2_from_affine(sig_jac.as_mut_ptr(), &sig.point); + + blst_p1_double(pk_jac.as_mut_ptr(), pk_jac.as_ptr()); // Make Z != 1 + blst_p2_double(sig_jac.as_mut_ptr(), sig_jac.as_ptr()); // Make Z != 1 + + pk_jac.assume_init(); + sig_jac.assume_init(); + } + + group.bench_function("secret_key_serialize", |b| b.iter(|| sk.serialize())); + + group.bench_function("secret_key_deserialize", |b| { + b.iter(|| SecretKey::deserialize(&sk_ser)); + }); + + group.bench_function("public_key_serialize", |b| b.iter(|| pk.serialize())); + + group.bench_function("public_key_compress", |b| b.iter(|| pk.compress())); + + group.bench_function("public_key_uncompress", |b| { + b.iter(|| PublicKey::uncompress(&pk_comp)) + }); + + group.bench_function("public_key_deserialize", |b| { + b.iter(|| PublicKey::deserialize(&pk_ser)); + }); + + group.bench_function("signature_serialize", |b| b.iter(|| sig.serialize())); + + group.bench_function("signature_compress", |b| b.iter(|| sig.compress())); + + group.bench_function("signature_uncompress", |b| { + b.iter(|| Signature::uncompress(&sig_comp)) + }); + + group.bench_function("signature_deserialize", |b| { + b.iter(|| Signature::deserialize(&sig_ser)) + }); + + group.bench_function("p1_serialize", |b| { + b.iter(|| unsafe { + blst_p1_serialize(p1_ser.as_mut_ptr(), pk_jac.as_ptr()) + }) + }); + + group.bench_function("p1_compress", |b| { + b.iter(|| unsafe { + blst_p1_compress(p1_comp.as_mut_ptr(), pk_jac.as_ptr()) + }) + }); + + group.bench_function("p2_serialize", |b| { 
+ b.iter(|| unsafe { + blst_p2_serialize(p2_ser.as_mut_ptr(), sig_jac.as_ptr()) + }) + }); + + group.bench_function("p2_compress", |b| { + b.iter(|| unsafe { + blst_p2_compress(p2_comp.as_mut_ptr(), sig_jac.as_ptr()) + }) + }); + + group.finish(); +} + +fn bench_keys(c: &mut Criterion) { + let mut group = c.benchmark_group("keys"); + let ikm: [u8; 32] = [ + 0x93, 0xad, 0x7e, 0x65, 0xde, 0xad, 0x05, 0x2a, 0x08, 0x3a, 0x91, 0x0c, + 0x8b, 0x72, 0x85, 0x91, 0x46, 0x4c, 0xca, 0x56, 0x60, 0x5b, 0xb0, 0x56, + 0xed, 0xfe, 0x2b, 0x60, 0xa6, 0x3c, 0x48, 0x99, + ]; + let sk = SecretKey::key_gen(&ikm, &[]).unwrap(); + let pk = sk.sk_to_pk(); + let pk_comp = pk.compress(); + + group + .bench_function("key_gen", |b| b.iter(|| SecretKey::key_gen(&ikm, &[]))); + + group.bench_function("sk_to_pk", |b| { + b.iter(|| sk.sk_to_pk()); + }); + + group.bench_function("key_validate", |b| { + b.iter(|| PublicKey::key_validate(&pk_comp)); + }); + + group.finish(); +} + +criterion_group!( + benches, + bench_verify_multi_aggregate, + bench_fast_aggregate_verify, + bench_aggregate_verify, + bench_aggregate, + bench_single_message, + bench_serdes, + bench_keys +); +criterion_main!(benches); diff --git a/bindings/rust/build.rs b/bindings/rust/build.rs new file mode 100644 index 00000000..4d5ef1e8 --- /dev/null +++ b/bindings/rust/build.rs @@ -0,0 +1,65 @@ +extern crate bindgen; +extern crate cc; + +use std::env; +use std::path::PathBuf; +use std::path::Path; +use std::process::Command; + +fn main() { + // TODO - could ls directory and find all files + let asm_to_build = [ + "add_mod_256-x86_64", + "add_mod_384-x86_64", + "mulq_mont_256-x86_64", + "mulx_mont_256-x86_64", + "sha256-x86_64", + "add_mod_384x384-x86_64", + "inverse_mod_384-x86_64", + "mulq_mont_384-x86_64", + "mulx_mont_384-x86_64", + ]; + + let mut file_vec = Vec::new(); + + let out_dir = env::var_os("OUT_DIR").unwrap(); + + for a in asm_to_build.iter() { + let dest_path = Path::new(&out_dir).join(a).with_extension("s"); + let src_path = Path::new("../../src/asm/") + .join(a).with_extension("pl"); + + Command::new(&src_path) + .args(&[">", dest_path.to_str().unwrap()]) + .status().unwrap(); + + file_vec.push(dest_path); + } + + file_vec.push(Path::new("../../src/").join("server.c")); + + // Set CC environment variable to choose alternative C compiler. + // Optimization level depends on whether or not --release is passed + // or implied. If default "release" level of 3 is deemed unsuitable, + // modify 'opt-level' in [profile.release] in Cargo.toml. + cc::Build::new() + .flag("-march=native") + .flag_if_supported("-mno-avx") // avoid costly transitions + .flag_if_supported("-Wno-unused-command-line-argument") + .files(&file_vec) + .compile("libblst.a"); + + let bindings = bindgen::Builder::default() + .header("../blst.h") + .opaque_type("blst_pairing") + .size_t_is_usize(true) + .rustified_enum("BLST_ERROR") + .generate() + .expect("Unable to generate bindings"); + + // Write the bindings to the $OUT_DIR/bindings.rs file. 
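+    // (src/lib.rs pulls these in with
+    // `include!(concat!(env!("OUT_DIR"), "/bindings.rs"))`.)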
+ let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()); + bindings + .write_to_file(out_path.join("bindings.rs")) + .expect("Couldn't write bindings!"); +} diff --git a/bindings/rust/rustfmt.toml b/bindings/rust/rustfmt.toml new file mode 100644 index 00000000..df99c691 --- /dev/null +++ b/bindings/rust/rustfmt.toml @@ -0,0 +1 @@ +max_width = 80 diff --git a/bindings/rust/src/lib.rs b/bindings/rust/src/lib.rs new file mode 100644 index 00000000..dd861d7a --- /dev/null +++ b/bindings/rust/src/lib.rs @@ -0,0 +1,1378 @@ +// Copyright Supranational LLC +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +#![allow(non_upper_case_globals)] +#![allow(non_camel_case_types)] +#![allow(non_snake_case)] + +use std::any::Any; +use std::mem::{transmute, MaybeUninit}; +use std::sync::{atomic::*, mpsc::channel, Arc, Mutex, Once}; +use std::{ptr, slice}; +use threadpool::ThreadPool; + +fn da_pool() -> ThreadPool { + static INIT: Once = Once::new(); + static mut POOL: *const Mutex = 0 as *const Mutex; + + INIT.call_once(|| { + let pool = Mutex::new(ThreadPool::default()); + unsafe { POOL = transmute(Box::new(pool)) }; + }); + unsafe { (*POOL).lock().unwrap().clone() } +} + +include!(concat!(env!("OUT_DIR"), "/bindings.rs")); + +impl blst_fp12 { + pub fn new() -> Self { + unsafe { MaybeUninit::::uninit().assume_init() } + } +} + +#[derive(Debug)] +pub struct Pairing { + v: Box<[u64]>, +} + +impl Pairing { + pub fn new() -> Self { + let v: Vec = vec![0; unsafe { blst_pairing_sizeof() } / 8]; + let mut obj = Self { + v: v.into_boxed_slice(), + }; + obj.init(); + obj + } + + pub fn init(&mut self) { + unsafe { blst_pairing_init(self.ctx()) } + } + fn ctx(&mut self) -> *mut blst_pairing { + self.v.as_mut_ptr() as *mut blst_pairing + } + fn const_ctx(&self) -> *const blst_pairing { + self.v.as_ptr() as *const blst_pairing + } + + pub fn aggregate( + &mut self, + pk: &dyn Any, + sig: &dyn Any, + hash_or_encode: bool, + msg: &[u8], + dst: &[u8], + aug: &[u8], + ) -> BLST_ERROR { + if pk.is::() { + unsafe { + blst_pairing_aggregate_pk_in_g1( + self.ctx(), + match pk.downcast_ref::() { + Some(pk) => pk, + None => ptr::null(), + }, + match sig.downcast_ref::() { + Some(sig) => sig, + None => ptr::null(), + }, + hash_or_encode, + msg.as_ptr(), + msg.len(), + dst.as_ptr(), + dst.len(), + aug.as_ptr(), + aug.len(), + ) + } + } else if pk.is::() { + unsafe { + blst_pairing_aggregate_pk_in_g2( + self.ctx(), + match pk.downcast_ref::() { + Some(pk) => pk, + None => ptr::null(), + }, + match sig.downcast_ref::() { + Some(sig) => sig, + None => ptr::null(), + }, + hash_or_encode, + msg.as_ptr(), + msg.len(), + dst.as_ptr(), + dst.len(), + aug.as_ptr(), + aug.len(), + ) + } + } else { + panic!("whaaaa?") + } + } + + pub fn aggregated(gtsig: &mut blst_fp12, sig: &dyn Any) { + if sig.is::() { + unsafe { + blst_aggregated_in_g1( + gtsig, + sig.downcast_ref::().unwrap(), + ) + } + } else if sig.is::() { + unsafe { + blst_aggregated_in_g2( + gtsig, + sig.downcast_ref::().unwrap(), + ) + } + } else { + panic!("whaaaa?") + } + } + + pub fn commit(&mut self) { + unsafe { blst_pairing_commit(self.ctx()) } + } + + pub fn merge(&mut self, ctx1: &Self) -> BLST_ERROR { + unsafe { blst_pairing_merge(self.ctx(), ctx1.const_ctx()) } + } + + pub fn finalverify(&self, gtsig: &blst_fp12) -> bool { + unsafe { blst_pairing_finalverify(self.const_ctx(), gtsig) } + } +} + +// TODO - add group checks after deserialization + +pub fn print_bytes(bytes: &[u8], name: &str) { + 
print!("{} ", name); + for b in bytes.iter() { + print!("{:02x}", b); + } + println!(); +} + +macro_rules! miller_pk_in_p1 { + ( + $out:expr, + $p2:expr, + $p1:expr + ) => { + blst_miller_loop($out, $p2, $p1); + }; +} + +macro_rules! miller_const_pk_in_p1 { + ( + $out:expr, + $p:expr + ) => { + blst_miller_loop($out, $p, &BLS12_381_NEG_G1); + }; +} + +macro_rules! miller_pk_in_p2 { + ( + $out:expr, + $p2:expr, + $p1:expr + ) => { + blst_miller_loop($out, $p1, $p2); + }; +} + +macro_rules! miller_const_pk_in_p2 { + ( + $out:expr, + $p:expr + ) => { + blst_miller_loop($out, &BLS12_381_NEG_G2, $p); + }; +} + +macro_rules! sig_variant_impl { + ( + $name:expr, + $pk:ty, + $pk_aff:ty, + $sig:ty, + $sig_aff:ty, + $sk_to_pk:ident, + $hash_or_encode_to:ident, + $sign:ident, + $pk_eq:ident, + $sig_eq:ident, + $verify:ident, + $pk_in_group:ident, + $pk_to_aff:ident, + $pk_from_aff:ident, + $pk_ser:ident, + $pk_comp:ident, + $pk_deser:ident, + $pk_uncomp:ident, + $pk_comp_size:expr, + $pk_ser_size:expr, + $sig_in_group:ident, + $sig_to_aff:ident, + $sig_from_aff:ident, + $sig_ser:ident, + $sig_comp:ident, + $sig_deser:ident, + $sig_uncomp:ident, + $sig_comp_size:expr, + $sig_ser_size:expr, + $pk_add_or_dbl:ident, + $pk_add_or_dbl_aff:ident, + $sig_add_or_dbl:ident, + $sig_add_or_dbl_aff:ident, + $pk_mul:ident, + $sig_mul:ident, + $ml_mac:ident, + $ml_const_mac:ident + ) => { + use rayon::prelude::*; + + /// Secret Key + #[derive(Debug, Clone)] + pub struct SecretKey { + pub value: blst_scalar, + } + + impl SecretKey { + /// Deterministically generate a secret key from key material + pub fn key_gen( + ikm: &[u8], + key_info: &[u8], + ) -> Result { + if ikm.len() < 32 { + return Err(BLST_ERROR::BLST_BAD_ENCODING); + } + let mut sk = std::mem::MaybeUninit::::uninit(); + unsafe { + blst_keygen( + sk.as_mut_ptr(), + ikm.as_ptr(), + ikm.len(), + key_info.as_ptr(), + key_info.len(), + ); + Ok(Self { + value: sk.assume_init(), + }) + } + } + + // sk_to_pk + pub fn sk_to_pk(&self) -> PublicKey { + // TODO - would the user like the serialized/compressed pk as well? + let mut pk_aff = std::mem::MaybeUninit::<$pk_aff>::uninit(); + //let mut pk_ser = [0u8; $pk_ser_size]; + + unsafe { + $sk_to_pk( + //pk_ser.as_mut_ptr(), + std::ptr::null_mut(), + pk_aff.as_mut_ptr(), + &self.value, + ); + PublicKey { + point: pk_aff.assume_init(), + } + } + } + + // Sign + pub fn sign( + &self, + msg: &[u8], + dst: &[u8], + aug: &[u8], + ) -> Signature { + // TODO - would the user like the serialized/compressed sig as well? + let mut q = std::mem::MaybeUninit::<$sig>::uninit(); + let mut sig_aff = std::mem::MaybeUninit::<$sig_aff>::uninit(); + //let mut sig_ser = [0u8; $sig_ser_size]; + unsafe { + $hash_or_encode_to( + q.as_mut_ptr(), + msg.as_ptr(), + msg.len(), + dst.as_ptr(), + dst.len(), + aug.as_ptr(), + aug.len(), + ); + $sign( + std::ptr::null_mut(), + sig_aff.as_mut_ptr(), + q.as_ptr(), + &self.value, + ); + Signature { + point: sig_aff.assume_init(), + } + } + } + + // TODO - formally speaking application is entitled to have + // ultimate control over secret key storage, which means that + // corresponding serialization/deserialization subroutines + // should accept reference to where to store the result, as + // opposite to returning one. 
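+
+            // Round-trip sketch (mirrors the two functions below):
+            //     let bytes = sk.serialize();                // 32 big-endian bytes
+            //     let sk2 = SecretKey::deserialize(&bytes)?; // BLST_BAD_ENCODING if not a valid Fr scalar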
+ + // serialize + pub fn serialize(&self) -> [u8; 32] { + let mut sk_out = [0; 32]; + unsafe { + blst_bendian_from_scalar(sk_out.as_mut_ptr(), &self.value); + } + sk_out + } + + // deserialize + pub fn deserialize(sk_in: &[u8]) -> Result { + let mut sk = std::mem::MaybeUninit::::uninit(); + unsafe { + blst_scalar_from_bendian(sk.as_mut_ptr(), sk_in.as_ptr()); + if !blst_scalar_fr_check(sk.as_ptr()) { + return Err(BLST_ERROR::BLST_BAD_ENCODING); + } + Ok(Self { + value: sk.assume_init(), + }) + } + } + + pub fn to_bytes(&self) -> [u8; 32] { + SecretKey::serialize(&self) + } + + pub fn from_bytes(sk_in: &[u8]) -> Result { + Ok(SecretKey::deserialize(sk_in)?) // TODO - is this correct? + } + } + + #[derive(Debug, Clone, Copy)] + pub struct PublicKey { + pub point: $pk_aff, + } + + impl PublicKey { + // Core operations + + // key_validate + pub fn key_validate(key: &[u8]) -> Result { + let pk = PublicKey::from_bytes(key)?; + let err: bool; + unsafe { + err = $pk_in_group(&pk.point); + } + if err != true { + return Err(BLST_ERROR::BLST_POINT_NOT_IN_GROUP); + } + Ok(pk) + } + + pub fn from_aggregate(agg_pk: *const AggregatePublicKey) -> Self { + let mut pk_aff = std::mem::MaybeUninit::<$pk_aff>::uninit(); + unsafe { + $pk_to_aff(pk_aff.as_mut_ptr(), &(*agg_pk).point); + Self { + point: pk_aff.assume_init(), + } + } + } + + // Serdes + + pub fn compress(&self) -> [u8; $pk_comp_size] { + let mut pk = std::mem::MaybeUninit::<$pk>::uninit(); + let mut pk_comp = [0u8; $pk_comp_size]; // TODO - no need to initialize + unsafe { + $pk_from_aff(pk.as_mut_ptr(), &self.point); + $pk_comp(pk_comp.as_mut_ptr(), pk.as_ptr()); + } + pk_comp + } + + pub fn serialize(&self) -> [u8; $pk_ser_size] { + let mut pk = std::mem::MaybeUninit::<$pk>::uninit(); + let mut pk_out = [0u8; $pk_ser_size]; + unsafe { + $pk_from_aff(pk.as_mut_ptr(), &self.point); + $pk_ser(pk_out.as_mut_ptr(), pk.as_ptr()); + } + pk_out + } + + pub fn uncompress(pk_comp: &[u8]) -> Result { + let mut pk = std::mem::MaybeUninit::<$pk_aff>::uninit(); + + unsafe { + let err = $pk_uncomp(pk.as_mut_ptr(), pk_comp.as_ptr()); + if err != BLST_ERROR::BLST_SUCCESS { + return Err(err); + } + Ok(Self { + point: pk.assume_init(), + }) + } + } + + pub fn deserialize(pk_in: &[u8]) -> Result { + let mut pk = std::mem::MaybeUninit::<$pk_aff>::uninit(); + + unsafe { + let err = $pk_deser(pk.as_mut_ptr(), pk_in.as_ptr()); + if err != BLST_ERROR::BLST_SUCCESS { + return Err(err); + } + Ok(Self { + point: pk.assume_init(), + }) + } + } + + pub fn from_bytes(pk_in: &[u8]) -> Result { + if (pk_in[0] & 0x80) == 0 { + // Not compressed + let pk = PublicKey::deserialize(pk_in)?; + Ok(pk) + } else { + // compressed + let pk = PublicKey::uncompress(pk_in)?; + Ok(pk) + } + } + + pub fn to_bytes(&self) -> [u8; $pk_comp_size] { + self.compress() + } + } + + impl PartialEq for PublicKey { + fn eq(&self, other: &Self) -> bool { + unsafe { $pk_eq(&self.point, &other.point) } + } + } + + #[derive(Debug, Clone, Copy)] + pub struct AggregatePublicKey { + pub point: $pk, + } + + impl AggregatePublicKey { + pub fn from_public_key(pk: *const PublicKey) -> Self { + let mut agg_pk = std::mem::MaybeUninit::<$pk>::uninit(); + unsafe { + $pk_from_aff(agg_pk.as_mut_ptr(), &(*pk).point); + Self { + point: agg_pk.assume_init(), + } + } + } + + pub fn to_public_key(&self) -> PublicKey { + let mut pk = std::mem::MaybeUninit::<$pk_aff>::uninit(); + unsafe { + $pk_to_aff(pk.as_mut_ptr(), &self.point); + PublicKey { + point: pk.assume_init(), + } + } + } + + // Aggregate + pub fn aggregate(pks: 
&[&PublicKey]) -> Self { + // TODO - handle case of zero length array? What to return then? + unsafe { + let mut agg_pk = + AggregatePublicKey::from_public_key(pks[0]); + for s in pks.iter().skip(1) { + $pk_add_or_dbl_aff( + &mut agg_pk.point, + &agg_pk.point, + &s.point, + ); + } + agg_pk + } + } + + pub fn aggregate_serialized( + pks: &[&[u8]], + ) -> Result { + // TODO - handle case of zero length array? + // TODO - subgroup check + // TODO - threading + unsafe { + let mut pk = PublicKey::from_bytes(pks[0])?; + let mut agg_pk = AggregatePublicKey::from_public_key(&pk); + for s in pks.iter().skip(1) { + pk = PublicKey::from_bytes(s)?; + // TODO - does this need add_or_double? + $pk_add_or_dbl_aff( + &mut agg_pk.point, + &agg_pk.point, + &pk.point, + ); + } + Ok(agg_pk) + } + } + + pub fn add_aggregate(&mut self, agg_pk: *const AggregatePublicKey) { + unsafe { + $pk_add_or_dbl( + &mut self.point, + &self.point, + &(*agg_pk).point, + ); + } + } + + pub fn add_public_key(&mut self, pk: *const PublicKey) { + unsafe { + $pk_add_or_dbl_aff( + &mut self.point, + &self.point, + &(*pk).point, + ); + } + } + } + + #[derive(Debug, Clone, Copy)] + pub struct Signature { + pub point: $sig_aff, + } + + impl Signature { + pub fn verify( + &self, + msg: &[u8], + dst: &[u8], + aug: &[u8], + pk: *const PublicKey, + ) -> BLST_ERROR { + unsafe { + $verify( + &(*pk).point, + &self.point, + true, + msg.as_ptr(), + msg.len(), + dst.as_ptr(), + dst.len(), + aug.as_ptr(), + aug.len(), + ) + } + } + + pub fn aggregate_verify( + &self, + msgs: &[&[u8]], + dst: &[u8], + pks: &[&PublicKey], + ) -> BLST_ERROR { + let n_elems = pks.len(); + if msgs.len() != n_elems { + return BLST_ERROR::BLST_VERIFY_FAIL; + } + + // TODO - check msg uniqueness? + // TODO - since already in object form, any need to subgroup check? + + let pool = da_pool(); + let (tx, rx) = channel(); + let counter = Arc::new(AtomicUsize::new(0)); + let valid = Arc::new(AtomicBool::new(true)); + + // Bypass 'lifetime limitations by brute force. It works, + // because we explicitly join the threads... + let raw_pks = unsafe { + transmute::<*const &PublicKey, usize>(pks.as_ptr()) + }; + let raw_msgs = + unsafe { transmute::<*const &[u8], usize>(msgs.as_ptr()) }; + let dst = + unsafe { slice::from_raw_parts(dst.as_ptr(), dst.len()) }; + + let n_workers = std::cmp::min(pool.max_count(), n_elems); + for _ in 0..n_workers { + let tx = tx.clone(); + let counter = counter.clone(); + let valid = valid.clone(); + + pool.execute(move || { + let mut pairing = Pairing::new(); + // reconstruct input slices... 
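+                    // (The usize values smuggle the slice pointers past the
+                    // 'static bound on pool.execute(); the length travels as
+                    // n_elems. This is sound only because the parent blocks
+                    // on rx.recv() for every worker before pks/msgs can drop.)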
+ let msgs = unsafe { + slice::from_raw_parts( + transmute::(raw_msgs), + n_elems, + ) + }; + let pks = unsafe { + slice::from_raw_parts( + transmute::(raw_pks), + n_elems, + ) + }; + + while valid.load(Ordering::Relaxed) { + let work = counter.fetch_add(1, Ordering::Relaxed); + if work >= n_elems { + break; + } + if pairing.aggregate( + &pks[work].point, + &unsafe { ptr::null::<$sig_aff>().as_ref() }, + true, + &msgs[work], + &dst, + &[], + ) != BLST_ERROR::BLST_SUCCESS + { + valid.store(false, Ordering::Relaxed); + break; + } + } + if valid.load(Ordering::Relaxed) { + pairing.commit(); + } + tx.send(pairing).expect("disaster"); + }); + } + + let mut gtsig = blst_fp12::new(); + if valid.load(Ordering::Relaxed) { + Pairing::aggregated(&mut gtsig, &self.point); + } + + let mut acc = rx.recv().unwrap(); + for _ in 1..n_workers { + acc.merge(&rx.recv().unwrap()); + } + + if valid.load(Ordering::Relaxed) && acc.finalverify(>sig) { + BLST_ERROR::BLST_SUCCESS + } else { + BLST_ERROR::BLST_VERIFY_FAIL + } + } + + pub fn fast_aggregate_verify( + &self, + msg: &[u8], + dst: &[u8], + pks: &[&PublicKey], + ) -> BLST_ERROR { + let agg_pk = AggregatePublicKey::aggregate(pks); + let pk = agg_pk.to_public_key(); + unsafe { + $verify( + &pk.point, + &self.point, + true, + msg.as_ptr(), + msg.len(), + dst.as_ptr(), + dst.len(), + [].as_ptr(), + 0, + ) + } + } + + pub fn fast_aggregate_verify_pre_aggregated( + &self, + msg: &[u8], + dst: &[u8], + pk: &PublicKey, + ) -> BLST_ERROR { + unsafe { + $verify( + &pk.point, + &self.point, + true, + msg.as_ptr(), + msg.len(), + dst.as_ptr(), + dst.len(), + [].as_ptr(), + 0, + ) + } + } + + // https://ethresear.ch/t/fast-verification-of-multiple-bls-signatures/5407 + pub fn verify_multiple_aggregate_signatures( + msgs: &[&[u8]], + dst: &[u8], + pks: &[&PublicKey], + sigs: &[&Signature], + rands: &[blst_scalar], + rand_bits: usize, + ) -> BLST_ERROR { + let mut c1 = std::mem::MaybeUninit::::uninit(); + let mut cur_ml = std::mem::MaybeUninit::::uninit(); + unsafe { + let mut agg_sig = + std::mem::MaybeUninit::<$sig>::zeroed().assume_init(); + let mut agg_sig_aff = + std::mem::MaybeUninit::<$sig_aff>::zeroed() + .assume_init(); + let c1s = pks + .par_iter() + .zip(msgs.par_iter()) + .zip(sigs.par_iter()) + .zip(rands.par_iter()) + .map(|(((p, m), s), r)| { + let mut q_i = + std::mem::MaybeUninit::<$sig>::uninit(); + let mut q_i_aff = + std::mem::MaybeUninit::<$sig_aff>::uninit(); + let mut c1_pi = + std::mem::MaybeUninit::::uninit(); + let mut r_pk = + std::mem::MaybeUninit::<$pk>::uninit(); + let mut r_pk_aff = + std::mem::MaybeUninit::<$pk_aff>::uninit(); + let mut r_sig = + std::mem::MaybeUninit::<$sig>::uninit(); + let mut s_jac = + std::mem::MaybeUninit::<$sig>::uninit(); + let mut p_jac = + std::mem::MaybeUninit::<$pk>::uninit(); + let mut agg_sig_i = + std::mem::MaybeUninit::<$sig>::zeroed() + .assume_init(); + + $hash_or_encode_to( + q_i.as_mut_ptr(), + m.as_ptr(), + m.len(), + dst.as_ptr(), + dst.len(), + [].as_ptr(), + 0, + ); + $sig_to_aff(q_i_aff.as_mut_ptr(), q_i.as_ptr()); + $pk_from_aff(p_jac.as_mut_ptr(), &p.point); + $pk_mul( + r_pk.as_mut_ptr(), + p_jac.as_ptr(), + r, + rand_bits, + ); + $pk_to_aff(r_pk_aff.as_mut_ptr(), r_pk.as_ptr()); + $sig_from_aff(s_jac.as_mut_ptr(), &s.point); + $sig_mul( + r_sig.as_mut_ptr(), + s_jac.as_ptr(), + r, + rand_bits, + ); + + $sig_add_or_dbl( + &mut agg_sig_i, + &agg_sig_i, + r_sig.as_ptr(), + ); + + $ml_mac!( + c1_pi.as_mut_ptr(), + q_i_aff.as_ptr(), + r_pk_aff.as_ptr() + ); + (c1_pi.assume_init(), agg_sig_i) + }) + 
.collect::>(); + + blst_fp12_mul(c1.as_mut_ptr(), &c1s[0].0, &c1s[1].0); + $sig_add_or_dbl(&mut agg_sig, &c1s[0].1, &c1s[1].1); + for (c, agg) in c1s.iter().skip(2) { + blst_fp12_mul(c1.as_mut_ptr(), c1.as_ptr(), c); + $sig_add_or_dbl(&mut agg_sig, &agg_sig, agg); + } + $sig_to_aff(&mut agg_sig_aff, &agg_sig); + $ml_const_mac!(cur_ml.as_mut_ptr(), &agg_sig_aff); + blst_fp12_mul( + c1.as_mut_ptr(), + c1.as_ptr(), + cur_ml.as_ptr(), + ); + blst_fp12_conjugate(c1.as_mut_ptr()); + blst_final_exp(c1.as_mut_ptr(), c1.as_ptr()); + let result = blst_fp12_is_one(c1.as_ptr()); + if result == false { + return BLST_ERROR::BLST_VERIFY_FAIL; + } + BLST_ERROR::BLST_SUCCESS + } + } + + pub fn from_aggregate(agg_sig: *const AggregateSignature) -> Self { + let mut sig_aff = std::mem::MaybeUninit::<$sig_aff>::uninit(); + unsafe { + $sig_to_aff(sig_aff.as_mut_ptr(), &(*agg_sig).point); + Self { + point: sig_aff.assume_init(), + } + } + } + + pub fn compress(&self) -> [u8; $sig_comp_size] { + let mut sig = std::mem::MaybeUninit::<$sig>::uninit(); + let mut sig_comp = [0; $sig_comp_size]; + unsafe { + $sig_from_aff(sig.as_mut_ptr(), &self.point); + $sig_comp(sig_comp.as_mut_ptr(), sig.as_ptr()); + //$sig_comp(sig_comp.as_mut_ptr(), &self.point); + } + sig_comp + } + + pub fn serialize(&self) -> [u8; $sig_ser_size] { + let mut sig = std::mem::MaybeUninit::<$sig>::uninit(); + let mut sig_out = [0; $sig_ser_size]; + unsafe { + $sig_from_aff(sig.as_mut_ptr(), &self.point); + $sig_ser(sig_out.as_mut_ptr(), sig.as_ptr()); + //$sig_ser(sig_out.as_mut_ptr(), &self.point); + } + sig_out + } + + pub fn uncompress(sig_comp: &[u8]) -> Result { + let mut sig = std::mem::MaybeUninit::<$sig_aff>::uninit(); + + unsafe { + let err = $sig_uncomp(sig.as_mut_ptr(), sig_comp.as_ptr()); + if err != BLST_ERROR::BLST_SUCCESS { + return Err(err); + } + Ok(Self { + point: sig.assume_init(), + }) + } + } + + pub fn deserialize(sig_in: &[u8]) -> Result { + let mut sig = std::mem::MaybeUninit::<$sig_aff>::uninit(); + + unsafe { + let err = $sig_deser(sig.as_mut_ptr(), sig_in.as_ptr()); + if err != BLST_ERROR::BLST_SUCCESS { + return Err(err); + } + Ok(Self { + point: sig.assume_init(), + }) + } + } + + pub fn from_bytes(sig_in: &[u8]) -> Result { + if (sig_in[0] & 0x80) == 0 { + // Not compressed + let sig = Signature::deserialize(sig_in)?; + Ok(sig) + } else { + // compressed + let sig = Signature::uncompress(sig_in)?; + Ok(sig) + } + } + + pub fn to_bytes(&self) -> [u8; $sig_comp_size] { + self.compress() + } + } + + impl PartialEq for Signature { + fn eq(&self, other: &Self) -> bool { + unsafe { $sig_eq(&self.point, &other.point) } + } + } + + #[derive(Debug, Clone, Copy)] + pub struct AggregateSignature { + pub point: $sig, + } + + impl AggregateSignature { + pub fn from_signature(sig: *const Signature) -> Self { + let mut agg_sig = std::mem::MaybeUninit::<$sig>::uninit(); + unsafe { + $sig_from_aff(agg_sig.as_mut_ptr(), &(*sig).point); + Self { + point: agg_sig.assume_init(), + } + } + } + + pub fn to_signature(&self) -> Signature { + let mut sig = std::mem::MaybeUninit::<$sig_aff>::uninit(); + unsafe { + $sig_to_aff(sig.as_mut_ptr(), &self.point); + Signature { + point: sig.assume_init(), + } + } + } + + // Aggregate + pub fn aggregate(sigs: &[&Signature]) -> Self { + // TODO - handle case of zero length array? + unsafe { + let mut agg_sig = + AggregateSignature::from_signature(sigs[0]); + for s in sigs.iter().skip(1) { + // TODO - does this need add_or_double? 
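+                        // (add_or_double also covers the P + P doubling case,
+                        // so repeated signatures in the input remain correct.
+                        // Usage sketch, mirroring the tests below:
+                        //     let refs: Vec<&Signature> = sigs.iter().collect();
+                        //     let agg = AggregateSignature::aggregate(&refs);
+                        //     let sig = agg.to_signature();)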
+ $sig_add_or_dbl_aff( + &mut agg_sig.point, + &agg_sig.point, + &s.point, + ); + } + agg_sig + } + } + + pub fn aggregate_serialized( + sigs: &[&[u8]], + ) -> Result { + // TODO - handle case of zero length array? + // TODO - subgroup check + // TODO - threading + unsafe { + let mut sig = Signature::from_bytes(sigs[0])?; + let mut agg_sig = AggregateSignature::from_signature(&sig); + for s in sigs.iter().skip(1) { + sig = Signature::from_bytes(s)?; + // TODO - does this need add_or_double? + $sig_add_or_dbl_aff( + &mut agg_sig.point, + &agg_sig.point, + &sig.point, + ); + } + Ok(agg_sig) + } + } + + pub fn add_aggregate( + &mut self, + agg_sig: *const AggregateSignature, + ) { + unsafe { + $sig_add_or_dbl( + &mut self.point, + &self.point, + &(*agg_sig).point, + ); + } + } + + pub fn add_signature(&mut self, sig: *const Signature) { + unsafe { + $sig_add_or_dbl_aff( + &mut self.point, + &self.point, + &(*sig).point, + ); + } + } + } + + #[cfg(test)] + mod tests { + use super::*; + use rand::{RngCore, SeedableRng}; + use rand_chacha::ChaCha20Rng; + + // Testing only - do not use for production + pub fn gen_random_key( + rng: &mut rand_chacha::ChaCha20Rng, + ) -> SecretKey { + let mut ikm = [0u8; 32]; + rng.fill_bytes(&mut ikm); + + let mut sk = std::mem::MaybeUninit::::uninit(); + unsafe { + blst_keygen( + sk.as_mut_ptr(), + ikm.as_ptr(), + 32, + std::ptr::null(), + 0, + ); + SecretKey { + value: sk.assume_init(), + } + } + } + + #[test] + fn test_sign() { + let ikm: [u8; 32] = [ + 0x93, 0xad, 0x7e, 0x65, 0xde, 0xad, 0x05, 0x2a, 0x08, 0x3a, + 0x91, 0x0c, 0x8b, 0x72, 0x85, 0x91, 0x46, 0x4c, 0xca, 0x56, + 0x60, 0x5b, 0xb0, 0x56, 0xed, 0xfe, 0x2b, 0x60, 0xa6, 0x3c, + 0x48, 0x99, + ]; + + let sk = SecretKey::key_gen(&ikm, &[]).unwrap(); + let pk = sk.sk_to_pk(); + + let dst = b"BLS_SIG_BLS12381G2_XMD:SHA-256_SSWU_RO_NUL_"; + let msg = b"hello foo"; + let sig = sk.sign(msg, dst, &[]); + + let err = sig.verify(msg, dst, &[], &pk); + assert_eq!(err, BLST_ERROR::BLST_SUCCESS); + } + + #[test] + fn test_aggregate() { + let num_msgs = 10; + let dst = b"BLS_SIG_BLS12381G2_XMD:SHA-256_SSWU_RO_NUL_"; + + let seed = [0u8; 32]; + let mut rng = ChaCha20Rng::from_seed(seed); + + let sks: Vec<_> = + (0..num_msgs).map(|_| gen_random_key(&mut rng)).collect(); + let pks = + sks.iter().map(|sk| sk.sk_to_pk()).collect::>(); + let pks_refs: Vec<&PublicKey> = + pks.iter().map(|pk| pk).collect(); + let pks_rev: Vec<&PublicKey> = + pks.iter().rev().map(|pk| pk).collect(); + + let pk_comp = pks[0].compress(); + let pk_uncomp = PublicKey::uncompress(&pk_comp); + assert_eq!(pk_uncomp.is_ok(), true); + + let mut msgs: Vec> = vec![vec![]; num_msgs]; + for i in 0..num_msgs { + let msg_len = (rng.next_u64() & 0x3F) + 1; + msgs[i] = vec![0u8; msg_len as usize]; + rng.fill_bytes(&mut msgs[i]); + } + + let msgs_refs: Vec<&[u8]> = + msgs.iter().map(|m| m.as_slice()).collect(); + + let sigs = sks + .iter() + .zip(msgs.iter()) + .map(|(sk, m)| (sk.sign(m, dst, &[]))) + .collect::>(); + + let mut errs = sigs + .iter() + .zip(msgs.iter()) + .zip(pks.iter()) + .map(|((s, m), pk)| (s.verify(m, dst, &[], pk))) + .collect::>(); + assert_eq!(errs, vec![BLST_ERROR::BLST_SUCCESS; num_msgs]); + + // Swap message/public key pairs to create bad signature + errs = sigs + .iter() + .zip(msgs.iter()) + .zip(pks.iter().rev()) + .map(|((s, m), pk)| (s.verify(m, dst, &[], pk))) + .collect::>(); + assert_ne!(errs, vec![BLST_ERROR::BLST_SUCCESS; num_msgs]); + + let sig_refs = + sigs.iter().map(|s| s).collect::>(); + let agg = 
AggregateSignature::aggregate(&sig_refs); + + let agg_sig = agg.to_signature(); + let mut result = + agg_sig.aggregate_verify(&msgs_refs, dst, &pks_refs); + assert_eq!(result, BLST_ERROR::BLST_SUCCESS); + + // Swap message/public key pairs to create bad signature + result = agg_sig.aggregate_verify(&msgs_refs, dst, &pks_rev); + assert_ne!(result, BLST_ERROR::BLST_SUCCESS); + } + + #[test] + fn test_multiple_agg_sigs() { + let dst = b"BLS_SIG_BLS12381G2_XMD:SHA-256_SSWU_RO_POP_"; + let num_pks_per_sig = 10; + let num_sigs = 10; + + let seed = [0u8; 32]; + let mut rng = ChaCha20Rng::from_seed(seed); + + let mut msgs: Vec> = vec![vec![]; num_sigs]; + let mut sigs: Vec = Vec::with_capacity(num_sigs); + let mut pks: Vec = Vec::with_capacity(num_sigs); + let mut rands: Vec = Vec::with_capacity(num_sigs); + for i in 0..num_sigs { + // Create public keys + let sks_i: Vec<_> = (0..num_pks_per_sig) + .map(|_| gen_random_key(&mut rng)) + .collect(); + + let pks_i = sks_i + .iter() + .map(|sk| sk.sk_to_pk()) + .collect::>(); + let pks_refs_i: Vec<&PublicKey> = + pks_i.iter().map(|pk| pk).collect(); + + // Create random message for pks to all sign + let msg_len = (rng.next_u64() & 0x3F) + 1; + msgs[i] = vec![0u8; msg_len as usize]; + rng.fill_bytes(&mut msgs[i]); + + // Generate signature for each key pair + let sigs_i = sks_i + .iter() + .map(|sk| sk.sign(&msgs[i], dst, &[])) + .collect::>(); + + // Test each current single signature + let errs = sigs_i + .iter() + .zip(pks_i.iter()) + .map(|(s, pk)| (s.verify(&msgs[i], dst, &[], pk))) + .collect::>(); + assert_eq!( + errs, + vec![BLST_ERROR::BLST_SUCCESS; num_pks_per_sig] + ); + + let sig_refs_i = + sigs_i.iter().map(|s| s).collect::>(); + let agg_i = AggregateSignature::aggregate(&sig_refs_i); + + // Test current aggregate signature + sigs.push(agg_i.to_signature()); + let mut result = sigs[i].fast_aggregate_verify( + &msgs[i], + dst, + &pks_refs_i, + ); + assert_eq!(result, BLST_ERROR::BLST_SUCCESS); + + // negative test + if i != 0 { + result = sigs[i - 1].fast_aggregate_verify( + &msgs[i], + dst, + &pks_refs_i, + ); + assert_ne!(result, BLST_ERROR::BLST_SUCCESS); + } + + // aggregate public keys and push into vec + let agg_pk_i = AggregatePublicKey::aggregate(&pks_refs_i); + pks.push(agg_pk_i.to_public_key()); + + // Test current aggregate signature with aggregated pks + result = sigs[i].fast_aggregate_verify_pre_aggregated( + &msgs[i], dst, &pks[i], + ); + assert_eq!(result, BLST_ERROR::BLST_SUCCESS); + + // negative test + if i != 0 { + result = sigs[i - 1] + .fast_aggregate_verify_pre_aggregated( + &msgs[i], dst, &pks[i], + ); + assert_ne!(result, BLST_ERROR::BLST_SUCCESS); + } + + // create random values + let mut vals = [0u64; 4]; + vals[0] = rng.next_u64(); + let mut rand_i = + std::mem::MaybeUninit::::uninit(); + unsafe { + blst_scalar_from_uint64( + rand_i.as_mut_ptr(), + vals.as_ptr(), + ); + rands.push(rand_i.assume_init()); + } + } + + let msgs_refs: Vec<&[u8]> = + msgs.iter().map(|m| m.as_slice()).collect(); + let sig_refs = + sigs.iter().map(|s| s).collect::>(); + let pks_refs: Vec<&PublicKey> = + pks.iter().map(|pk| pk).collect(); + + let msgs_rev: Vec<&[u8]> = + msgs.iter().rev().map(|m| m.as_slice()).collect(); + let sig_rev = + sigs.iter().rev().map(|s| s).collect::>(); + let pks_rev: Vec<&PublicKey> = + pks.iter().rev().map(|pk| pk).collect(); + + let mut result = + Signature::verify_multiple_aggregate_signatures( + &msgs_refs, dst, &pks_refs, &sig_refs, &rands, 64, + ); + assert_eq!(result, BLST_ERROR::BLST_SUCCESS); + + // 
negative tests (use reverse msgs, pks, and sigs) + result = Signature::verify_multiple_aggregate_signatures( + &msgs_rev, dst, &pks_refs, &sig_refs, &rands, 64, + ); + assert_ne!(result, BLST_ERROR::BLST_SUCCESS); + + result = Signature::verify_multiple_aggregate_signatures( + &msgs_refs, dst, &pks_rev, &sig_refs, &rands, 64, + ); + assert_ne!(result, BLST_ERROR::BLST_SUCCESS); + + result = Signature::verify_multiple_aggregate_signatures( + &msgs_refs, dst, &pks_refs, &sig_rev, &rands, 64, + ); + assert_ne!(result, BLST_ERROR::BLST_SUCCESS); + } + + #[test] + fn test_serialization() { + let seed = [0u8; 32]; + let mut rng = ChaCha20Rng::from_seed(seed); + + let sk = gen_random_key(&mut rng); + let sk2 = gen_random_key(&mut rng); + + let pk = sk.sk_to_pk(); + let pk_comp = pk.compress(); + let pk_ser = pk.serialize(); + + let pk_uncomp = PublicKey::uncompress(&pk_comp); + assert_eq!(pk_uncomp.is_ok(), true); + assert_eq!(pk_uncomp.unwrap(), pk); + + let pk_deser = PublicKey::deserialize(&pk_ser); + assert_eq!(pk_deser.is_ok(), true); + assert_eq!(pk_deser.unwrap(), pk); + + let pk2 = sk2.sk_to_pk(); + let pk_comp2 = pk2.compress(); + let pk_ser2 = pk2.serialize(); + + let pk_uncomp2 = PublicKey::uncompress(&pk_comp2); + assert_eq!(pk_uncomp2.is_ok(), true); + assert_eq!(pk_uncomp2.unwrap(), pk2); + + let pk_deser2 = PublicKey::deserialize(&pk_ser2); + assert_eq!(pk_deser2.is_ok(), true); + assert_eq!(pk_deser2.unwrap(), pk2); + + assert_ne!(pk, pk2); + assert_ne!(pk_uncomp.unwrap(), pk2); + assert_ne!(pk_deser.unwrap(), pk2); + assert_ne!(pk_uncomp2.unwrap(), pk); + assert_ne!(pk_deser2.unwrap(), pk); + } + } + }; +} + +pub mod min_pk { + use super::*; + + sig_variant_impl!( + "MinPk", + blst_p1, + blst_p1_affine, + blst_p2, + blst_p2_affine, + blst_sk_to_pk2_in_g1, + blst_hash_to_g2, + blst_sign_pk2_in_g1, + blst_p1_affine_is_equal, + blst_p2_affine_is_equal, + blst_core_verify_pk_in_g1, + blst_p1_affine_in_g1, + blst_p1_to_affine, + blst_p1_from_affine, + blst_p1_serialize, + blst_p1_compress, + blst_p1_deserialize, + blst_p1_uncompress, + 48, + 96, + blst_p2_affine_in_g2, + blst_p2_to_affine, + blst_p2_from_affine, + blst_p2_serialize, + blst_p2_compress, + blst_p2_deserialize, + blst_p2_uncompress, + 96, + 192, + blst_p1_add_or_double, + blst_p1_add_or_double_affine, + blst_p2_add_or_double, + blst_p2_add_or_double_affine, + blst_p1_mult_w5, + blst_p2_mult_w5, + miller_pk_in_p1, + miller_const_pk_in_p1 + ); +} + +pub mod min_sig { + use super::*; + + sig_variant_impl!( + "MinSig", + blst_p2, + blst_p2_affine, + blst_p1, + blst_p1_affine, + blst_sk_to_pk2_in_g2, + blst_hash_to_g1, + blst_sign_pk2_in_g2, + blst_p2_affine_is_equal, + blst_p1_affine_is_equal, + blst_core_verify_pk_in_g2, + blst_p2_affine_in_g2, + blst_p2_to_affine, + blst_p2_from_affine, + blst_p2_serialize, + blst_p2_compress, + blst_p2_deserialize, + blst_p2_uncompress, + 96, + 192, + blst_p1_affine_in_g1, + blst_p1_to_affine, + blst_p1_from_affine, + blst_p1_serialize, + blst_p1_compress, + blst_p1_deserialize, + blst_p1_uncompress, + 48, + 96, + blst_p2_add_or_double, + blst_p2_add_or_double_affine, + blst_p1_add_or_double, + blst_p1_add_or_double_affine, + blst_p2_mult_w5, + blst_p1_mult_w5, + miller_pk_in_p2, + miller_const_pk_in_p2 + ); +} diff --git a/blst_logo_small.png b/blst_logo_small.png new file mode 100644 index 0000000000000000000000000000000000000000..8570d9ff650c799d50d3acdac9f7f6763f3d7dda GIT binary patch literal 10176 zcma)i2UHWt);|_NL0}gHNE1X+N)k$F3C&QK4xxokfY3uA)F4GcWa&kEl`2K)y(+y1 
z5h;R*(mT=#0{+9k-S@rkoOizSpL24ODfiB8Gn1L$4b#?CrlDe?A|oTCQTamwK}L23 zc=@?OL2AMKM|+aKm^~Dc9=cdt4{sC>O$N8YTA?{rFep1T0*$i4yY-``$jGh*JLn-j zkQ(YRYb*wgx*P-c!MKvB$;hN+eOyu2PG}EKE3}=1i!|3xb2}HOgN-zop{RzShO0c< z-r)~l99q{`Q_tGh$r@_IB`d=zrDUe+}_) zlIHrCP)H4JPI)X2%_#;J1X&9T3Ui7>!9rpp5HUd@r?8-~pn#yTfDi;EC@{~rztXm@LzgR6%F)`jzuBgzWv=^@QUlJs9HU|cmc{>|9M{cl5&OeWxiaupB) z3kqN`mwNp*+T8^>u7g9yenD&fp*7w;;c#ju)Y5eG0ArS_e7TiNz%Zya1JD= zpqv%3)}9!&i-(GWG#9A{Y~x@9vq6grL#!YoAZu|^C`iN>jRIMTiHd_zqC(c9)=(=U zVIhfs`206|K?Ol25it>x+29Z%Aw{^Hgpin`u&{`@1XMvmTp0cjt%{4g2g=17{m~&Hl8}%P5fKy>Lko*p zaS4z_6u69>|A?f2nn>Y!`SaiFf>inU%0j!4mJ^P&ZpII1Z-kJ?X5Rc_ zYfbDe@4qE9(ZgaC{(u{;LR9AAOVravBq6xe?p4`FqV4aadC~nJ6RLF8z3%ZU^5npc z?gyiKfIGh~pvN!kA(KX5kNuO}3H#>(GLx^r>_AqaD>5sM1W$|O+DMi=cLd3v!hTQ{ zl96d6$;fzsWRFo-$X-?aRejoV_21PHa|q&@yMRS^6}EP*;;4#a24% z5kmDvZv$R*vDrl~S!xu|V}E}wN%o22e}w!;ze}0_myk{GuBu--zNL-Sw{wmS4*tSs zqf(eB2Bcqn!KqAPl(6;~CCaYfd*Ty2XN16^w>T*ULA9tI&J7BvVBcGK9gVQ;SWt%} z-^-v3R-0kCwT0@rxZX?spiRhXyCF5sD|R831z@0?N^ip{dNjbP3=vIS`Lp4Hx;s=i zm_%lH9XiJR%Q6;}nl(_R6N|~%r4Gs%Ie1l(HgpeO2wmt^aH1{=Ig6h22;dh$&JSJ% z806sv`@}Qch?QPwU%uWW_=I=QQgc)K8XR?aZH@*P11S?f!l`Sakz9Q!!ig-z9QL%s z@xfRy+hQdD!;KA6&y;K*72zZYo@M2$jXduX2s-Owp5yIS}IimO~rEVn88f2)XSe z^2Sc)Iq^PKb*)KcPyD;DA_%=+VQC)VA1My~jJ|2zLFzWXO)pe0fLoCqfaK>-JA(Sq zaIja8?Cr&0DOu6`Neyi3?i#V#LL{0LrtzM%?_cur8V%W%a~`Zs8&Fwj-{~Y93+@QQ zH$wWt2q!P#b=JPHo{6OA!ue3?K9D*c6?N5U^!Ij>VXGMJe5VO=%R3+W4wtnr#Un~N z1axeLCvCFGJ^9=jFOnhhQrMi0Bm>~J5dtDLoE+SewdTO8>U@y~2{4SkopKkzm|*SI zL#GnAqbw|qkx$yBCo~M4wY(!)^_7%iYI|l`ndA=IS80d>?&Z-F$gn7PIgh;Zv&a$X z0+sun&V$I+#$!3fs;fR%LddxrCeIRL>q&BqL7YEnxmw$uQNms9NZu3lYhz#JAT;n% z0=o}xF?%{6d7%=Y2VB2YH7Kw25)0WhRp2q*2E8_+zSPU+CMRfq{k(E7n!r=I!o6s(g;g4ga#udK1u%NV%>=!n+YtwB@7}XJ+vJ>tkYZz1M)eW?3-bt7>?RYVpzJFH2Tc#G5(*yUGj#Uctl-t= zZ0~+m1d&&5{i{@BFGYwx_hPlSKzqpP;@y%51KAn7{y}fPgpPO^AWr6j*(M*k3kIIi z0AF;J$VeiquH|RGiaB`SNz5gpXd{dRlhpe8n(uQTnR1x^`U%_}T@(MI2r#H+wiEct z^Qr&%jK@j*?A|cnM!J4cS+Na+Sc`C9BTc(kh^u<^h#Y^3h5r?U$0&WKM1VP1^eE0A z(;rx=9?XICjM5a>)bp)B+nmlS)d_@{JcT0f|>bUCRX3010>?D~UmKuMg z7?ckH5j_I1Ve)lhhaYxm_x$brf*aIbU>wwYckRx4m+RF_C4HHBwNH{(>5&f>+_RlN zQst>oZDhk`Fe+M&K=a#+`J9u+5XliX_LX}Y29l+Pua4|qX1IwKWKO-?nC0mkhilf_ zC`&{<$wT~3ty0JHAufqo5m;io7sW?W+{CYt#{A{{Z%eAMP<%Jlz(L^b9VcNr| z0Y1sQS5r@I1HPWe(Ne5f7hPcN=xBCa zn}D)PCUZhye-d^?e2;fZyhKBS)@9Sx^B{m=4_ z72SPaHe4UF`hFsKCUfe$o)tQ|&LSPllFYGFMDf)QL}c{tq5IRod-n6gwx{9dC8v>c zEH#j`$)1Z6R%cW}a+{!gVgxrawf}Lk3i+sE2uif3CA1Ph; z-J%C=)FBk9ODzBf)U7$uk%RAALLaA|O`G{*qBu_Uf&H0k7U}VYE7+$Uh@u`E!f}6z zuWr;%`dQQ?B>it0$B~X<<=UCihlMsOQ%@E*BzR!=A2A_qrfJ)_!Jze&Ki({tnorte z+{Eu)HJ0pPg)KayAga-nD__*L<~q%*C2{z=2h@Patt;Dod5=;<5^df0**`gq(pmb^ zV9PUVk71K};9j-KsSZT+mum#?bN;?~(H1Sf;Qe0uksO(^T3Qt(a$G)#W+xn56N;0m zw~fuFYy|fCiZ;oB{9DUL!%t)zdX6y`YF?CMeOG?%u$U?H`)}rQVuy(t=7LX@OV@MT zF1n8zLlQ534}5e=vG*0X2P*o#YfrSXd-|}Yr!hKp;^M}t3lbS#?@J)w1r;bwE@lDQ z&$R>+d?sqT{51qL>8VSnr5NX|SQ%To7fq(5`#6I9x=T1V@+d7(wAH!`xE8IF@BSYP z`;m5EntMue?35*aRT-vVfBPJF*sgPPqrLnwYB<7cw65xb!|4b)V`|2NbQ!PeFdW_u8Rx;R-(u4;mB_I9ByPEU;QmKlG?8-5~w=FZ}nW!yMxDpT ze4l|&+{EGD$1mi;U8?e(h7k-cjNFPGnY+Q$c@)Q1kM>Pcio;i18Kf+_&om+@w(6kS zy6aPGNzTh`4(B>HzfqT}@tmIQ(PIaT;cb@VrSjZr>);i??5ZPx??ysMfp$n}rZxY> z8%Olh+Wtzj*e9w|k0y~J)IclLcy@XoHiyT;iCfLs)r{#JtY3A9uCIck#B^2TFe}Nu zhoMU7t=tMRFiB7RJd8Mg-}nnw#S*8J#czqKr5RTjNQVa+#7-KnehFO4SlG0O8w__K zj?0COK9mN8lMj%kf6mM(w*UhQ|b{0>I_MORLG3f zN(E+pp%72E7dU@q@?{-bT%tulRhQa^UD7ABavI>x^f0zI$*zJq8j_D#Y)A`J!Z)l# zRMV?xNsFi#vTX33i^HP_UHSFXf;%gR+4YxHz=yLt$0i8uIXD0eyjl7@V^dO*LvfV# zcwlvZl_9(?;1g6J^CuTWMFl_)AL=f}T~R*Oi63+Miss%f_d-X+r&Lx{RQRfv0*8Hf 
zhtmzyyv7=?OLo&g@~_u%{@isdaLABfFu6I^;-M1iKHbqbKXZ(V>cruw)qvcE7&(9+ zpyVK0ePU#Z8V@n}=rKBj9I%$%*VTlDeZ4SGB!h-pOv+QM4Sb&?RMtRdwxYjMBN32` z@AStyt@q}p6GEA&OMf*$`W^-Oy^EhVHsbkkkg}Rg^kE`xdV%G3*-6h^H(5a4;sj>H z2PgYxQgplW#b|3Q}+hJV_d%8hEDy;_TG zZ|&NXzg>-{9Fc!pbypr7eR%phwh@(Zy(b2Jtl7D8@xalfFe zh|J6!KX;g`c(lK2?I5mYLO+#ei{H?kv=_DfG0@dV{AEwsrd5ovqaa2Rhk+mWbBVH{ zy$sZ)bD%fxc{mX&W7X7OkNDuZ2%#;PhF45eQOorS#(BYv69YOz`M1Y$B8MMh&b~q*Z7bKB6F-s#Z@;K`ZuFLBu9$ne1_c~_QBtuyXcScTxp3nohauS? z4>pcf+iR$N@H|ck#0P{fkMNyr9wj*s5301R-e_}ojK#thN{r?RWuq{Qa`i~#+Bs;@ zf@6P+MMi`G0GJy1`v`n+BygX^p{v7DV5br{^CK{d6s5u$g26dRq}o(pCy&&?i?BaZ zG5rhAL&!4~2|myz@4B-R@>1o9O)6INN=eX5T3inUO~4%|=hL0&@b&M3I#FVaX44a!@^N6r zV$IW^-fty~H7||!jWau<%dBTbw`oJVjBl@F1a{w95JblyL z(Zy7^wNiGbI3%BQ=e)_C$rzDxTB4#z0{wZSP;;wyPom<%b4Li|)G^`bQR^ypKrX3w zc$6|B*%G+4;u)1>9XN6?V}?l>ke@K^Pu|v~GTJLDv*I3Jf2z}Rl!T=t?LZZdUE1jn zhqB!=gf;H%0pGL~G8qY3QvuI25}nrF5)bQ&DH%zn<_feR5s`-&#=lS5Bj>f;KV7}A znrBvbEz@mjv9?KxZjsI69{r%if}_&UNJyK*fkSa|FCVm5|IL)w?!LHd^W|=r7VHPR zk2%8^6QeZ7zuR)GV3id@7DCuft@p)T^gW-Cy=@SBTOz4m5vO=?jge6=Wo?wHR-=z{eDkUJDc6+yr?4U zdz}@@|A&+%39jgyPHNl+GLnCUu{i~}mL$`=+v&Nk?Q+n;=seTGx{34kN%;@q*~Um7 z;BTsOZLgnqG(6;kLca}4bW|_K+3WPHHsyA$=u4Uv|B=_I54?FLz)Gd&>^r#{a7obI zNyRR;`i-C7sb$SA{nod^x!ng}Ge$~=B1@F)U+;%@p2=9M{24NCIq|1VE?4P_?D8=k zke)a5wW^`Hzb@_aZ(YozBk zsVF*R!pnH=ORg%X#keTk{7-`qr86F;`U3iGc^Nyn(->3r}(oiAhTsDaZC`soAbjJv2Sk1;;vE7!?9i68?v zm7?{Rlij~!##&)Tzr7-{j*f8}6I%Mv$_u7pvJB_1JzyHsMs}YK#{+&VV&wbydYx*= zaY3k-zvzTt-?t%|mmsF<`3E zhO%ARK_*nGyH&LKcu(l?l5^1RDf5?U9X@)}XmZfQoQg!l5m zvIlcLDg28nkn~3!j`k&0om5j_U@Pua^wL6oO-Um1nZNgXM-70Z1|I(U%xw7fq%&hz ze?i#ehxgl;r{I`llaG=*`ghHTw`?w>OdX(6rbJ7ZP+v4!E$lb(lw$${-39JTuvV#Go7``(yxf~8$jt(V<0niM4SGhLeN=|Z zEAuASGWg#du`x_6Q^1o6mP~n^U53#^OJlH{p84i zx)rKrXRm`RWkc3I@xLj(Kxlq)Fyc1w5~xT4hfQ=C21yBb?2OZ97&e=^Y1h~9o^D4_ ziA5pY+XOqTotXBY+VfuIzT^7Y!j1aPxDecB-5dWKwbQDfAYVx?ItpP}=dy6jXSU3- zY>45Z>R~Gh;bt60wyCUvXL~lLh4~6{i7nEnH!hE!SJ$!8jPJXX^aT3knu9H(7`up= zvkQOWB<0ZV9cOW4`n&Uy0!}Kzp3Sj48uw_QAmc@(KN2Z{Z`$fA7Tuvu`6C59Vfco= z*9UL*GwYKMcxie&=&7t)z3#|VfB&eOmZj72$j6!0FX-}LMAUbNQH!y?oXBC~8w5g( zYOkA<&g+|Lv`9Okvg3}e=UyvwxqtLRXZojSJ5L^*Qiw;kwM%i#8_u_Wo>gl5oeYVr zRf<~^Pxw*FBU(7-CZBHOcP#M2HS)VL5~MjS5&9P9x5WWzH#amkcEcc%&qWUsI}qW0 zsoVjaZ~Hbi#qY3N2P_gjSdC$We=sC*ZDd#N#T{R0XsZN>?C} z=V5Hky1Cs6$y1W!^eaD=c0)gc0I+YO?kB!XK*<=UAv!ysM56v(nBqov4$LfYg!qMT zl&g7QuM|zjXi%UNq8`El(C_mDgF!uq-`M8t1IV?{k)iv-PN5;=^OJ;1TO+KoABZJ! zY0RK3Xrly)l$hNJN)jpA&?}^9%&2dD1K%D;W?K_QCz=Z?60&slmI6Rvj?AAiuwK27 z2VBxw^3c!icl=|}r&4X#sHv$-Cwb5dQzv!jmV_1aTYLU$a+J|AYMJJ_+rwB~&-lT? 
zGpgb8q;Ye2`{eYXS>BWGjMcnP(&q-TEABF-r{;V>pLoxQ1ym$X{7i{>9n%l8YAt4o z9?dQx1T&Rxb0aj!yC?d`)K*=E@x(U_!L2s9Q8i>)9~!fg=yNy!d18$@QCJL#9R7B` zizyEB#k;|caZxdCxhDJ`7gUBng4oAf@xJ6MCKIby2q(E>Ex3V9xnk)(RKQgH)#pY) zcForZY3PCtmHsY| z9%~CwYBr~oJMVjYPIpu_m^Ss3p69Sdqy9R;R4PUSug}*-Noir@WF)d^Ga2#7CWynl z;?0K`1zu*yXNUDHW^)SrgI#@GsXCoM$NzdnoGjbSXE6&M_~=K7S1q4h^mF-cA6Kki z?N^d_dOA7X%Q!)?YDTj2=h z2p{A3)y2sl9Ndih<12FdEUN3dO?sP=0h9^*dF~#v8|ngWEukK6MkLe2$AKk=bsny(EXF@)*9?_YGK-gCDhLBk?@ zJ_u$M+>*|iq}~1E=g`QcN#T`aoxvGaHK}^XkGm?sH>Y_ClnlkqoNf4O?VB%Ka}{xJ z*UC)DueX+v-3~fyh94VB+EYRo1|}sNu1tF44-9!z4stR^igBBZC&WY6^9Yxskb?S+ z`eA`zlE}qg*uB%*w4nL7E!(5h=yi|lp7kK$z0kvP!Z88pqcR?qBQ^G@ZXva&V_nL6 zerEM})FP3|E}vblKz-9zB{ik+RSG-lI(+6)}bEtXkhT&rwb4#mDL^W<4I*gsH!^p9YiG(Cje^ zb?dVv4FqzU{%a?5wv!6g^NhHc^K=biMX4(@8M_O>_HuLGlVoDtHnz_Poc3D{b!OtsWi z^8C1NSKRf_;dFOW^jPtP|73S#miRzPSIknmJs-&rT4CFe1xFevrLpIcf22DTG2hrp z^FnFRuw{-b%YIN1+nK06^ZH2wB95l!mN5l|cIPP+vilBccRJ+oscCs4xuDG5Y$SsA zwZ*8x1s&;>gRFItaG|>uuZYc7(;(@DHGDDoR34#WJqy%xiH|k&q~SH}Ob4?lQ?rsw zoY}S``QQ0qFY&Qi`K`Q`SwYb>iL5oEKzP7$S{mP|zy4ocaJ4o!62SF9cI4oU3Df2? z%*)tc(MH00F}Bibj?Z4}#D2r8k4UNarzB&5#ji=%nAP9Hjb)~nLAPI`RUQW=jf`dP zT1OTZkUH*U->52vfGvhG>RJ!iaR;sUkfo$s-|x9iv&s?+(=_RQ2Y~q6Cnb7iZkIUJ zbVS{JYbJU`Jb}6TLJ!|BF_sF-7?jpSlAb2%MQ<9w|{JBNl~sei92Fz??k3)KKVx zqly<`3}O{6+CI@*QUe~mOPHq!Om`FeJRhdQPCO{`AK0EN5Yoh|KdW|MncNATSo+?*_F-m z=v;f4#0PluD!v6G`N5Q9rmQx?V`HM=u=wS%Wji)9s#1ZtJpX5(QtGB#2d;iFqWyDX z!ZKlu7fWk zjl_eP^}S(EI?`LvP!oV;cSk^2}d|=l1}PTzPdbiP|f>0t*yY3@nTG$SrIf`Ls%%Je+Q}zvfk6S5<)ZL9j@3m|ozq|}a*6O%Wld;6;VZ*5*a)R=cDe#AtMhxKj3N0??yox%em z^sPU=dq?I?LH7SY$^UQu3MsqgzhsL1r?~%{)A+(K@;hWxFLgbyFh@;a=G3VuYATe; HSqA+-vGf?N literal 0 HcmV?d00001 diff --git a/build.bat b/build.bat new file mode 100644 index 00000000..8a89f6e1 --- /dev/null +++ b/build.bat @@ -0,0 +1,11 @@ +@echo off +set TOP=%~dp0 +cl /nologo /c /O2 /Zi /Zl %TOP%src\server.c || EXIT /B +FOR %%F IN (%TOP%build\win64\*.asm) DO ( + ml64 /nologo /c /Cp /Cx /Zi %%F || EXIT /B +) +rem FOR %%F IN (%TOP%src\asm\*-x86_64.pl) DO ( +rem IF NOT EXIST %%~nF.asm (perl %%F masm %%~nF.asm) +rem ) +rem FOR %%F IN (*.asm) DO (ml64 /nologo /c /Cp /Cx /Zi %%F || EXIT /B) +lib /nologo /OUT:blst.lib *.obj && del *.obj diff --git a/build.sh b/build.sh new file mode 100755 index 00000000..323b9563 --- /dev/null +++ b/build.sh @@ -0,0 +1,64 @@ +#!/bin/sh -e +# +# The script allows to override 'CC', 'CFLAGS' and 'flavour' at command +# line, as well as specify additional compiler flags. For example to +# compile for x32: +# +# /some/where/build.sh flavour=elf32 -mx32 +# +# To cross-compile for mingw/Windows: +# +# /some/where/build.sh flavour=mingw64 CC=x86_64-w64-mingw32-gcc +# +# In addition script recognizes -shared flag and creates shared library +# alongside libblst.lib. + +TOP=`dirname $0`/src + +CC=${CC:-cc} +# if -Werror stands in a way, bypass with -Wno-error on command line +CFLAGS=${CFLAGS:--O -march=native -mno-avx -fPIC -Wall -Wextra -Werror} +PERL=${PERL:-perl} + +case `uname -s` in + Darwin) flavour=macosx;; + CYGWIN*) flavour=mingw64;; + MINGW*) flavour=mingw64;; + *) flavour=elf;; +esac + +unset share +while [ "x$1" != "x" ]; do + case $1 in + -shared) shared=1;; + -*) CFLAGS="$CFLAGS $1";; + *=*) eval "$1";; + esac + shift +done + +rm -f libblst.a +trap '[ $? 
-ne 0 ] && rm -f libblst.a; rm -f *.o' 0 + +(set -x; ${CC} ${CFLAGS} -c ${TOP}/server.c) + +for pl in ${TOP}/asm/*-x86_64.pl; do + s=`basename $pl .pl`.s + ${PERL} $pl $flavour > $s + (set -x; ${CC} ${CFLAGS} -c $s) + rm -f $s +done + +(set -x; ${AR:-ar} rc libblst.a *.o) + +if [ $shared ]; then + case $flavour in + macosx) echo "-shared is not supported"; exit 1;; + mingw*) sharedlib=blst.dll;; + *) sharedlib=libblst.so;; + esac + echo "{ global: blst_*; BLS12_381_*; local: *; };" |\ + (set -x; ${CC} -shared -o $sharedlib ${CFLAGS} libblst.a \ + -Wl,-Bsymbolic,--require-defined=blst_keygen \ + -Wl,--version-script=/dev/fd/0) +fi diff --git a/build/coff/add_mod_256-x86_64.s b/build/coff/add_mod_256-x86_64.s new file mode 100644 index 00000000..4fee97a9 --- /dev/null +++ b/build/coff/add_mod_256-x86_64.s @@ -0,0 +1,643 @@ +.text + +.globl add_mod_256 + +.def add_mod_256; .scl 2; .type 32; .endef +.p2align 5 +add_mod_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_add_mod_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + subq $8,%rsp + +.LSEH_body_add_mod_256: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + +.Loaded_a_add_mod_256: + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + movq %r8,%rax + adcq 16(%rdx),%r10 + movq %r9,%rsi + adcq 24(%rdx),%r11 + sbbq %rdx,%rdx + + movq %r10,%rbx + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + sbbq 16(%rcx),%r10 + movq %r11,%rbp + sbbq 24(%rcx),%r11 + sbbq $0,%rdx + + cmovcq %rax,%r8 + cmovcq %rsi,%r9 + movq %r8,0(%rdi) + cmovcq %rbx,%r10 + movq %r9,8(%rdi) + cmovcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 8(%rsp),%rbx + + movq 16(%rsp),%rbp + + leaq 24(%rsp),%rsp + +.LSEH_epilogue_add_mod_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_add_mod_256: + + +.globl mul_by_3_mod_256 + +.def mul_by_3_mod_256; .scl 2; .type 32; .endef +.p2align 5 +mul_by_3_mod_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_by_3_mod_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + +.LSEH_body_mul_by_3_mod_256: + + + movq %rdx,%rcx + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq %rsi,%rdx + movq 24(%rsi),%r11 + + call __lshift_mod_256 + movq 0(%rsp),%r12 + + jmp .Loaded_a_add_mod_256 + + movq 8(%rsp),%rbx + + movq 16(%rsp),%rbp + + leaq 24(%rsp),%rsp + +.LSEH_epilogue_mul_by_3_mod_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_by_3_mod_256: + +.def __lshift_mod_256; .scl 3; .type 32; .endef +.p2align 5 +__lshift_mod_256: + .byte 0xf3,0x0f,0x1e,0xfa + + addq %r8,%r8 + adcq %r9,%r9 + movq %r8,%rax + adcq %r10,%r10 + movq %r9,%rsi + adcq %r11,%r11 + sbbq %r12,%r12 + + movq %r10,%rbx + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + sbbq 16(%rcx),%r10 + movq %r11,%rbp + sbbq 24(%rcx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r8 + cmovcq %rsi,%r9 + cmovcq %rbx,%r10 + cmovcq %rbp,%r11 + + .byte 0xf3,0xc3 + + + +.globl lshift_mod_256 + +.def lshift_mod_256; .scl 2; .type 32; .endef +.p2align 5 +lshift_mod_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_lshift_mod_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + +.LSEH_body_lshift_mod_256: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + 
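+# %edx holds the shift count; each loop iteration doubles the value and
+# conditionally subtracts the modulus at (%rcx).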
+.Loop_lshift_mod_256: + call __lshift_mod_256 + decl %edx + jnz .Loop_lshift_mod_256 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 0(%rsp),%r12 + + movq 8(%rsp),%rbx + + movq 16(%rsp),%rbp + + leaq 24(%rsp),%rsp + +.LSEH_epilogue_lshift_mod_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_lshift_mod_256: + + +.globl rshift_mod_256 + +.def rshift_mod_256; .scl 2; .type 32; .endef +.p2align 5 +rshift_mod_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_rshift_mod_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + subq $8,%rsp + +.LSEH_body_rshift_mod_256: + + + movq 0(%rsi),%rbp + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + +.Loop_rshift_mod_256: + movq %rbp,%r8 + andq $1,%rbp + movq 0(%rcx),%rax + negq %rbp + movq 8(%rcx),%rsi + movq 16(%rcx),%rbx + + andq %rbp,%rax + andq %rbp,%rsi + andq %rbp,%rbx + andq 24(%rcx),%rbp + + addq %rax,%r8 + adcq %rsi,%r9 + adcq %rbx,%r10 + adcq %rbp,%r11 + sbbq %rax,%rax + + shrq $1,%r8 + movq %r9,%rbp + shrq $1,%r9 + movq %r10,%rbx + shrq $1,%r10 + movq %r11,%rsi + shrq $1,%r11 + + shlq $63,%rbp + shlq $63,%rbx + orq %r8,%rbp + shlq $63,%rsi + orq %rbx,%r9 + shlq $63,%rax + orq %rsi,%r10 + orq %rax,%r11 + + decl %edx + jnz .Loop_rshift_mod_256 + + movq %rbp,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 8(%rsp),%rbx + + movq 16(%rsp),%rbp + + leaq 24(%rsp),%rsp + +.LSEH_epilogue_rshift_mod_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_rshift_mod_256: + + +.globl cneg_mod_256 + +.def cneg_mod_256; .scl 2; .type 32; .endef +.p2align 5 +cneg_mod_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_cneg_mod_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + +.LSEH_body_cneg_mod_256: + + + movq 0(%rsi),%r12 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq %r12,%r8 + movq 24(%rsi),%r11 + orq %r9,%r12 + orq %r10,%r12 + orq %r11,%r12 + movq $-1,%rbp + + movq 0(%rcx),%rax + cmovnzq %rbp,%r12 + movq 8(%rcx),%rsi + movq 16(%rcx),%rbx + andq %r12,%rax + movq 24(%rcx),%rbp + andq %r12,%rsi + andq %r12,%rbx + andq %r12,%rbp + + subq %r8,%rax + sbbq %r9,%rsi + sbbq %r10,%rbx + sbbq %r11,%rbp + + orq %rdx,%rdx + + cmovzq %r8,%rax + cmovzq %r9,%rsi + movq %rax,0(%rdi) + cmovzq %r10,%rbx + movq %rsi,8(%rdi) + cmovzq %r11,%rbp + movq %rbx,16(%rdi) + movq %rbp,24(%rdi) + + movq 0(%rsp),%r12 + + movq 8(%rsp),%rbx + + movq 16(%rsp),%rbp + + leaq 24(%rsp),%rsp + +.LSEH_epilogue_cneg_mod_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_cneg_mod_256: + + +.globl sub_mod_256 + +.def sub_mod_256; .scl 2; .type 32; .endef +.p2align 5 +sub_mod_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sub_mod_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + subq $8,%rsp + +.LSEH_body_sub_mod_256: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + subq 0(%rdx),%r8 + movq 0(%rcx),%rax + sbbq 8(%rdx),%r9 + movq 8(%rcx),%rsi + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rbx + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbp + sbbq %rdx,%rdx + + andq %rdx,%rax + andq %rdx,%rsi + andq %rdx,%rbx + andq %rdx,%rbp + + addq %rax,%r8 + adcq %rsi,%r9 + movq %r8,0(%rdi) + 
adcq %rbx,%r10 + movq %r9,8(%rdi) + adcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 8(%rsp),%rbx + + movq 16(%rsp),%rbp + + leaq 24(%rsp),%rsp + +.LSEH_epilogue_sub_mod_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sub_mod_256: +.section .pdata +.p2align 2 +.rva .LSEH_begin_add_mod_256 +.rva .LSEH_body_add_mod_256 +.rva .LSEH_info_add_mod_256_prologue + +.rva .LSEH_body_add_mod_256 +.rva .LSEH_epilogue_add_mod_256 +.rva .LSEH_info_add_mod_256_body + +.rva .LSEH_epilogue_add_mod_256 +.rva .LSEH_end_add_mod_256 +.rva .LSEH_info_add_mod_256_epilogue + +.rva .LSEH_begin_mul_by_3_mod_256 +.rva .LSEH_body_mul_by_3_mod_256 +.rva .LSEH_info_mul_by_3_mod_256_prologue + +.rva .LSEH_body_mul_by_3_mod_256 +.rva .LSEH_epilogue_mul_by_3_mod_256 +.rva .LSEH_info_mul_by_3_mod_256_body + +.rva .LSEH_epilogue_mul_by_3_mod_256 +.rva .LSEH_end_mul_by_3_mod_256 +.rva .LSEH_info_mul_by_3_mod_256_epilogue + +.rva .LSEH_begin_lshift_mod_256 +.rva .LSEH_body_lshift_mod_256 +.rva .LSEH_info_lshift_mod_256_prologue + +.rva .LSEH_body_lshift_mod_256 +.rva .LSEH_epilogue_lshift_mod_256 +.rva .LSEH_info_lshift_mod_256_body + +.rva .LSEH_epilogue_lshift_mod_256 +.rva .LSEH_end_lshift_mod_256 +.rva .LSEH_info_lshift_mod_256_epilogue + +.rva .LSEH_begin_rshift_mod_256 +.rva .LSEH_body_rshift_mod_256 +.rva .LSEH_info_rshift_mod_256_prologue + +.rva .LSEH_body_rshift_mod_256 +.rva .LSEH_epilogue_rshift_mod_256 +.rva .LSEH_info_rshift_mod_256_body + +.rva .LSEH_epilogue_rshift_mod_256 +.rva .LSEH_end_rshift_mod_256 +.rva .LSEH_info_rshift_mod_256_epilogue + +.rva .LSEH_begin_cneg_mod_256 +.rva .LSEH_body_cneg_mod_256 +.rva .LSEH_info_cneg_mod_256_prologue + +.rva .LSEH_body_cneg_mod_256 +.rva .LSEH_epilogue_cneg_mod_256 +.rva .LSEH_info_cneg_mod_256_body + +.rva .LSEH_epilogue_cneg_mod_256 +.rva .LSEH_end_cneg_mod_256 +.rva .LSEH_info_cneg_mod_256_epilogue + +.rva .LSEH_begin_sub_mod_256 +.rva .LSEH_body_sub_mod_256 +.rva .LSEH_info_sub_mod_256_prologue + +.rva .LSEH_body_sub_mod_256 +.rva .LSEH_epilogue_sub_mod_256 +.rva .LSEH_info_sub_mod_256_body + +.rva .LSEH_epilogue_sub_mod_256 +.rva .LSEH_end_sub_mod_256 +.rva .LSEH_info_sub_mod_256_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_add_mod_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_add_mod_256_body: +.byte 1,0,9,0 +.byte 0x00,0x34,0x01,0x00 +.byte 0x00,0x54,0x02,0x00 +.byte 0x00,0x74,0x04,0x00 +.byte 0x00,0x64,0x05,0x00 +.byte 0x00,0x22 +.byte 0x00,0x00 +.LSEH_info_add_mod_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mul_by_3_mod_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mul_by_3_mod_256_body: +.byte 1,0,11,0 +.byte 0x00,0xc4,0x00,0x00 +.byte 0x00,0x34,0x01,0x00 +.byte 0x00,0x54,0x02,0x00 +.byte 0x00,0x74,0x04,0x00 +.byte 0x00,0x64,0x05,0x00 +.byte 0x00,0x22 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.LSEH_info_mul_by_3_mod_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_lshift_mod_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_lshift_mod_256_body: +.byte 1,0,11,0 +.byte 0x00,0xc4,0x00,0x00 +.byte 0x00,0x34,0x01,0x00 +.byte 0x00,0x54,0x02,0x00 +.byte 0x00,0x74,0x04,0x00 +.byte 0x00,0x64,0x05,0x00 +.byte 0x00,0x22 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.LSEH_info_lshift_mod_256_epilogue: 
+.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_rshift_mod_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_rshift_mod_256_body: +.byte 1,0,9,0 +.byte 0x00,0x34,0x01,0x00 +.byte 0x00,0x54,0x02,0x00 +.byte 0x00,0x74,0x04,0x00 +.byte 0x00,0x64,0x05,0x00 +.byte 0x00,0x22 +.byte 0x00,0x00 +.LSEH_info_rshift_mod_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_cneg_mod_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_cneg_mod_256_body: +.byte 1,0,11,0 +.byte 0x00,0xc4,0x00,0x00 +.byte 0x00,0x34,0x01,0x00 +.byte 0x00,0x54,0x02,0x00 +.byte 0x00,0x74,0x04,0x00 +.byte 0x00,0x64,0x05,0x00 +.byte 0x00,0x22 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.LSEH_info_cneg_mod_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sub_mod_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sub_mod_256_body: +.byte 1,0,9,0 +.byte 0x00,0x34,0x01,0x00 +.byte 0x00,0x54,0x02,0x00 +.byte 0x00,0x74,0x04,0x00 +.byte 0x00,0x64,0x05,0x00 +.byte 0x00,0x22 +.byte 0x00,0x00 +.LSEH_info_sub_mod_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + diff --git a/build/coff/add_mod_384-x86_64.s b/build/coff/add_mod_384-x86_64.s new file mode 100644 index 00000000..c36cd7f8 --- /dev/null +++ b/build/coff/add_mod_384-x86_64.s @@ -0,0 +1,2154 @@ +.text + + + +.globl add_mod_384 + +.def add_mod_384; .scl 2; .type 32; .endef +.p2align 5 +add_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_add_mod_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_add_mod_384: + + + call __add_mod_384 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_add_mod_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_add_mod_384: + +.def __add_mod_384; .scl 3; .type 32; .endef +.p2align 5 +__add_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +__add_mod_384_a_is_loaded: + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + movq %r8,%r14 + adcq 24(%rdx),%r11 + movq %r9,%r15 + adcq 32(%rdx),%r12 + movq %r10,%rax + adcq 40(%rdx),%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,0(%rdi) + cmovcq %rbx,%r11 + movq %r9,8(%rdi) + cmovcq %rbp,%r12 + movq %r10,16(%rdi) + cmovcq %rsi,%r13 + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 + + +.globl add_mod_384x + +.def add_mod_384x; .scl 2; .type 32; .endef +.p2align 5 +add_mod_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_add_mod_384x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq 
%rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $24,%rsp + +.LSEH_body_add_mod_384x: + + + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + leaq 48(%rsi),%rsi + leaq 48(%rdx),%rdx + leaq 48(%rdi),%rdi + call __add_mod_384 + + movq 0(%rsp),%rsi + movq 8(%rsp),%rdx + leaq -48(%rdi),%rdi + call __add_mod_384 + + movq 24+0(%rsp),%r15 + + movq 24+8(%rsp),%r14 + + movq 24+16(%rsp),%r13 + + movq 24+24(%rsp),%r12 + + movq 24+32(%rsp),%rbx + + movq 24+40(%rsp),%rbp + + leaq 24+48(%rsp),%rsp + +.LSEH_epilogue_add_mod_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_add_mod_384x: + + +.globl lshift_mod_384 + +.def lshift_mod_384; .scl 2; .type 32; .endef +.p2align 5 +lshift_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_lshift_mod_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rdi + +.LSEH_body_lshift_mod_384: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +.Loop_lshift_mod_384: + addq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + movq %r8,%r14 + adcq %r11,%r11 + movq %r9,%r15 + adcq %r12,%r12 + movq %r10,%rax + adcq %r13,%r13 + movq %r11,%rbx + sbbq %rdi,%rdi + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdi + + movq (%rsp),%rdi + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + cmovcq %rbx,%r11 + cmovcq %rbp,%r12 + cmovcq %rsi,%r13 + + decl %edx + jnz .Loop_lshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_lshift_mod_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_lshift_mod_384: + +.def __lshift_mod_384; .scl 3; .type 32; .endef +.p2align 5 +__lshift_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + + addq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + movq %r8,%r14 + adcq %r11,%r11 + movq %r9,%r15 + adcq %r12,%r12 + movq %r10,%rax + adcq %r13,%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + cmovcq %rbx,%r11 + cmovcq %rbp,%r12 + cmovcq %rsi,%r13 + + .byte 0xf3,0xc3 + + + +.globl mul_by_3_mod_384 + +.def mul_by_3_mod_384; .scl 2; .type 32; .endef +.p2align 5 +mul_by_3_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_by_3_mod_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rsi + +.LSEH_body_mul_by_3_mod_384: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + + movq (%rsp),%rdx + call __add_mod_384_a_is_loaded + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + 
+.LSEH_epilogue_mul_by_3_mod_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_by_3_mod_384: + +.globl mul_by_8_mod_384 + +.def mul_by_8_mod_384; .scl 2; .type 32; .endef +.p2align 5 +mul_by_8_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_by_8_mod_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_mul_by_8_mod_384: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_mul_by_8_mod_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_by_8_mod_384: + +.globl mul_by_b_onE1 + +.def mul_by_b_onE1; .scl 2; .type 32; .endef +.p2align 5 +mul_by_b_onE1: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_by_b_onE1: + movq %rcx,%rdi + movq %rdx,%rsi + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_mul_by_b_onE1: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + leaq BLS12_381_P(%rip),%rcx + + call __lshift_mod_384 + call __lshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_mul_by_b_onE1: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_by_b_onE1: + +.globl mul_by_4b_onE1 + +.def mul_by_4b_onE1; .scl 2; .type 32; .endef +.p2align 5 +mul_by_4b_onE1: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_by_4b_onE1: + movq %rcx,%rdi + movq %rdx,%rsi + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_mul_by_4b_onE1: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + leaq BLS12_381_P(%rip),%rcx + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_mul_by_4b_onE1: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_by_4b_onE1: + + +.globl mul_by_3_mod_384x + +.def mul_by_3_mod_384x; .scl 2; .type 32; .endef +.p2align 5 +mul_by_3_mod_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_by_3_mod_384x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq 
%r13 + + pushq %r14 + + pushq %r15 + + pushq %rsi + +.LSEH_body_mul_by_3_mod_384x: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + + movq (%rsp),%rdx + call __add_mod_384_a_is_loaded + + movq (%rsp),%rsi + leaq 48(%rdi),%rdi + + movq 48(%rsi),%r8 + movq 56(%rsi),%r9 + movq 64(%rsi),%r10 + movq 72(%rsi),%r11 + movq 80(%rsi),%r12 + movq 88(%rsi),%r13 + + call __lshift_mod_384 + + movq $48,%rdx + addq (%rsp),%rdx + call __add_mod_384_a_is_loaded + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_mul_by_3_mod_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_by_3_mod_384x: + +.globl mul_by_8_mod_384x + +.def mul_by_8_mod_384x; .scl 2; .type 32; .endef +.p2align 5 +mul_by_8_mod_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_by_8_mod_384x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rsi + +.LSEH_body_mul_by_8_mod_384x: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq (%rsp),%rsi + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 48+0(%rsi),%r8 + movq 48+8(%rsi),%r9 + movq 48+16(%rsi),%r10 + movq 48+24(%rsi),%r11 + movq 48+32(%rsi),%r12 + movq 48+40(%rsi),%r13 + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq %r8,48+0(%rdi) + movq %r9,48+8(%rdi) + movq %r10,48+16(%rdi) + movq %r11,48+24(%rdi) + movq %r12,48+32(%rdi) + movq %r13,48+40(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_mul_by_8_mod_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_by_8_mod_384x: + +.globl mul_by_b_onE2 + +.def mul_by_b_onE2; .scl 2; .type 32; .endef +.p2align 5 +mul_by_b_onE2: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_by_b_onE2: + movq %rcx,%rdi + movq %rdx,%rsi + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rsi + +.LSEH_body_mul_by_b_onE2: + + + leaq BLS12_381_P(%rip),%rcx + leaq 48(%rsi),%rdx + call __sub_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq 0(%rsp),%rsi + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + leaq 48(%rsi),%rdx + leaq 48(%rdi),%rdi + call __add_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_mul_by_b_onE2: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_by_b_onE2: + +.globl mul_by_4b_onE2 + +.def mul_by_4b_onE2; .scl 2; .type 32; .endef +.p2align 5 
+mul_by_4b_onE2: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_by_4b_onE2: + movq %rcx,%rdi + movq %rdx,%rsi + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rsi + +.LSEH_body_mul_by_4b_onE2: + + + leaq BLS12_381_P(%rip),%rcx + leaq 48(%rsi),%rdx + call __sub_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq 0(%rsp),%rsi + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + leaq 48(%rsi),%rdx + leaq 48(%rdi),%rdi + call __add_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_mul_by_4b_onE2: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_by_4b_onE2: + + +.globl cneg_mod_384 + +.def cneg_mod_384; .scl 2; .type 32; .endef +.p2align 5 +cneg_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_cneg_mod_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rdx + +.LSEH_body_cneg_mod_384: + + + movq 0(%rsi),%rdx + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq %rdx,%r8 + movq 24(%rsi),%r11 + orq %r9,%rdx + movq 32(%rsi),%r12 + orq %r10,%rdx + movq 40(%rsi),%r13 + orq %r11,%rdx + movq $-1,%rsi + orq %r12,%rdx + orq %r13,%rdx + + movq 0(%rcx),%r14 + cmovnzq %rsi,%rdx + movq 8(%rcx),%r15 + movq 16(%rcx),%rax + andq %rdx,%r14 + movq 24(%rcx),%rbx + andq %rdx,%r15 + movq 32(%rcx),%rbp + andq %rdx,%rax + movq 40(%rcx),%rsi + andq %rdx,%rbx + movq 0(%rsp),%rcx + andq %rdx,%rbp + andq %rdx,%rsi + + subq %r8,%r14 + sbbq %r9,%r15 + sbbq %r10,%rax + sbbq %r11,%rbx + sbbq %r12,%rbp + sbbq %r13,%rsi + + orq %rcx,%rcx + + cmovzq %r8,%r14 + cmovzq %r9,%r15 + cmovzq %r10,%rax + movq %r14,0(%rdi) + cmovzq %r11,%rbx + movq %r15,8(%rdi) + cmovzq %r12,%rbp + movq %rax,16(%rdi) + cmovzq %r13,%rsi + movq %rbx,24(%rdi) + movq %rbp,32(%rdi) + movq %rsi,40(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_cneg_mod_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_cneg_mod_384: + + +.globl sub_mod_384 + +.def sub_mod_384; .scl 2; .type 32; .endef +.p2align 5 +sub_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sub_mod_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_sub_mod_384: + + + call __sub_mod_384 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sub_mod_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sub_mod_384: + +.def __sub_mod_384; .scl 3; .type 32; .endef +.p2align 5 +__sub_mod_384: + .byte 
0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + subq 0(%rdx),%r8 + movq 0(%rcx),%r14 + sbbq 8(%rdx),%r9 + movq 8(%rcx),%r15 + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rax + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbx + sbbq 32(%rdx),%r12 + movq 32(%rcx),%rbp + sbbq 40(%rdx),%r13 + movq 40(%rcx),%rsi + sbbq %rdx,%rdx + + andq %rdx,%r14 + andq %rdx,%r15 + andq %rdx,%rax + andq %rdx,%rbx + andq %rdx,%rbp + andq %rdx,%rsi + + addq %r14,%r8 + adcq %r15,%r9 + movq %r8,0(%rdi) + adcq %rax,%r10 + movq %r9,8(%rdi) + adcq %rbx,%r11 + movq %r10,16(%rdi) + adcq %rbp,%r12 + movq %r11,24(%rdi) + adcq %rsi,%r13 + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 + + +.globl sub_mod_384x + +.def sub_mod_384x; .scl 2; .type 32; .endef +.p2align 5 +sub_mod_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sub_mod_384x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $24,%rsp + +.LSEH_body_sub_mod_384x: + + + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + leaq 48(%rsi),%rsi + leaq 48(%rdx),%rdx + leaq 48(%rdi),%rdi + call __sub_mod_384 + + movq 0(%rsp),%rsi + movq 8(%rsp),%rdx + leaq -48(%rdi),%rdi + call __sub_mod_384 + + movq 24+0(%rsp),%r15 + + movq 24+8(%rsp),%r14 + + movq 24+16(%rsp),%r13 + + movq 24+24(%rsp),%r12 + + movq 24+32(%rsp),%rbx + + movq 24+40(%rsp),%rbp + + leaq 24+48(%rsp),%rsp + +.LSEH_epilogue_sub_mod_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sub_mod_384x: +.globl mul_by_1_plus_i_mod_384x + +.def mul_by_1_plus_i_mod_384x; .scl 2; .type 32; .endef +.p2align 5 +mul_by_1_plus_i_mod_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_by_1_plus_i_mod_384x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $56,%rsp + +.LSEH_body_mul_by_1_plus_i_mod_384x: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %r8,%r14 + addq 48(%rsi),%r8 + movq %r9,%r15 + adcq 56(%rsi),%r9 + movq %r10,%rax + adcq 64(%rsi),%r10 + movq %r11,%rbx + adcq 72(%rsi),%r11 + movq %r12,%rcx + adcq 80(%rsi),%r12 + movq %r13,%rbp + adcq 88(%rsi),%r13 + movq %rdi,48(%rsp) + sbbq %rdi,%rdi + + subq 48(%rsi),%r14 + sbbq 56(%rsi),%r15 + sbbq 64(%rsi),%rax + sbbq 72(%rsi),%rbx + sbbq 80(%rsi),%rcx + sbbq 88(%rsi),%rbp + sbbq %rsi,%rsi + + movq %r8,0(%rsp) + movq 0(%rdx),%r8 + movq %r9,8(%rsp) + movq 8(%rdx),%r9 + movq %r10,16(%rsp) + movq 16(%rdx),%r10 + movq %r11,24(%rsp) + movq 24(%rdx),%r11 + movq %r12,32(%rsp) + andq %rsi,%r8 + movq 32(%rdx),%r12 + movq %r13,40(%rsp) + andq %rsi,%r9 + movq 40(%rdx),%r13 + andq %rsi,%r10 + andq %rsi,%r11 + andq %rsi,%r12 + andq %rsi,%r13 + movq 48(%rsp),%rsi + + addq %r8,%r14 + movq 0(%rsp),%r8 + adcq %r9,%r15 + movq 8(%rsp),%r9 + adcq %r10,%rax + movq 16(%rsp),%r10 + adcq %r11,%rbx + movq 24(%rsp),%r11 + adcq %r12,%rcx + movq 32(%rsp),%r12 + adcq %r13,%rbp + movq 40(%rsp),%r13 + + movq %r14,0(%rsi) + movq %r8,%r14 + movq %r15,8(%rsi) + movq %rax,16(%rsi) + movq %r9,%r15 + movq %rbx,24(%rsi) + movq %rcx,32(%rsi) + movq %r10,%rax + movq %rbp,40(%rsi) + + subq 0(%rdx),%r8 + movq %r11,%rbx + sbbq 8(%rdx),%r9 + sbbq 16(%rdx),%r10 + movq %r12,%rcx + sbbq 
24(%rdx),%r11 + sbbq 32(%rdx),%r12 + movq %r13,%rbp + sbbq 40(%rdx),%r13 + sbbq $0,%rdi + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,48(%rsi) + cmovcq %rbx,%r11 + movq %r9,56(%rsi) + cmovcq %rcx,%r12 + movq %r10,64(%rsi) + cmovcq %rbp,%r13 + movq %r11,72(%rsi) + movq %r12,80(%rsi) + movq %r13,88(%rsi) + + movq 56+0(%rsp),%r15 + + movq 56+8(%rsp),%r14 + + movq 56+16(%rsp),%r13 + + movq 56+24(%rsp),%r12 + + movq 56+32(%rsp),%rbx + + movq 56+40(%rsp),%rbp + + leaq 56+48(%rsp),%rsp + +.LSEH_epilogue_mul_by_1_plus_i_mod_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_by_1_plus_i_mod_384x: +.globl sgn0_pty_mod_384 + +.def sgn0_pty_mod_384; .scl 2; .type 32; .endef +.p2align 5 +sgn0_pty_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sgn0_pty_mod_384: + movq %rcx,%rdi + movq %rdx,%rsi + + +.LSEH_body_sgn0_pty_mod_384: + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%rcx + movq 40(%rdi),%rdx + + xorq %rax,%rax + movq %r8,%rdi + addq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %rcx,%rcx + adcq %rdx,%rdx + adcq $0,%rax + + subq 0(%rsi),%r8 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%rcx + sbbq 40(%rsi),%rdx + sbbq $0,%rax + + notq %rax + andq $1,%rdi + andq $2,%rax + orq %rdi,%rax + +.LSEH_epilogue_sgn0_pty_mod_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sgn0_pty_mod_384: + +.globl sgn0_pty_mod_384x + +.def sgn0_pty_mod_384x; .scl 2; .type 32; .endef +.p2align 5 +sgn0_pty_mod_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sgn0_pty_mod_384x: + movq %rcx,%rdi + movq %rdx,%rsi + + + pushq %rbp + + pushq %rbx + + subq $8,%rsp + +.LSEH_body_sgn0_pty_mod_384x: + + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%rcx + movq 40(%rdi),%rdx + + movq %r8,%rbx + orq %r9,%r8 + orq %r10,%r8 + orq %r11,%r8 + orq %rcx,%r8 + orq %rdx,%r8 + + xorq %rax,%rax + movq %rbx,%rbp + addq %rbx,%rbx + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %rcx,%rcx + adcq %rdx,%rdx + adcq $0,%rax + + subq 0(%rsi),%rbx + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%rcx + sbbq 40(%rsi),%rdx + sbbq $0,%rax + + movq %r8,0(%rsp) + notq %rax + andq $1,%rbp + andq $2,%rax + orq %rbp,%rax + + movq 48(%rdi),%r8 + movq 56(%rdi),%r9 + movq 64(%rdi),%r10 + movq 72(%rdi),%r11 + movq 80(%rdi),%rcx + movq 88(%rdi),%rdx + + movq %r8,%rbx + orq %r9,%r8 + orq %r10,%r8 + orq %r11,%r8 + orq %rcx,%r8 + orq %rdx,%r8 + + xorq %rdi,%rdi + movq %rbx,%rbp + addq %rbx,%rbx + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %rcx,%rcx + adcq %rdx,%rdx + adcq $0,%rdi + + subq 0(%rsi),%rbx + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%rcx + sbbq 40(%rsi),%rdx + sbbq $0,%rdi + + movq 0(%rsp),%rbx + + notq %rdi + + testq %r8,%r8 + cmovnzq %rdi,%rax + + testq %rbx,%rbx + cmovzq %rdi,%rbp + + andq $1,%rbp + andq $2,%rax + orq %rbp,%rax + + movq 8(%rsp),%rbx + + movq 16(%rsp),%rbp + + leaq 24(%rsp),%rsp + +.LSEH_epilogue_sgn0_pty_mod_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sgn0_pty_mod_384x: +.section .pdata +.p2align 2 +.rva .LSEH_begin_add_mod_384 +.rva .LSEH_body_add_mod_384 +.rva .LSEH_info_add_mod_384_prologue + +.rva .LSEH_body_add_mod_384 +.rva .LSEH_epilogue_add_mod_384 +.rva 
.LSEH_info_add_mod_384_body + +.rva .LSEH_epilogue_add_mod_384 +.rva .LSEH_end_add_mod_384 +.rva .LSEH_info_add_mod_384_epilogue + +.rva .LSEH_begin_add_mod_384x +.rva .LSEH_body_add_mod_384x +.rva .LSEH_info_add_mod_384x_prologue + +.rva .LSEH_body_add_mod_384x +.rva .LSEH_epilogue_add_mod_384x +.rva .LSEH_info_add_mod_384x_body + +.rva .LSEH_epilogue_add_mod_384x +.rva .LSEH_end_add_mod_384x +.rva .LSEH_info_add_mod_384x_epilogue + +.rva .LSEH_begin_lshift_mod_384 +.rva .LSEH_body_lshift_mod_384 +.rva .LSEH_info_lshift_mod_384_prologue + +.rva .LSEH_body_lshift_mod_384 +.rva .LSEH_epilogue_lshift_mod_384 +.rva .LSEH_info_lshift_mod_384_body + +.rva .LSEH_epilogue_lshift_mod_384 +.rva .LSEH_end_lshift_mod_384 +.rva .LSEH_info_lshift_mod_384_epilogue + +.rva .LSEH_begin_mul_by_3_mod_384 +.rva .LSEH_body_mul_by_3_mod_384 +.rva .LSEH_info_mul_by_3_mod_384_prologue + +.rva .LSEH_body_mul_by_3_mod_384 +.rva .LSEH_epilogue_mul_by_3_mod_384 +.rva .LSEH_info_mul_by_3_mod_384_body + +.rva .LSEH_epilogue_mul_by_3_mod_384 +.rva .LSEH_end_mul_by_3_mod_384 +.rva .LSEH_info_mul_by_3_mod_384_epilogue + +.rva .LSEH_begin_mul_by_8_mod_384 +.rva .LSEH_body_mul_by_8_mod_384 +.rva .LSEH_info_mul_by_8_mod_384_prologue + +.rva .LSEH_body_mul_by_8_mod_384 +.rva .LSEH_epilogue_mul_by_8_mod_384 +.rva .LSEH_info_mul_by_8_mod_384_body + +.rva .LSEH_epilogue_mul_by_8_mod_384 +.rva .LSEH_end_mul_by_8_mod_384 +.rva .LSEH_info_mul_by_8_mod_384_epilogue + +.rva .LSEH_begin_mul_by_b_onE1 +.rva .LSEH_body_mul_by_b_onE1 +.rva .LSEH_info_mul_by_b_onE1_prologue + +.rva .LSEH_body_mul_by_b_onE1 +.rva .LSEH_epilogue_mul_by_b_onE1 +.rva .LSEH_info_mul_by_b_onE1_body + +.rva .LSEH_epilogue_mul_by_b_onE1 +.rva .LSEH_end_mul_by_b_onE1 +.rva .LSEH_info_mul_by_b_onE1_epilogue + +.rva .LSEH_begin_mul_by_4b_onE1 +.rva .LSEH_body_mul_by_4b_onE1 +.rva .LSEH_info_mul_by_4b_onE1_prologue + +.rva .LSEH_body_mul_by_4b_onE1 +.rva .LSEH_epilogue_mul_by_4b_onE1 +.rva .LSEH_info_mul_by_4b_onE1_body + +.rva .LSEH_epilogue_mul_by_4b_onE1 +.rva .LSEH_end_mul_by_4b_onE1 +.rva .LSEH_info_mul_by_4b_onE1_epilogue + +.rva .LSEH_begin_mul_by_3_mod_384x +.rva .LSEH_body_mul_by_3_mod_384x +.rva .LSEH_info_mul_by_3_mod_384x_prologue + +.rva .LSEH_body_mul_by_3_mod_384x +.rva .LSEH_epilogue_mul_by_3_mod_384x +.rva .LSEH_info_mul_by_3_mod_384x_body + +.rva .LSEH_epilogue_mul_by_3_mod_384x +.rva .LSEH_end_mul_by_3_mod_384x +.rva .LSEH_info_mul_by_3_mod_384x_epilogue + +.rva .LSEH_begin_mul_by_8_mod_384x +.rva .LSEH_body_mul_by_8_mod_384x +.rva .LSEH_info_mul_by_8_mod_384x_prologue + +.rva .LSEH_body_mul_by_8_mod_384x +.rva .LSEH_epilogue_mul_by_8_mod_384x +.rva .LSEH_info_mul_by_8_mod_384x_body + +.rva .LSEH_epilogue_mul_by_8_mod_384x +.rva .LSEH_end_mul_by_8_mod_384x +.rva .LSEH_info_mul_by_8_mod_384x_epilogue + +.rva .LSEH_begin_mul_by_b_onE2 +.rva .LSEH_body_mul_by_b_onE2 +.rva .LSEH_info_mul_by_b_onE2_prologue + +.rva .LSEH_body_mul_by_b_onE2 +.rva .LSEH_epilogue_mul_by_b_onE2 +.rva .LSEH_info_mul_by_b_onE2_body + +.rva .LSEH_epilogue_mul_by_b_onE2 +.rva .LSEH_end_mul_by_b_onE2 +.rva .LSEH_info_mul_by_b_onE2_epilogue + +.rva .LSEH_begin_mul_by_4b_onE2 +.rva .LSEH_body_mul_by_4b_onE2 +.rva .LSEH_info_mul_by_4b_onE2_prologue + +.rva .LSEH_body_mul_by_4b_onE2 +.rva .LSEH_epilogue_mul_by_4b_onE2 +.rva .LSEH_info_mul_by_4b_onE2_body + +.rva .LSEH_epilogue_mul_by_4b_onE2 +.rva .LSEH_end_mul_by_4b_onE2 +.rva .LSEH_info_mul_by_4b_onE2_epilogue + +.rva .LSEH_begin_cneg_mod_384 +.rva .LSEH_body_cneg_mod_384 +.rva .LSEH_info_cneg_mod_384_prologue + +.rva 
.LSEH_body_cneg_mod_384 +.rva .LSEH_epilogue_cneg_mod_384 +.rva .LSEH_info_cneg_mod_384_body + +.rva .LSEH_epilogue_cneg_mod_384 +.rva .LSEH_end_cneg_mod_384 +.rva .LSEH_info_cneg_mod_384_epilogue + +.rva .LSEH_begin_sub_mod_384 +.rva .LSEH_body_sub_mod_384 +.rva .LSEH_info_sub_mod_384_prologue + +.rva .LSEH_body_sub_mod_384 +.rva .LSEH_epilogue_sub_mod_384 +.rva .LSEH_info_sub_mod_384_body + +.rva .LSEH_epilogue_sub_mod_384 +.rva .LSEH_end_sub_mod_384 +.rva .LSEH_info_sub_mod_384_epilogue + +.rva .LSEH_begin_sub_mod_384x +.rva .LSEH_body_sub_mod_384x +.rva .LSEH_info_sub_mod_384x_prologue + +.rva .LSEH_body_sub_mod_384x +.rva .LSEH_epilogue_sub_mod_384x +.rva .LSEH_info_sub_mod_384x_body + +.rva .LSEH_epilogue_sub_mod_384x +.rva .LSEH_end_sub_mod_384x +.rva .LSEH_info_sub_mod_384x_epilogue + +.rva .LSEH_begin_mul_by_1_plus_i_mod_384x +.rva .LSEH_body_mul_by_1_plus_i_mod_384x +.rva .LSEH_info_mul_by_1_plus_i_mod_384x_prologue + +.rva .LSEH_body_mul_by_1_plus_i_mod_384x +.rva .LSEH_epilogue_mul_by_1_plus_i_mod_384x +.rva .LSEH_info_mul_by_1_plus_i_mod_384x_body + +.rva .LSEH_epilogue_mul_by_1_plus_i_mod_384x +.rva .LSEH_end_mul_by_1_plus_i_mod_384x +.rva .LSEH_info_mul_by_1_plus_i_mod_384x_epilogue + +.rva .LSEH_begin_sgn0_pty_mod_384 +.rva .LSEH_body_sgn0_pty_mod_384 +.rva .LSEH_info_sgn0_pty_mod_384_prologue + +.rva .LSEH_body_sgn0_pty_mod_384 +.rva .LSEH_epilogue_sgn0_pty_mod_384 +.rva .LSEH_info_sgn0_pty_mod_384_body + +.rva .LSEH_epilogue_sgn0_pty_mod_384 +.rva .LSEH_end_sgn0_pty_mod_384 +.rva .LSEH_info_sgn0_pty_mod_384_epilogue + +.rva .LSEH_begin_sgn0_pty_mod_384x +.rva .LSEH_body_sgn0_pty_mod_384x +.rva .LSEH_info_sgn0_pty_mod_384x_prologue + +.rva .LSEH_body_sgn0_pty_mod_384x +.rva .LSEH_epilogue_sgn0_pty_mod_384x +.rva .LSEH_info_sgn0_pty_mod_384x_body + +.rva .LSEH_epilogue_sgn0_pty_mod_384x +.rva .LSEH_end_sgn0_pty_mod_384x +.rva .LSEH_info_sgn0_pty_mod_384x_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_add_mod_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_add_mod_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_add_mod_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_add_mod_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_add_mod_384x_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x03,0x00 +.byte 0x00,0xe4,0x04,0x00 +.byte 0x00,0xd4,0x05,0x00 +.byte 0x00,0xc4,0x06,0x00 +.byte 0x00,0x34,0x07,0x00 +.byte 0x00,0x54,0x08,0x00 +.byte 0x00,0x74,0x0a,0x00 +.byte 0x00,0x64,0x0b,0x00 +.byte 0x00,0x82 +.byte 0x00,0x00 +.LSEH_info_add_mod_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_lshift_mod_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_lshift_mod_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_lshift_mod_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 
+.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mul_by_3_mod_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mul_by_3_mod_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_mul_by_3_mod_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mul_by_8_mod_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mul_by_8_mod_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_mul_by_8_mod_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mul_by_b_onE1_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mul_by_b_onE1_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_mul_by_b_onE1_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mul_by_4b_onE1_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mul_by_4b_onE1_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_mul_by_4b_onE1_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mul_by_3_mod_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mul_by_3_mod_384x_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_mul_by_3_mod_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mul_by_8_mod_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mul_by_8_mod_384x_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_mul_by_8_mod_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mul_by_b_onE2_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mul_by_b_onE2_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 
0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_mul_by_b_onE2_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mul_by_4b_onE2_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mul_by_4b_onE2_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_mul_by_4b_onE2_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_cneg_mod_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_cneg_mod_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_cneg_mod_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sub_mod_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sub_mod_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_sub_mod_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sub_mod_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sub_mod_384x_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x03,0x00 +.byte 0x00,0xe4,0x04,0x00 +.byte 0x00,0xd4,0x05,0x00 +.byte 0x00,0xc4,0x06,0x00 +.byte 0x00,0x34,0x07,0x00 +.byte 0x00,0x54,0x08,0x00 +.byte 0x00,0x74,0x0a,0x00 +.byte 0x00,0x64,0x0b,0x00 +.byte 0x00,0x82 +.byte 0x00,0x00 +.LSEH_info_sub_mod_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mul_by_1_plus_i_mod_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mul_by_1_plus_i_mod_384x_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x07,0x00 +.byte 0x00,0xe4,0x08,0x00 +.byte 0x00,0xd4,0x09,0x00 +.byte 0x00,0xc4,0x0a,0x00 +.byte 0x00,0x34,0x0b,0x00 +.byte 0x00,0x54,0x0c,0x00 +.byte 0x00,0x74,0x0e,0x00 +.byte 0x00,0x64,0x0f,0x00 +.byte 0x00,0xc2 +.byte 0x00,0x00 +.LSEH_info_mul_by_1_plus_i_mod_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sgn0_pty_mod_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sgn0_pty_mod_384_body: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_sgn0_pty_mod_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sgn0_pty_mod_384x_prologue: +.byte 1,0,5,0x0b 
+.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sgn0_pty_mod_384x_body: +.byte 1,0,9,0 +.byte 0x00,0x34,0x01,0x00 +.byte 0x00,0x54,0x02,0x00 +.byte 0x00,0x74,0x04,0x00 +.byte 0x00,0x64,0x05,0x00 +.byte 0x00,0x22 +.byte 0x00,0x00 +.LSEH_info_sgn0_pty_mod_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + diff --git a/build/coff/add_mod_384x384-x86_64.s b/build/coff/add_mod_384x384-x86_64.s new file mode 100644 index 00000000..79976cc0 --- /dev/null +++ b/build/coff/add_mod_384x384-x86_64.s @@ -0,0 +1,326 @@ +.text + +.def __add_mod_384x384; .scl 3; .type 32; .endef +.p2align 5 +__add_mod_384x384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + addq 0(%rdx),%r8 + movq 56(%rsi),%r15 + adcq 8(%rdx),%r9 + movq 64(%rsi),%rax + adcq 16(%rdx),%r10 + movq 72(%rsi),%rbx + adcq 24(%rdx),%r11 + movq 80(%rsi),%rbp + adcq 32(%rdx),%r12 + movq 88(%rsi),%rsi + adcq 40(%rdx),%r13 + movq %r8,0(%rdi) + adcq 48(%rdx),%r14 + movq %r9,8(%rdi) + adcq 56(%rdx),%r15 + movq %r10,16(%rdi) + adcq 64(%rdx),%rax + movq %r12,32(%rdi) + movq %r14,%r8 + adcq 72(%rdx),%rbx + movq %r11,24(%rdi) + movq %r15,%r9 + adcq 80(%rdx),%rbp + movq %r13,40(%rdi) + movq %rax,%r10 + adcq 88(%rdx),%rsi + movq %rbx,%r11 + sbbq %rdx,%rdx + + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + movq %rbp,%r12 + sbbq 16(%rcx),%rax + sbbq 24(%rcx),%rbx + sbbq 32(%rcx),%rbp + movq %rsi,%r13 + sbbq 40(%rcx),%rsi + sbbq $0,%rdx + + cmovcq %r8,%r14 + cmovcq %r9,%r15 + cmovcq %r10,%rax + movq %r14,48(%rdi) + cmovcq %r11,%rbx + movq %r15,56(%rdi) + cmovcq %r12,%rbp + movq %rax,64(%rdi) + cmovcq %r13,%rsi + movq %rbx,72(%rdi) + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 + + +.def __sub_mod_384x384; .scl 3; .type 32; .endef +.p2align 5 +__sub_mod_384x384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + subq 0(%rdx),%r8 + movq 56(%rsi),%r15 + sbbq 8(%rdx),%r9 + movq 64(%rsi),%rax + sbbq 16(%rdx),%r10 + movq 72(%rsi),%rbx + sbbq 24(%rdx),%r11 + movq 80(%rsi),%rbp + sbbq 32(%rdx),%r12 + movq 88(%rsi),%rsi + sbbq 40(%rdx),%r13 + movq %r8,0(%rdi) + sbbq 48(%rdx),%r14 + movq 0(%rcx),%r8 + movq %r9,8(%rdi) + sbbq 56(%rdx),%r15 + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + sbbq 64(%rdx),%rax + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + sbbq 72(%rdx),%rbx + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + sbbq 80(%rdx),%rbp + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + sbbq 88(%rdx),%rsi + movq 40(%rcx),%r13 + sbbq %rdx,%rdx + + andq %rdx,%r8 + andq %rdx,%r9 + andq %rdx,%r10 + andq %rdx,%r11 + andq %rdx,%r12 + andq %rdx,%r13 + + addq %r8,%r14 + adcq %r9,%r15 + movq %r14,48(%rdi) + adcq %r10,%rax + movq %r15,56(%rdi) + adcq %r11,%rbx + movq %rax,64(%rdi) + adcq %r12,%rbp + movq %rbx,72(%rdi) + adcq %r13,%rsi + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 + + +.globl add_mod_384x384 + +.def add_mod_384x384; .scl 2; .type 32; .endef +.p2align 5 +add_mod_384x384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_add_mod_384x384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_add_mod_384x384: + + + call __add_mod_384x384 + + movq 
8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_add_mod_384x384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_add_mod_384x384: + +.globl sub_mod_384x384 + +.def sub_mod_384x384; .scl 2; .type 32; .endef +.p2align 5 +sub_mod_384x384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sub_mod_384x384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_sub_mod_384x384: + + + call __sub_mod_384x384 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sub_mod_384x384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sub_mod_384x384: +.section .pdata +.p2align 2 +.rva .LSEH_begin_add_mod_384x384 +.rva .LSEH_body_add_mod_384x384 +.rva .LSEH_info_add_mod_384x384_prologue + +.rva .LSEH_body_add_mod_384x384 +.rva .LSEH_epilogue_add_mod_384x384 +.rva .LSEH_info_add_mod_384x384_body + +.rva .LSEH_epilogue_add_mod_384x384 +.rva .LSEH_end_add_mod_384x384 +.rva .LSEH_info_add_mod_384x384_epilogue + +.rva .LSEH_begin_sub_mod_384x384 +.rva .LSEH_body_sub_mod_384x384 +.rva .LSEH_info_sub_mod_384x384_prologue + +.rva .LSEH_body_sub_mod_384x384 +.rva .LSEH_epilogue_sub_mod_384x384 +.rva .LSEH_info_sub_mod_384x384_body + +.rva .LSEH_epilogue_sub_mod_384x384 +.rva .LSEH_end_sub_mod_384x384 +.rva .LSEH_info_sub_mod_384x384_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_add_mod_384x384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_add_mod_384x384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_add_mod_384x384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sub_mod_384x384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sub_mod_384x384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_sub_mod_384x384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + diff --git a/build/coff/inverse_mod_384-x86_64.s b/build/coff/inverse_mod_384-x86_64.s new file mode 100644 index 00000000..4f5f4438 --- /dev/null +++ b/build/coff/inverse_mod_384-x86_64.s @@ -0,0 +1,412 @@ +.text + +.p2align 5 +.Lone: +.quad 1,0,0,0,0,0,0,0 + +.globl eucl_inverse_mod_384 + +.def eucl_inverse_mod_384; .scl 2; .type 32; .endef +.p2align 5 +eucl_inverse_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_eucl_inverse_mod_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $216,%rsp + 
+.LSEH_body_eucl_inverse_mod_384: + + + movq %rdi,0(%rsp) + leaq .Lone(%rip),%rbp + cmpq $0,%rcx + cmoveq %rbp,%rcx + + movq 0(%rsi),%rax + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rax,%r8 + orq %r9,%rax + orq %r10,%rax + orq %r11,%rax + orq %r12,%rax + orq %r13,%rax + jz .Labort + + leaq 16(%rsp),%rsi + movq 0(%rcx),%r14 + movq 8(%rcx),%r15 + movq 16(%rcx),%rax + movq 24(%rcx),%rbx + movq 32(%rcx),%rbp + movq 40(%rcx),%rdi + + movq %r8,0(%rsi) + movq %r9,8(%rsi) + movq %r10,16(%rsi) + movq %r11,24(%rsi) + movq %r12,32(%rsi) + movq %r13,40(%rsi) + + leaq 112(%rsp),%rcx + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + movq 32(%rdx),%r12 + movq 40(%rdx),%r13 + + movq %r14,48(%rsi) + movq %r15,56(%rsi) + movq %rax,64(%rsi) + movq %rbx,72(%rsi) + movq %rbp,80(%rsi) + movq %rdi,88(%rsi) + + movq %r8,0(%rcx) + movq %r9,8(%rcx) + movq %r10,16(%rcx) + movq %r11,24(%rcx) + movq %r12,32(%rcx) + movq %r13,40(%rcx) + + xorl %eax,%eax + movq %rax,48(%rcx) + movq %rax,56(%rcx) + movq %rax,64(%rcx) + movq %rax,72(%rcx) + movq %rax,80(%rcx) + movq %rax,88(%rcx) + jmp .Loop_inv + +.p2align 5 +.Loop_inv: + leaq 112(%rsp),%rsi + call __remove_powers_of_2 + + leaq 16(%rsp),%rsi + call __remove_powers_of_2 + + leaq 112(%rsp),%rcx + subq 112+0(%rsp),%r8 + sbbq 8(%rcx),%r9 + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + sbbq 40(%rcx),%r13 + jae .Lu_greater_than_v + + + xchgq %rcx,%rsi + + notq %r8 + notq %r9 + notq %r10 + notq %r11 + notq %r12 + notq %r13 + + addq $1,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + +.Lu_greater_than_v: + movq 48(%rsi),%r14 + movq 56(%rsi),%r15 + movq 64(%rsi),%rax + movq 72(%rsi),%rbx + movq 80(%rsi),%rbp + movq 88(%rsi),%rdi + + subq 48(%rcx),%r14 + sbbq 56(%rcx),%r15 + sbbq 64(%rcx),%rax + sbbq 72(%rcx),%rbx + sbbq 80(%rcx),%rbp + sbbq 88(%rcx),%rdi + + movq %r8,0(%rsi) + sbbq %r8,%r8 + movq %r9,8(%rsi) + movq %r8,%r9 + movq %r10,16(%rsi) + movq %r8,%r10 + movq %r11,24(%rsi) + movq %r8,%r11 + movq %r12,32(%rsi) + movq %r8,%r12 + movq %r13,40(%rsi) + movq %r8,%r13 + + andq 0(%rdx),%r8 + andq 8(%rdx),%r9 + andq 16(%rdx),%r10 + andq 24(%rdx),%r11 + andq 32(%rdx),%r12 + andq 40(%rdx),%r13 + + addq %r8,%r14 + adcq %r9,%r15 + adcq %r10,%rax + adcq %r11,%rbx + adcq %r12,%rbp + adcq %r13,%rdi + + movq %r14,48(%rsi) + movq %r15,56(%rsi) + movq %rax,64(%rsi) + movq %rbx,72(%rsi) + movq %rbp,80(%rsi) + movq %rdi,88(%rsi) + + movq 16+0(%rsp),%r8 + movq 16+8(%rsp),%r9 + movq 16+16(%rsp),%r10 + movq 16+24(%rsp),%r11 + orq %r9,%r8 + orq 16+32(%rsp),%r10 + orq 16+40(%rsp),%r11 +.byte 0x67 + orq %r10,%r8 + orq %r11,%r8 + jnz .Loop_inv + + leaq 112(%rsp),%rsi + movq 0(%rsp),%rdi + movl $1,%eax + + movq 48(%rsi),%r8 + movq 56(%rsi),%r9 + movq 64(%rsi),%r10 + movq 72(%rsi),%r11 + movq 80(%rsi),%r12 + movq 88(%rsi),%r13 + +.Labort: + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + leaq 216(%rsp),%r8 + movq 0(%r8),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_eucl_inverse_mod_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_eucl_inverse_mod_384: + +.def __remove_powers_of_2; .scl 3; .type 32; .endef +.p2align 5 +__remove_powers_of_2: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 
+ movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +.Loop_of_2: + bsfq %r8,%rcx + movl $63,%eax + cmovzl %eax,%ecx + + cmpl $0,%ecx + je .Loop_of_2_done + + shrq %cl,%r8 + movq %r9,%r14 + shrq %cl,%r9 + movq %r10,%r15 + shrq %cl,%r10 + movq %r11,%rax + shrq %cl,%r11 + movq %r12,%rbx + shrq %cl,%r12 + movq %r13,%rbp + shrq %cl,%r13 + negb %cl + shlq %cl,%r14 + shlq %cl,%r15 + orq %r14,%r8 + movq 48(%rsi),%r14 + shlq %cl,%rax + orq %r15,%r9 + movq 56(%rsi),%r15 + shlq %cl,%rbx + orq %rax,%r10 + movq 64(%rsi),%rax + shlq %cl,%rbp + orq %rbx,%r11 + movq 72(%rsi),%rbx + orq %rbp,%r12 + movq 80(%rsi),%rbp + negb %cl + movq 88(%rsi),%rdi + + movq %r8,0(%rsi) + movq %r9,8(%rsi) + movq %r10,16(%rsi) + movq %r11,24(%rsi) + movq %r12,32(%rsi) + movq %r13,40(%rsi) + jmp .Loop_div_by_2 + +.p2align 5 +.Loop_div_by_2: + movq $1,%r13 + movq 0(%rdx),%r8 + andq %r14,%r13 + movq 8(%rdx),%r9 + negq %r13 + movq 16(%rdx),%r10 + andq %r13,%r8 + movq 24(%rdx),%r11 + andq %r13,%r9 + movq 32(%rdx),%r12 + andq %r13,%r10 + andq %r13,%r11 + andq %r13,%r12 + andq 40(%rdx),%r13 + + addq %r8,%r14 + adcq %r9,%r15 + adcq %r10,%rax + adcq %r11,%rbx + adcq %r12,%rbp + adcq %r13,%rdi + sbbq %r13,%r13 + + shrq $1,%r14 + movq %r15,%r8 + shrq $1,%r15 + movq %rax,%r9 + shrq $1,%rax + movq %rbx,%r10 + shrq $1,%rbx + movq %rbp,%r11 + shrq $1,%rbp + movq %rdi,%r12 + shrq $1,%rdi + shlq $63,%r8 + shlq $63,%r9 + orq %r8,%r14 + shlq $63,%r10 + orq %r9,%r15 + shlq $63,%r11 + orq %r10,%rax + shlq $63,%r12 + orq %r11,%rbx + shlq $63,%r13 + orq %r12,%rbp + orq %r13,%rdi + + decl %ecx + jnz .Loop_div_by_2 + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %r14,48(%rsi) + movq %r15,56(%rsi) + movq %rax,64(%rsi) + movq %rbx,72(%rsi) + movq %rbp,80(%rsi) + movq %rdi,88(%rsi) + + testq $1,%r8 +.byte 0x2e + jz .Loop_of_2 + +.Loop_of_2_done: + .byte 0xf3,0xc3 + +.section .pdata +.p2align 2 +.rva .LSEH_begin_eucl_inverse_mod_384 +.rva .LSEH_body_eucl_inverse_mod_384 +.rva .LSEH_info_eucl_inverse_mod_384_prologue + +.rva .LSEH_body_eucl_inverse_mod_384 +.rva .LSEH_epilogue_eucl_inverse_mod_384 +.rva .LSEH_info_eucl_inverse_mod_384_body + +.rva .LSEH_epilogue_eucl_inverse_mod_384 +.rva .LSEH_end_eucl_inverse_mod_384 +.rva .LSEH_info_eucl_inverse_mod_384_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_eucl_inverse_mod_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_eucl_inverse_mod_384_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x1b,0x00 +.byte 0x00,0xe4,0x1c,0x00 +.byte 0x00,0xd4,0x1d,0x00 +.byte 0x00,0xc4,0x1e,0x00 +.byte 0x00,0x34,0x1f,0x00 +.byte 0x00,0x54,0x20,0x00 +.byte 0x00,0x74,0x22,0x00 +.byte 0x00,0x64,0x23,0x00 +.byte 0x00,0x01,0x21,0x00 +.LSEH_info_eucl_inverse_mod_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + diff --git a/build/coff/mulq_mont_256-x86_64.s b/build/coff/mulq_mont_256-x86_64.s new file mode 100644 index 00000000..dd1e00fa --- /dev/null +++ b/build/coff/mulq_mont_256-x86_64.s @@ -0,0 +1,872 @@ +.text + +.globl mul_mont_sparse_256 + +.def mul_mont_sparse_256; .scl 2; .type 32; .endef +.p2align 5 +mul_mont_sparse_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_mont_sparse_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + movq 40(%rsp),%r8 + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rdi + 
+.LSEH_body_mul_mont_sparse_256: + + + movq 0(%rdx),%rax + movq 0(%rsi),%r13 + movq 8(%rsi),%r14 + movq 16(%rsi),%r12 + movq 24(%rsi),%rbp + movq %rdx,%rbx + + movq %rax,%r15 + mulq %r13 + movq %rax,%r9 + movq %r15,%rax + movq %rdx,%r10 + call __mulq_mont_sparse_256 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_mul_mont_sparse_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_mont_sparse_256: + +.globl sqr_mont_sparse_256 + +.def sqr_mont_sparse_256; .scl 2; .type 32; .endef +.p2align 5 +sqr_mont_sparse_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqr_mont_sparse_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rdi + +.LSEH_body_sqr_mont_sparse_256: + + + movq 0(%rsi),%rax + movq %rcx,%r8 + movq 8(%rsi),%r14 + movq %rdx,%rcx + movq 16(%rsi),%r12 + leaq (%rsi),%rbx + movq 24(%rsi),%rbp + + movq %rax,%r15 + mulq %rax + movq %rax,%r9 + movq %r15,%rax + movq %rdx,%r10 + call __mulq_mont_sparse_256 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sqr_mont_sparse_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqr_mont_sparse_256: +.def __mulq_mont_sparse_256; .scl 3; .type 32; .endef +.p2align 5 +__mulq_mont_sparse_256: + .byte 0xf3,0x0f,0x1e,0xfa + + mulq %r14 + addq %rax,%r10 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %r12 + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq %rbp + addq %rax,%r12 + movq 8(%rbx),%rax + adcq $0,%rdx + xorq %r14,%r14 + movq %rdx,%r13 + + movq %r9,%rdi + imulq %r8,%r9 + + + movq %rax,%r15 + mulq 0(%rsi) + addq %rax,%r10 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rax,%r12 + movq %r15,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rsi) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq %rdx,%r14 + xorq %r15,%r15 + + + mulq 0(%rcx) + addq %rax,%rdi + movq %r9,%rax + adcq %rdx,%rdi + + mulq 8(%rcx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %rdi,%r10 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rax,%r12 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + addq %rdx,%r13 + adcq $0,%r14 + adcq $0,%r15 + movq %r10,%rdi + imulq %r8,%r10 + + + movq %rax,%r9 + mulq 0(%rsi) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rsi) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq %rdx,%r15 + xorq %r9,%r9 + + + mulq 0(%rcx) + addq %rax,%rdi + movq %r10,%rax + adcq %rdx,%rdi + + mulq 8(%rcx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %rdi,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq 
%rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rax,%r13 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + addq %rdx,%r14 + adcq $0,%r15 + adcq $0,%r9 + movq %r11,%rdi + imulq %r8,%r11 + + + movq %rax,%r10 + mulq 0(%rsi) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rsi) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq %rdx,%r9 + xorq %r10,%r10 + + + mulq 0(%rcx) + addq %rax,%rdi + movq %r11,%rax + adcq %rdx,%rdi + + mulq 8(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %rdi,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + addq %rdx,%r15 + adcq $0,%r9 + adcq $0,%r10 + imulq %r8,%rax + movq 8(%rsp),%rsi + + + movq %rax,%r11 + mulq 0(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq %rdx,%r12 + + mulq 8(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r12,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + movq %r14,%rbx + addq %rbp,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %rdx,%r9 + adcq $0,%r10 + + + + + movq %r15,%r12 + subq 0(%rcx),%r13 + sbbq 8(%rcx),%r14 + sbbq 16(%rcx),%r15 + movq %r9,%rbp + sbbq 24(%rcx),%r9 + sbbq $0,%r10 + + cmovcq %rax,%r13 + cmovcq %rbx,%r14 + cmovcq %r12,%r15 + movq %r13,0(%rsi) + cmovcq %rbp,%r9 + movq %r14,8(%rsi) + movq %r15,16(%rsi) + movq %r9,24(%rsi) + + .byte 0xf3,0xc3 + + +.globl from_mont_256 + +.def from_mont_256; .scl 2; .type 32; .endef +.p2align 5 +from_mont_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_from_mont_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_from_mont_256: + + + movq %rdx,%rbx + call __mulq_by_1_mont_256 + + + + + + movq %r14,%r10 + movq %r15,%r11 + movq %r9,%r12 + + subq 0(%rbx),%r13 + sbbq 8(%rbx),%r14 + sbbq 16(%rbx),%r15 + sbbq 24(%rbx),%r9 + + cmovncq %r13,%rax + cmovncq %r14,%r10 + cmovncq %r15,%r11 + movq %rax,0(%rdi) + cmovncq %r9,%r12 + movq %r10,8(%rdi) + movq %r11,16(%rdi) + movq %r12,24(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_from_mont_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_from_mont_256: + +.globl redc_mont_256 + +.def redc_mont_256; .scl 2; .type 32; .endef +.p2align 5 +redc_mont_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_redc_mont_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_redc_mont_256: + + + movq %rdx,%rbx + call __mulq_by_1_mont_256 + + addq 32(%rsi),%r13 + adcq 40(%rsi),%r14 + 
movq %r13,%rax + adcq 48(%rsi),%r15 + movq %r14,%r10 + adcq 56(%rsi),%r9 + sbbq %rsi,%rsi + + + + + movq %r15,%r11 + subq 0(%rbx),%r13 + sbbq 8(%rbx),%r14 + sbbq 16(%rbx),%r15 + movq %r9,%r12 + sbbq 24(%rbx),%r9 + sbbq $0,%rsi + + cmovncq %r13,%rax + cmovncq %r14,%r10 + cmovncq %r15,%r11 + movq %rax,0(%rdi) + cmovncq %r9,%r12 + movq %r10,8(%rdi) + movq %r11,16(%rdi) + movq %r12,24(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_redc_mont_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_redc_mont_256: +.def __mulq_by_1_mont_256; .scl 3; .type 32; .endef +.p2align 5 +__mulq_by_1_mont_256: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + + movq %rax,%r13 + imulq %rcx,%rax + movq %rax,%r9 + + mulq 0(%rbx) + addq %rax,%r13 + movq %r9,%rax + adcq %rdx,%r13 + + mulq 8(%rbx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %r13,%r10 + adcq $0,%rdx + movq %rdx,%r13 + + mulq 16(%rbx) + movq %r10,%r14 + imulq %rcx,%r10 + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %r13,%r11 + adcq $0,%rdx + movq %rdx,%r13 + + mulq 24(%rbx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r13,%r12 + adcq $0,%rdx + movq %rdx,%r13 + + mulq 0(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq %rdx,%r14 + + mulq 8(%rbx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r11 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 16(%rbx) + movq %r11,%r15 + imulq %rcx,%r11 + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r12 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 24(%rbx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r14,%r13 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq %rdx,%r15 + + mulq 8(%rbx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rbx) + movq %r12,%r9 + imulq %rcx,%r12 + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r15,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rbx) + addq %rax,%r9 + movq %r12,%rax + adcq %rdx,%r9 + + mulq 8(%rbx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r9,%r15 + adcq $0,%rdx + movq %rdx,%r9 + .byte 0xf3,0xc3 + +.section .pdata +.p2align 2 +.rva .LSEH_begin_mul_mont_sparse_256 +.rva .LSEH_body_mul_mont_sparse_256 +.rva .LSEH_info_mul_mont_sparse_256_prologue + +.rva .LSEH_body_mul_mont_sparse_256 +.rva .LSEH_epilogue_mul_mont_sparse_256 +.rva .LSEH_info_mul_mont_sparse_256_body + +.rva .LSEH_epilogue_mul_mont_sparse_256 +.rva .LSEH_end_mul_mont_sparse_256 +.rva .LSEH_info_mul_mont_sparse_256_epilogue + +.rva .LSEH_begin_sqr_mont_sparse_256 +.rva .LSEH_body_sqr_mont_sparse_256 +.rva .LSEH_info_sqr_mont_sparse_256_prologue + +.rva .LSEH_body_sqr_mont_sparse_256 +.rva .LSEH_epilogue_sqr_mont_sparse_256 +.rva .LSEH_info_sqr_mont_sparse_256_body + +.rva .LSEH_epilogue_sqr_mont_sparse_256 +.rva .LSEH_end_sqr_mont_sparse_256 +.rva .LSEH_info_sqr_mont_sparse_256_epilogue + +.rva .LSEH_begin_from_mont_256 +.rva .LSEH_body_from_mont_256 +.rva 
.LSEH_info_from_mont_256_prologue + +.rva .LSEH_body_from_mont_256 +.rva .LSEH_epilogue_from_mont_256 +.rva .LSEH_info_from_mont_256_body + +.rva .LSEH_epilogue_from_mont_256 +.rva .LSEH_end_from_mont_256 +.rva .LSEH_info_from_mont_256_epilogue + +.rva .LSEH_begin_redc_mont_256 +.rva .LSEH_body_redc_mont_256 +.rva .LSEH_info_redc_mont_256_prologue + +.rva .LSEH_body_redc_mont_256 +.rva .LSEH_epilogue_redc_mont_256 +.rva .LSEH_info_redc_mont_256_body + +.rva .LSEH_epilogue_redc_mont_256 +.rva .LSEH_end_redc_mont_256 +.rva .LSEH_info_redc_mont_256_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_mul_mont_sparse_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mul_mont_sparse_256_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_mul_mont_sparse_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqr_mont_sparse_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqr_mont_sparse_256_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_sqr_mont_sparse_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_from_mont_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_from_mont_256_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_from_mont_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_redc_mont_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_redc_mont_256_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_redc_mont_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + diff --git a/build/coff/mulq_mont_384-x86_64.s b/build/coff/mulq_mont_384-x86_64.s new file mode 100644 index 00000000..f0d52f5c --- /dev/null +++ b/build/coff/mulq_mont_384-x86_64.s @@ -0,0 +1,4205 @@ +.text + + + + + + + +.def __sub_mod_384x384; .scl 3; .type 32; .endef +.p2align 5 +__sub_mod_384x384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + subq 0(%rdx),%r8 + movq 56(%rsi),%r15 + sbbq 8(%rdx),%r9 + movq 64(%rsi),%rax + sbbq 16(%rdx),%r10 + movq 72(%rsi),%rbx + sbbq 24(%rdx),%r11 + movq 80(%rsi),%rbp + sbbq 32(%rdx),%r12 + movq 88(%rsi),%rsi + sbbq 40(%rdx),%r13 + movq %r8,0(%rdi) + sbbq 
48(%rdx),%r14 + movq 0(%rcx),%r8 + movq %r9,8(%rdi) + sbbq 56(%rdx),%r15 + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + sbbq 64(%rdx),%rax + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + sbbq 72(%rdx),%rbx + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + sbbq 80(%rdx),%rbp + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + sbbq 88(%rdx),%rsi + movq 40(%rcx),%r13 + sbbq %rdx,%rdx + + andq %rdx,%r8 + andq %rdx,%r9 + andq %rdx,%r10 + andq %rdx,%r11 + andq %rdx,%r12 + andq %rdx,%r13 + + addq %r8,%r14 + adcq %r9,%r15 + movq %r14,48(%rdi) + adcq %r10,%rax + movq %r15,56(%rdi) + adcq %r11,%rbx + movq %rax,64(%rdi) + adcq %r12,%rbp + movq %rbx,72(%rdi) + adcq %r13,%rsi + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 + + +.def __add_mod_384; .scl 3; .type 32; .endef +.p2align 5 +__add_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + movq %r8,%r14 + adcq 24(%rdx),%r11 + movq %r9,%r15 + adcq 32(%rdx),%r12 + movq %r10,%rax + adcq 40(%rdx),%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,0(%rdi) + cmovcq %rbx,%r11 + movq %r9,8(%rdi) + cmovcq %rbp,%r12 + movq %r10,16(%rdi) + cmovcq %rsi,%r13 + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 + + +.def __sub_mod_384; .scl 3; .type 32; .endef +.p2align 5 +__sub_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +__sub_mod_384_a_is_loaded: + subq 0(%rdx),%r8 + movq 0(%rcx),%r14 + sbbq 8(%rdx),%r9 + movq 8(%rcx),%r15 + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rax + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbx + sbbq 32(%rdx),%r12 + movq 32(%rcx),%rbp + sbbq 40(%rdx),%r13 + movq 40(%rcx),%rsi + sbbq %rdx,%rdx + + andq %rdx,%r14 + andq %rdx,%r15 + andq %rdx,%rax + andq %rdx,%rbx + andq %rdx,%rbp + andq %rdx,%rsi + + addq %r14,%r8 + adcq %r15,%r9 + movq %r8,0(%rdi) + adcq %rax,%r10 + movq %r9,8(%rdi) + adcq %rbx,%r11 + movq %r10,16(%rdi) + adcq %rbp,%r12 + movq %r11,24(%rdi) + adcq %rsi,%r13 + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 + +.globl mul_mont_384x + +.def mul_mont_384x; .scl 2; .type 32; .endef +.p2align 5 +mul_mont_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_mont_384x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + movq 40(%rsp),%r8 + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $328,%rsp + +.LSEH_body_mul_mont_384x: + + + movq %rdx,%rbx + movq %rdi,32(%rsp) + movq %rsi,24(%rsp) + movq %rdx,16(%rsp) + movq %rcx,8(%rsp) + movq %r8,0(%rsp) + + + + + leaq 40(%rsp),%rdi + call __mulq_384 + + + leaq 48(%rbx),%rbx + leaq 48(%rsi),%rsi + leaq 40+96(%rsp),%rdi + call __mulq_384 + + + movq 8(%rsp),%rcx + leaq -48(%rsi),%rdx + leaq 40+192+48(%rsp),%rdi + call __add_mod_384 + + movq 16(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq -48(%rdi),%rdi + call __add_mod_384 + + leaq (%rdi),%rbx + leaq 48(%rdi),%rsi + call __mulq_384 + + + leaq (%rdi),%rsi + leaq 40(%rsp),%rdx + movq 8(%rsp),%rcx + call __sub_mod_384x384 + + leaq (%rdi),%rsi + leaq -96(%rdi),%rdx + call __sub_mod_384x384 + + + 
leaq 40(%rsp),%rsi + leaq 40+96(%rsp),%rdx + leaq 40(%rsp),%rdi + call __sub_mod_384x384 + + movq %rcx,%rbx + + + leaq 40(%rsp),%rsi + movq 0(%rsp),%rcx + movq 32(%rsp),%rdi + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + + leaq 40+192(%rsp),%rsi + movq 0(%rsp),%rcx + leaq 48(%rdi),%rdi + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + leaq 328(%rsp),%r8 + movq 0(%r8),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_mul_mont_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_mont_384x: +.globl sqr_mont_384x + +.def sqr_mont_384x; .scl 2; .type 32; .endef +.p2align 5 +sqr_mont_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqr_mont_384x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $136,%rsp + +.LSEH_body_sqr_mont_384x: + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + movq %rsi,16(%rsp) +.byte 102,72,15,110,199 + + + leaq 48(%rsi),%rdx + leaq 32(%rsp),%rdi + call __add_mod_384 + + + movq 16(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq 32+48(%rsp),%rdi + call __sub_mod_384 + + + movq 16(%rsp),%rsi + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rax + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + + call __mulq_mont_384 + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + movq %r14,%r12 + adcq %r9,%r9 + movq %r15,%r13 + adcq %r10,%r10 + movq %r8,%rax + adcq %r11,%r11 + movq %r9,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + movq %r10,%rbp + sbbq 16(%rcx),%r8 + sbbq 24(%rcx),%r9 + sbbq 32(%rcx),%r10 + movq %r11,%rsi + sbbq 40(%rcx),%r11 + sbbq $0,%rdx + + cmovcq %r12,%r14 + cmovcq %r13,%r15 + cmovcq %rax,%r8 + movq %r14,48(%rdi) + cmovcq %rbx,%r9 + movq %r15,56(%rdi) + cmovcq %rbp,%r10 + movq %r8,64(%rdi) + cmovcq %rsi,%r11 + movq %r9,72(%rdi) + movq %r10,80(%rdi) + movq %r11,88(%rdi) + + leaq 32(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rax + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%r12 + movq 32+24(%rsp),%r13 + + call __mulq_mont_384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_sqr_mont_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqr_mont_384x: + +.globl mul_382x + +.def mul_382x; .scl 2; .type 32; .endef +.p2align 5 +mul_382x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_382x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $136,%rsp + +.LSEH_body_mul_382x: + + + leaq 96(%rdi),%rdi + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + movq %rdi,16(%rsp) + movq %rcx,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 48(%rsi),%r8 + adcq 56(%rsi),%r9 + adcq 64(%rsi),%r10 + adcq 72(%rsi),%r11 + adcq 80(%rsi),%r12 + adcq 88(%rsi),%r13 + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + movq 
32(%rdx),%r12 + movq 40(%rdx),%r13 + + addq 48(%rdx),%r8 + adcq 56(%rdx),%r9 + adcq 64(%rdx),%r10 + adcq 72(%rdx),%r11 + adcq 80(%rdx),%r12 + adcq 88(%rdx),%r13 + + movq %r8,32+48(%rsp) + movq %r9,32+56(%rsp) + movq %r10,32+64(%rsp) + movq %r11,32+72(%rsp) + movq %r12,32+80(%rsp) + movq %r13,32+88(%rsp) + + + leaq 32+0(%rsp),%rsi + leaq 32+48(%rsp),%rbx + call __mulq_384 + + + movq 0(%rsp),%rsi + movq 8(%rsp),%rbx + leaq -96(%rdi),%rdi + call __mulq_384 + + + leaq 48(%rsi),%rsi + leaq 48(%rbx),%rbx + leaq 32(%rsp),%rdi + call __mulq_384 + + + movq 16(%rsp),%rsi + leaq 32(%rsp),%rdx + movq 24(%rsp),%rcx + movq %rsi,%rdi + call __sub_mod_384x384 + + + leaq 0(%rdi),%rsi + leaq -96(%rdi),%rdx + call __sub_mod_384x384 + + + leaq -96(%rdi),%rsi + leaq 32(%rsp),%rdx + leaq -96(%rdi),%rdi + call __sub_mod_384x384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_mul_382x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_382x: +.globl sqr_382x + +.def sqr_382x; .scl 2; .type 32; .endef +.p2align 5 +sqr_382x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqr_382x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rsi + +.LSEH_body_sqr_382x: + + + movq %rdx,%rcx + + + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%rbx + movq 32(%rsi),%rbp + movq 40(%rsi),%rdx + + movq %r14,%r8 + addq 48(%rsi),%r14 + movq %r15,%r9 + adcq 56(%rsi),%r15 + movq %rax,%r10 + adcq 64(%rsi),%rax + movq %rbx,%r11 + adcq 72(%rsi),%rbx + movq %rbp,%r12 + adcq 80(%rsi),%rbp + movq %rdx,%r13 + adcq 88(%rsi),%rdx + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %rax,16(%rdi) + movq %rbx,24(%rdi) + movq %rbp,32(%rdi) + movq %rdx,40(%rdi) + + + leaq 48(%rsi),%rdx + leaq 48(%rdi),%rdi + call __sub_mod_384_a_is_loaded + + + leaq (%rdi),%rsi + leaq -48(%rdi),%rbx + leaq -48(%rdi),%rdi + call __mulq_384 + + + movq (%rsp),%rsi + leaq 48(%rsi),%rbx + leaq 96(%rdi),%rdi + call __mulq_384 + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq 40(%rdi),%r13 + movq 48(%rdi),%r14 + movq 56(%rdi),%r15 + movq 64(%rdi),%rax + movq 72(%rdi),%rbx + movq 80(%rdi),%rbp + addq %r8,%r8 + movq 88(%rdi),%rdx + adcq %r9,%r9 + movq %r8,0(%rdi) + adcq %r10,%r10 + movq %r9,8(%rdi) + adcq %r11,%r11 + movq %r10,16(%rdi) + adcq %r12,%r12 + movq %r11,24(%rdi) + adcq %r13,%r13 + movq %r12,32(%rdi) + adcq %r14,%r14 + movq %r13,40(%rdi) + adcq %r15,%r15 + movq %r14,48(%rdi) + adcq %rax,%rax + movq %r15,56(%rdi) + adcq %rbx,%rbx + movq %rax,64(%rdi) + adcq %rbp,%rbp + movq %rbx,72(%rdi) + adcq %rdx,%rdx + movq %rbp,80(%rdi) + movq %rdx,88(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sqr_382x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqr_382x: +.globl mul_384 + +.def mul_384; .scl 2; .type 32; .endef +.p2align 5 +mul_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + +.LSEH_body_mul_384: + + + movq %rdx,%rbx + call __mulq_384 + + movq 
0(%rsp),%r12 + + movq 8(%rsp),%rbx + + movq 16(%rsp),%rbp + + leaq 24(%rsp),%rsp + +.LSEH_epilogue_mul_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_384: + +.def __mulq_384; .scl 3; .type 32; .endef +.p2align 5 +__mulq_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rbx),%rax + + movq %rax,%rbp + mulq 0(%rsi) + movq %rax,0(%rdi) + movq %rbp,%rax + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r11 + movq 8(%rbx),%rax + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,8(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,16(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,24(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 32(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,32(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 40(%rbx),%rax + adcq $0,%rdx + addq 
%r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,40(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq %rax,%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rcx,48(%rdi) + movq %r8,56(%rdi) + movq %r9,64(%rdi) + movq %r10,72(%rdi) + movq %r11,80(%rdi) + movq %r12,88(%rdi) + + .byte 0xf3,0xc3 + +.globl sqr_384 + +.def sqr_384; .scl 2; .type 32; .endef +.p2align 5 +sqr_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqr_384: + movq %rcx,%rdi + movq %rdx,%rsi + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_sqr_384: + + + call __sqrq_384 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sqr_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqr_384: + +.def __sqrq_384; .scl 3; .type 32; .endef +.p2align 5 +__sqrq_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r15 + movq 16(%rsi),%rcx + movq 24(%rsi),%rbx + + + movq %rax,%r14 + mulq %r15 + movq %rax,%r9 + movq %r14,%rax + movq 32(%rsi),%rbp + movq %rdx,%r10 + + mulq %rcx + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + movq 40(%rsi),%rsi + movq %rdx,%r11 + + mulq %rbx + addq %rax,%r11 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq %rbp + addq %rax,%r12 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r13 + + mulq %rsi + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + + mulq %rax + xorq %r8,%r8 + movq %rax,0(%rdi) + movq %r15,%rax + addq %r9,%r9 + adcq $0,%r8 + addq %rdx,%r9 + adcq $0,%r8 + movq %r9,8(%rdi) + + mulq %rcx + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq %rbx + addq %rax,%r12 + movq %r15,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq %rbp + addq %rax,%r13 + movq %r15,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq %rsi + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq %rax + xorq %r9,%r9 + addq %rax,%r8 + movq %rcx,%rax + addq %r10,%r10 + adcq %r11,%r11 + adcq $0,%r9 + addq %r8,%r10 + adcq %rdx,%r11 + adcq $0,%r9 + movq %r10,16(%rdi) + + mulq %rbx + addq %rax,%r13 + movq %rcx,%rax + adcq $0,%rdx + movq %r11,24(%rdi) + movq %rdx,%r8 + + mulq %rbp + addq %rax,%r14 + movq %rcx,%rax + adcq $0,%rdx + addq %r8,%r14 + adcq $0,%rdx + movq %rdx,%r8 + + mulq %rsi + addq %rax,%r15 + movq %rcx,%rax + adcq $0,%rdx + addq %r8,%r15 + adcq $0,%rdx + movq %rdx,%rcx + + mulq %rax + xorq %r11,%r11 + addq %rax,%r9 + movq %rbx,%rax + addq %r12,%r12 + adcq %r13,%r13 + adcq $0,%r11 + addq %r9,%r12 + adcq %rdx,%r13 + adcq $0,%r11 + movq %r12,32(%rdi) + + + mulq %rbp + addq %rax,%r15 + movq %rbx,%rax + adcq $0,%rdx + movq %r13,40(%rdi) + movq %rdx,%r8 + + mulq %rsi + addq %rax,%rcx + movq %rbx,%rax + adcq $0,%rdx + 
addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%rbx + + mulq %rax + xorq %r12,%r12 + addq %rax,%r11 + movq %rbp,%rax + addq %r14,%r14 + adcq %r15,%r15 + adcq $0,%r12 + addq %r11,%r14 + adcq %rdx,%r15 + movq %r14,48(%rdi) + adcq $0,%r12 + movq %r15,56(%rdi) + + + mulq %rsi + addq %rax,%rbx + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq %rax + xorq %r13,%r13 + addq %rax,%r12 + movq %rsi,%rax + addq %rcx,%rcx + adcq %rbx,%rbx + adcq $0,%r13 + addq %r12,%rcx + adcq %rdx,%rbx + movq %rcx,64(%rdi) + adcq $0,%r13 + movq %rbx,72(%rdi) + + + mulq %rax + addq %r13,%rax + addq %rbp,%rbp + adcq $0,%rdx + addq %rbp,%rax + adcq $0,%rdx + movq %rax,80(%rdi) + movq %rdx,88(%rdi) + + .byte 0xf3,0xc3 + + +.globl sqr_mont_384 + +.def sqr_mont_384; .scl 2; .type 32; .endef +.p2align 5 +sqr_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqr_mont_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $120,%rsp + +.LSEH_body_sqr_mont_384: + + + movq %rcx,96(%rsp) + movq %rdx,104(%rsp) + movq %rdi,112(%rsp) + + movq %rsp,%rdi + call __sqrq_384 + + leaq 0(%rsp),%rsi + movq 96(%rsp),%rcx + movq 104(%rsp),%rbx + movq 112(%rsp),%rdi + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + leaq 120(%rsp),%r8 + movq 120(%rsp),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_sqr_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqr_mont_384: + + + +.globl redc_mont_384 + +.def redc_mont_384; .scl 2; .type 32; .endef +.p2align 5 +redc_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_redc_mont_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_redc_mont_384: + + + movq %rdx,%rbx + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_redc_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_redc_mont_384: + + + + +.globl from_mont_384 + +.def from_mont_384; .scl 2; .type 32; .endef +.p2align 5 +from_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_from_mont_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_from_mont_384: + + + movq %rdx,%rbx + call __mulq_by_1_mont_384 + + + + + + movq %r15,%rcx + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_from_mont_384: 
+ mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_from_mont_384: +.def __mulq_by_1_mont_384; .scl 3; .type 32; .endef +.p2align 5 +__mulq_by_1_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rax,%r14 + imulq %rcx,%rax + movq %rax,%r8 + + mulq 0(%rbx) + addq %rax,%r14 + movq %r8,%rax + adcq %rdx,%r14 + + mulq 8(%rbx) + addq %rax,%r9 + movq %r8,%rax + adcq $0,%rdx + addq %r14,%r9 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 16(%rbx) + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + addq %r14,%r10 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 24(%rbx) + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %r9,%r15 + imulq %rcx,%r9 + addq %r14,%r11 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 32(%rbx) + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %r14,%r12 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 40(%rbx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %r14,%r13 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rbx) + addq %rax,%r15 + movq %r9,%rax + adcq %rdx,%r15 + + mulq 8(%rbx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %r15,%r10 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rbx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %r15,%r11 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rbx) + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + movq %r10,%r8 + imulq %rcx,%r10 + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 32(%rbx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 40(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %r15,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rbx) + addq %rax,%r8 + movq %r10,%rax + adcq %rdx,%r8 + + mulq 8(%rbx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rbx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r8,%r12 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 24(%rbx) + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + movq %r11,%r9 + imulq %rcx,%r11 + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %r8,%r14 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %r8,%r15 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 0(%rbx) + addq %rax,%r9 + movq %r11,%rax + adcq %rdx,%r9 + + mulq 8(%rbx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rbx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rbx) + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + movq %r12,%r10 + imulq %rcx,%r12 + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %r9,%r15 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rbx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 0(%rbx) + addq %rax,%r10 + movq %r12,%rax + adcq %rdx,%r10 + + mulq 8(%rbx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 24(%rbx) + addq %rax,%r15 + movq %r12,%rax + adcq $0,%rdx + movq %r13,%r11 + imulq %rcx,%r13 + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rbx) + addq %rax,%r8 + 
movq %r12,%rax + adcq $0,%rdx + addq %r10,%r8 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rbx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 0(%rbx) + addq %rax,%r11 + movq %r13,%rax + adcq %rdx,%r11 + + mulq 8(%rbx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 24(%rbx) + addq %rax,%r8 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rbx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r9 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rbx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + .byte 0xf3,0xc3 + + +.def __redc_tail_mont_384; .scl 3; .type 32; .endef +.p2align 5 +__redc_tail_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + + addq 48(%rsi),%r14 + movq %r14,%rax + adcq 56(%rsi),%r15 + adcq 64(%rsi),%r8 + adcq 72(%rsi),%r9 + movq %r15,%rcx + adcq 80(%rsi),%r10 + adcq 88(%rsi),%r11 + sbbq %r12,%r12 + + + + + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + .byte 0xf3,0xc3 + + +.globl sgn0_pty_mont_384 + +.def sgn0_pty_mont_384; .scl 2; .type 32; .endef +.p2align 5 +sgn0_pty_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sgn0_pty_mont_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_sgn0_pty_mont_384: + + + movq %rsi,%rbx + leaq 0(%rdi),%rsi + movq %rdx,%rcx + call __mulq_by_1_mont_384 + + xorq %rax,%rax + movq %r14,%r13 + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + notq %rax + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sgn0_pty_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sgn0_pty_mont_384: + +.globl sgn0_pty_mont_384x + +.def sgn0_pty_mont_384x; .scl 2; .type 32; .endef +.p2align 5 +sgn0_pty_mont_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sgn0_pty_mont_384x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_sgn0_pty_mont_384x: + + + movq %rsi,%rbx + leaq 48(%rdi),%rsi + movq %rdx,%rcx + call __mulq_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + leaq 0(%rdi),%rsi + xorq %rdi,%rdi + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rdi + + subq 
0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rdi + + movq %r14,0(%rsp) + notq %rdi + andq $1,%r13 + andq $2,%rdi + orq %r13,%rdi + + call __mulq_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + xorq %rax,%rax + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + movq 0(%rsp),%r12 + + notq %rax + + testq %r14,%r14 + cmovzq %rdi,%r13 + + testq %r12,%r12 + cmovnzq %rdi,%rax + + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sgn0_pty_mont_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sgn0_pty_mont_384x: +.globl mul_mont_384 + +.def mul_mont_384; .scl 2; .type 32; .endef +.p2align 5 +mul_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_mont_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + movq 40(%rsp),%r8 + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %r8 + +.LSEH_body_mul_mont_384: + + + movq 0(%rdx),%rax + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + movq %rdx,%rbx +.byte 102,72,15,110,199 + + call __mulq_mont_384 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_mul_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_mont_384: +.def __mulq_mont_384; .scl 3; .type 32; .endef +.p2align 5 +__mulq_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rax,%rdi + mulq %r14 + movq %rax,%r8 + movq %rdi,%rax + movq %rdx,%r9 + + mulq %r15 + addq %rax,%r9 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %r12 + addq %rax,%r10 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r11 + + movq %r8,%rbp + imulq 8(%rsp),%r8 + + mulq %r13 + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r13 + + mulq 40(%rsi) + addq %rax,%r13 + movq %r8,%rax + adcq $0,%rdx + xorq %r15,%r15 + movq %rdx,%r14 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r8,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r9 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r9 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r10 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r13 + movq 8(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq %rdx,%r14 + adcq $0,%r15 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r9 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 8(%rsi) + addq %rax,%r10 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r10 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq 
$0,%rdx + movq %rdx,%r8 + + movq %r9,%rbp + imulq 8(%rsp),%r9 + + mulq 24(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r12 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rsi) + addq %r8,%r14 + adcq $0,%rdx + xorq %r8,%r8 + addq %rax,%r14 + movq %r9,%rax + adcq %rdx,%r15 + adcq $0,%r8 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r9,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r10 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r14 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq %rdx,%r15 + adcq $0,%r8 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r10 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 8(%rsi) + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r11 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + movq %r10,%rbp + imulq 8(%rsp),%r10 + + mulq 24(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rsi) + addq %r9,%r15 + adcq $0,%rdx + xorq %r9,%r9 + addq %rax,%r15 + movq %r10,%rax + adcq %rdx,%r8 + adcq $0,%r9 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r10,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r15 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq %rdx,%r8 + adcq $0,%r9 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 8(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r12 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + movq %r11,%rbp + imulq 8(%rsp),%r11 + + mulq 24(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r15 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rsi) + addq %r10,%r8 + adcq $0,%rdx + xorq %r10,%r10 + addq %rax,%r8 + movq %r11,%rax + adcq %rdx,%r9 + adcq $0,%r10 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r11,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r14 + adcq $0,%rdx + addq %rax,%r14 + movq %r11,%rax + 
adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r8 + movq 32(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r8 + adcq %rdx,%r9 + adcq $0,%r10 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 8(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + movq %r12,%rbp + imulq 8(%rsp),%r12 + + mulq 24(%rsi) + addq %rax,%r15 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rsi) + addq %rax,%r8 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %r11,%r9 + adcq $0,%rdx + xorq %r11,%r11 + addq %rax,%r9 + movq %r12,%rax + adcq %rdx,%r10 + adcq $0,%r11 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r12,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r8 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r9 + movq 40(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r9 + adcq %rdx,%r10 + adcq $0,%r11 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 8(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r14 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 16(%rsi) + addq %rax,%r15 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r15 + adcq $0,%rdx + movq %rdx,%r12 + + movq %r13,%rbp + imulq 8(%rsp),%r13 + + mulq 24(%rsi) + addq %rax,%r8 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r8 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rsi) + addq %rax,%r9 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r9 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 40(%rsi) + addq %r12,%r10 + adcq $0,%rdx + xorq %r12,%r12 + addq %rax,%r10 + movq %r13,%rax + adcq %rdx,%r11 + adcq $0,%r12 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r13,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %rbp,%r9 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %rbp,%r10 + adcq %rdx,%r11 + adcq $0,%r12 + + + + +.byte 102,72,15,126,199 + subq 0(%rcx),%r14 + movq %r15,%rdx + sbbq 8(%rcx),%r15 + movq %r8,%rbx + sbbq 16(%rcx),%r8 + movq %r9,%rsi + sbbq 24(%rcx),%r9 + movq %r10,%rbp + sbbq 32(%rcx),%r10 + movq %r11,%r13 + sbbq 40(%rcx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r14 + cmovcq %rdx,%r15 + cmovcq %rbx,%r8 + movq %r14,0(%rdi) + cmovcq %rsi,%r9 + movq %r15,8(%rdi) + cmovcq %rbp,%r10 + movq %r8,16(%rdi) + cmovcq %r13,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + .byte 0xf3,0xc3 + +.globl sqr_n_mul_mont_384 
+ +.def sqr_n_mul_mont_384; .scl 2; .type 32; .endef +.p2align 5 +sqr_n_mul_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqr_n_mul_mont_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + movq 40(%rsp),%r8 + movq 48(%rsp),%r9 + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $136,%rsp + +.LSEH_body_sqr_n_mul_mont_384: + + + movq %r8,0(%rsp) + movq %rcx,8(%rsp) +.byte 102,72,15,110,199 + leaq 32(%rsp),%rdi + movq %r9,24(%rsp) + movq (%r9),%xmm2 + +.Loop_sqr_384: + movd %edx,%xmm1 + + call __sqrq_384 + + leaq 0(%rdi),%rsi + movq 0(%rsp),%rcx + movq 8(%rsp),%rbx + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + movd %xmm1,%edx + leaq 0(%rdi),%rsi + decl %edx + jnz .Loop_sqr_384 + +.byte 102,72,15,126,208 + movq %rbx,%rcx + movq 24(%rsp),%rbx + + + + + + + movq %r8,%r12 + movq %r9,%r13 + + call __mulq_mont_384 + + leaq 136(%rsp),%r8 + movq 136(%rsp),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_sqr_n_mul_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqr_n_mul_mont_384: + +.globl sqr_n_mul_mont_383 + +.def sqr_n_mul_mont_383; .scl 2; .type 32; .endef +.p2align 5 +sqr_n_mul_mont_383: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqr_n_mul_mont_383: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + movq 40(%rsp),%r8 + movq 48(%rsp),%r9 + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $136,%rsp + +.LSEH_body_sqr_n_mul_mont_383: + + + movq %r8,0(%rsp) + movq %rcx,8(%rsp) +.byte 102,72,15,110,199 + leaq 32(%rsp),%rdi + movq %r9,24(%rsp) + movq (%r9),%xmm2 + +.Loop_sqr_383: + movd %edx,%xmm1 + + call __sqrq_384 + + leaq 0(%rdi),%rsi + movq 0(%rsp),%rcx + movq 8(%rsp),%rbx + call __mulq_by_1_mont_384 + + movd %xmm1,%edx + addq 48(%rsi),%r14 + adcq 56(%rsi),%r15 + adcq 64(%rsi),%r8 + adcq 72(%rsi),%r9 + adcq 80(%rsi),%r10 + adcq 88(%rsi),%r11 + leaq 0(%rdi),%rsi + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + decl %edx + jnz .Loop_sqr_383 + +.byte 102,72,15,126,208 + movq %rbx,%rcx + movq 24(%rsp),%rbx + + + + + + + movq %r8,%r12 + movq %r9,%r13 + + call __mulq_mont_384 + + leaq 136(%rsp),%r8 + movq 136(%rsp),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_sqr_n_mul_mont_383: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqr_n_mul_mont_383: +.def __mulq_mont_383_nonred; .scl 3; .type 32; .endef +.p2align 5 +__mulq_mont_383_nonred: + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rax,%rbp + mulq %r14 + movq %rax,%r8 + movq %rbp,%rax + movq %rdx,%r9 + + mulq %r15 + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %r12 + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + movq %r8,%r15 + imulq 8(%rsp),%r8 + + mulq %r13 + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r13 + + mulq 40(%rsi) + addq %rax,%r13 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rcx) + addq %rax,%r15 + movq %r8,%rax + adcq %rdx,%r15 + + mulq 8(%rcx) + addq %rax,%r9 + movq %r8,%rax + 
adcq $0,%rdx + addq %r15,%r9 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rcx) + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + addq %r15,%r10 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rcx) + addq %r15,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq 32(%rcx) + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 40(%rcx) + addq %rax,%r13 + movq 8(%rbx),%rax + adcq $0,%rdx + addq %r15,%r13 + adcq %rdx,%r14 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq 8(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r10 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r11 + adcq $0,%rdx + movq %rdx,%r15 + + movq %r9,%r8 + imulq 8(%rsp),%r9 + + mulq 24(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 32(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 40(%rsi) + addq %r15,%r14 + adcq $0,%rdx + addq %rax,%r14 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rcx) + addq %rax,%r8 + movq %r9,%rax + adcq %rdx,%r8 + + mulq 8(%rcx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %r8,%r10 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rcx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 24(%rcx) + addq %r8,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rcx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rcx) + addq %rax,%r14 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %r8,%r14 + adcq %rdx,%r15 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 8(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r12 + adcq $0,%rdx + movq %rdx,%r8 + + movq %r10,%r9 + imulq 8(%rsp),%r10 + + mulq 24(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r14 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rsi) + addq %r8,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 0(%rcx) + addq %rax,%r9 + movq %r10,%rax + adcq %rdx,%r9 + + mulq 8(%rcx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r9,%r11 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rcx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rcx) + addq %r9,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rcx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rcx) + addq %rax,%r15 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %r9,%r15 + adcq %rdx,%r8 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 8(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + movq %r11,%r10 + imulq 8(%rsp),%r11 + + mulq 
24(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rsi) + addq %rax,%r15 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r15 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rsi) + addq %r9,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 0(%rcx) + addq %rax,%r10 + movq %r11,%rax + adcq %rdx,%r10 + + mulq 8(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r10,%r12 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 24(%rcx) + addq %r10,%r14 + adcq $0,%rdx + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rcx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rcx) + addq %rax,%r8 + movq 32(%rbx),%rax + adcq $0,%rdx + addq %r10,%r8 + adcq %rdx,%r9 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 8(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + movq %r12,%r11 + imulq 8(%rsp),%r12 + + mulq 24(%rsi) + addq %rax,%r15 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r8 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rsi) + addq %r10,%r9 + adcq $0,%rdx + addq %rax,%r9 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 0(%rcx) + addq %rax,%r11 + movq %r12,%rax + adcq %rdx,%r11 + + mulq 8(%rcx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rcx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 24(%rcx) + addq %r11,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rcx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rcx) + addq %rax,%r9 + movq 40(%rbx),%rax + adcq $0,%rdx + addq %r11,%r9 + adcq %rdx,%r10 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 8(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rsi) + addq %rax,%r15 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + movq %r13,%r12 + imulq 8(%rsp),%r13 + + mulq 24(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r9 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %r11,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 0(%rcx) + addq %rax,%r12 + movq %r13,%rax + adcq %rdx,%r12 + + mulq 8(%rcx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %r12,%r14 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 16(%rcx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r12,%r15 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 24(%rcx) + addq %r12,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rcx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %r12,%r9 + adcq $0,%rdx + movq %rdx,%r12 + 
+ mulq 40(%rcx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %r12,%r10 + adcq %rdx,%r11 + .byte 0xf3,0xc3 + +.globl sqr_mont_382x + +.def sqr_mont_382x; .scl 2; .type 32; .endef +.p2align 5 +sqr_mont_382x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqr_mont_382x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $136,%rsp + +.LSEH_body_sqr_mont_382x: + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + movq %rsi,16(%rsp) + movq %rdi,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %r8,%r14 + addq 48(%rsi),%r8 + movq %r9,%r15 + adcq 56(%rsi),%r9 + movq %r10,%rax + adcq 64(%rsi),%r10 + movq %r11,%rdx + adcq 72(%rsi),%r11 + movq %r12,%rbx + adcq 80(%rsi),%r12 + movq %r13,%rbp + adcq 88(%rsi),%r13 + + subq 48(%rsi),%r14 + sbbq 56(%rsi),%r15 + sbbq 64(%rsi),%rax + sbbq 72(%rsi),%rdx + sbbq 80(%rsi),%rbx + sbbq 88(%rsi),%rbp + sbbq %rdi,%rdi + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + movq %r14,32+48(%rsp) + movq %r15,32+56(%rsp) + movq %rax,32+64(%rsp) + movq %rdx,32+72(%rsp) + movq %rbx,32+80(%rsp) + movq %rbp,32+88(%rsp) + movq %rdi,32+96(%rsp) + + + + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rax + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + + movq 24(%rsp),%rdi + call __mulq_mont_383_nonred + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + + movq %r14,48(%rdi) + movq %r15,56(%rdi) + movq %r8,64(%rdi) + movq %r9,72(%rdi) + movq %r10,80(%rdi) + movq %r11,88(%rdi) + + leaq 32(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rax + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%r12 + movq 32+24(%rsp),%r13 + + call __mulq_mont_383_nonred + movq 32+96(%rsp),%rsi + movq 32+0(%rsp),%r12 + movq 32+8(%rsp),%r13 + andq %rsi,%r12 + movq 32+16(%rsp),%rax + andq %rsi,%r13 + movq 32+24(%rsp),%rbx + andq %rsi,%rax + movq 32+32(%rsp),%rbp + andq %rsi,%rbx + andq %rsi,%rbp + andq 32+40(%rsp),%rsi + + subq %r12,%r14 + movq 0(%rcx),%r12 + sbbq %r13,%r15 + movq 8(%rcx),%r13 + sbbq %rax,%r8 + movq 16(%rcx),%rax + sbbq %rbx,%r9 + movq 24(%rcx),%rbx + sbbq %rbp,%r10 + movq 32(%rcx),%rbp + sbbq %rsi,%r11 + sbbq %rsi,%rsi + + andq %rsi,%r12 + andq %rsi,%r13 + andq %rsi,%rax + andq %rsi,%rbx + andq %rsi,%rbp + andq 40(%rcx),%rsi + + addq %r12,%r14 + adcq %r13,%r15 + adcq %rax,%r8 + adcq %rbx,%r9 + adcq %rbp,%r10 + adcq %rsi,%r11 + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_sqr_mont_382x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqr_mont_382x: +.section .pdata +.p2align 2 +.rva .LSEH_begin_mul_mont_384x +.rva .LSEH_body_mul_mont_384x +.rva .LSEH_info_mul_mont_384x_prologue + +.rva .LSEH_body_mul_mont_384x +.rva .LSEH_epilogue_mul_mont_384x +.rva .LSEH_info_mul_mont_384x_body + +.rva .LSEH_epilogue_mul_mont_384x +.rva .LSEH_end_mul_mont_384x +.rva .LSEH_info_mul_mont_384x_epilogue + +.rva .LSEH_begin_sqr_mont_384x +.rva .LSEH_body_sqr_mont_384x +.rva 
.LSEH_info_sqr_mont_384x_prologue + +.rva .LSEH_body_sqr_mont_384x +.rva .LSEH_epilogue_sqr_mont_384x +.rva .LSEH_info_sqr_mont_384x_body + +.rva .LSEH_epilogue_sqr_mont_384x +.rva .LSEH_end_sqr_mont_384x +.rva .LSEH_info_sqr_mont_384x_epilogue + +.rva .LSEH_begin_mul_382x +.rva .LSEH_body_mul_382x +.rva .LSEH_info_mul_382x_prologue + +.rva .LSEH_body_mul_382x +.rva .LSEH_epilogue_mul_382x +.rva .LSEH_info_mul_382x_body + +.rva .LSEH_epilogue_mul_382x +.rva .LSEH_end_mul_382x +.rva .LSEH_info_mul_382x_epilogue + +.rva .LSEH_begin_sqr_382x +.rva .LSEH_body_sqr_382x +.rva .LSEH_info_sqr_382x_prologue + +.rva .LSEH_body_sqr_382x +.rva .LSEH_epilogue_sqr_382x +.rva .LSEH_info_sqr_382x_body + +.rva .LSEH_epilogue_sqr_382x +.rva .LSEH_end_sqr_382x +.rva .LSEH_info_sqr_382x_epilogue + +.rva .LSEH_begin_mul_384 +.rva .LSEH_body_mul_384 +.rva .LSEH_info_mul_384_prologue + +.rva .LSEH_body_mul_384 +.rva .LSEH_epilogue_mul_384 +.rva .LSEH_info_mul_384_body + +.rva .LSEH_epilogue_mul_384 +.rva .LSEH_end_mul_384 +.rva .LSEH_info_mul_384_epilogue + +.rva .LSEH_begin_sqr_384 +.rva .LSEH_body_sqr_384 +.rva .LSEH_info_sqr_384_prologue + +.rva .LSEH_body_sqr_384 +.rva .LSEH_epilogue_sqr_384 +.rva .LSEH_info_sqr_384_body + +.rva .LSEH_epilogue_sqr_384 +.rva .LSEH_end_sqr_384 +.rva .LSEH_info_sqr_384_epilogue + +.rva .LSEH_begin_sqr_mont_384 +.rva .LSEH_body_sqr_mont_384 +.rva .LSEH_info_sqr_mont_384_prologue + +.rva .LSEH_body_sqr_mont_384 +.rva .LSEH_epilogue_sqr_mont_384 +.rva .LSEH_info_sqr_mont_384_body + +.rva .LSEH_epilogue_sqr_mont_384 +.rva .LSEH_end_sqr_mont_384 +.rva .LSEH_info_sqr_mont_384_epilogue + +.rva .LSEH_begin_redc_mont_384 +.rva .LSEH_body_redc_mont_384 +.rva .LSEH_info_redc_mont_384_prologue + +.rva .LSEH_body_redc_mont_384 +.rva .LSEH_epilogue_redc_mont_384 +.rva .LSEH_info_redc_mont_384_body + +.rva .LSEH_epilogue_redc_mont_384 +.rva .LSEH_end_redc_mont_384 +.rva .LSEH_info_redc_mont_384_epilogue + +.rva .LSEH_begin_from_mont_384 +.rva .LSEH_body_from_mont_384 +.rva .LSEH_info_from_mont_384_prologue + +.rva .LSEH_body_from_mont_384 +.rva .LSEH_epilogue_from_mont_384 +.rva .LSEH_info_from_mont_384_body + +.rva .LSEH_epilogue_from_mont_384 +.rva .LSEH_end_from_mont_384 +.rva .LSEH_info_from_mont_384_epilogue + +.rva .LSEH_begin_sgn0_pty_mont_384 +.rva .LSEH_body_sgn0_pty_mont_384 +.rva .LSEH_info_sgn0_pty_mont_384_prologue + +.rva .LSEH_body_sgn0_pty_mont_384 +.rva .LSEH_epilogue_sgn0_pty_mont_384 +.rva .LSEH_info_sgn0_pty_mont_384_body + +.rva .LSEH_epilogue_sgn0_pty_mont_384 +.rva .LSEH_end_sgn0_pty_mont_384 +.rva .LSEH_info_sgn0_pty_mont_384_epilogue + +.rva .LSEH_begin_sgn0_pty_mont_384x +.rva .LSEH_body_sgn0_pty_mont_384x +.rva .LSEH_info_sgn0_pty_mont_384x_prologue + +.rva .LSEH_body_sgn0_pty_mont_384x +.rva .LSEH_epilogue_sgn0_pty_mont_384x +.rva .LSEH_info_sgn0_pty_mont_384x_body + +.rva .LSEH_epilogue_sgn0_pty_mont_384x +.rva .LSEH_end_sgn0_pty_mont_384x +.rva .LSEH_info_sgn0_pty_mont_384x_epilogue + +.rva .LSEH_begin_mul_mont_384 +.rva .LSEH_body_mul_mont_384 +.rva .LSEH_info_mul_mont_384_prologue + +.rva .LSEH_body_mul_mont_384 +.rva .LSEH_epilogue_mul_mont_384 +.rva .LSEH_info_mul_mont_384_body + +.rva .LSEH_epilogue_mul_mont_384 +.rva .LSEH_end_mul_mont_384 +.rva .LSEH_info_mul_mont_384_epilogue + +.rva .LSEH_begin_sqr_n_mul_mont_384 +.rva .LSEH_body_sqr_n_mul_mont_384 +.rva .LSEH_info_sqr_n_mul_mont_384_prologue + +.rva .LSEH_body_sqr_n_mul_mont_384 +.rva .LSEH_epilogue_sqr_n_mul_mont_384 +.rva .LSEH_info_sqr_n_mul_mont_384_body + +.rva .LSEH_epilogue_sqr_n_mul_mont_384 
+.rva .LSEH_end_sqr_n_mul_mont_384 +.rva .LSEH_info_sqr_n_mul_mont_384_epilogue + +.rva .LSEH_begin_sqr_n_mul_mont_383 +.rva .LSEH_body_sqr_n_mul_mont_383 +.rva .LSEH_info_sqr_n_mul_mont_383_prologue + +.rva .LSEH_body_sqr_n_mul_mont_383 +.rva .LSEH_epilogue_sqr_n_mul_mont_383 +.rva .LSEH_info_sqr_n_mul_mont_383_body + +.rva .LSEH_epilogue_sqr_n_mul_mont_383 +.rva .LSEH_end_sqr_n_mul_mont_383 +.rva .LSEH_info_sqr_n_mul_mont_383_epilogue + +.rva .LSEH_begin_sqr_mont_382x +.rva .LSEH_body_sqr_mont_382x +.rva .LSEH_info_sqr_mont_382x_prologue + +.rva .LSEH_body_sqr_mont_382x +.rva .LSEH_epilogue_sqr_mont_382x +.rva .LSEH_info_sqr_mont_382x_body + +.rva .LSEH_epilogue_sqr_mont_382x +.rva .LSEH_end_sqr_mont_382x +.rva .LSEH_info_sqr_mont_382x_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_mul_mont_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mul_mont_384x_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x29,0x00 +.byte 0x00,0xe4,0x2a,0x00 +.byte 0x00,0xd4,0x2b,0x00 +.byte 0x00,0xc4,0x2c,0x00 +.byte 0x00,0x34,0x2d,0x00 +.byte 0x00,0x54,0x2e,0x00 +.byte 0x00,0x74,0x30,0x00 +.byte 0x00,0x64,0x31,0x00 +.byte 0x00,0x01,0x2f,0x00 +.LSEH_info_mul_mont_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqr_mont_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqr_mont_384x_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x11,0x00 +.byte 0x00,0xe4,0x12,0x00 +.byte 0x00,0xd4,0x13,0x00 +.byte 0x00,0xc4,0x14,0x00 +.byte 0x00,0x34,0x15,0x00 +.byte 0x00,0x54,0x16,0x00 +.byte 0x00,0x74,0x18,0x00 +.byte 0x00,0x64,0x19,0x00 +.byte 0x00,0x01,0x17,0x00 +.LSEH_info_sqr_mont_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mul_382x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mul_382x_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x11,0x00 +.byte 0x00,0xe4,0x12,0x00 +.byte 0x00,0xd4,0x13,0x00 +.byte 0x00,0xc4,0x14,0x00 +.byte 0x00,0x34,0x15,0x00 +.byte 0x00,0x54,0x16,0x00 +.byte 0x00,0x74,0x18,0x00 +.byte 0x00,0x64,0x19,0x00 +.byte 0x00,0x01,0x17,0x00 +.LSEH_info_mul_382x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqr_382x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqr_382x_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_sqr_382x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mul_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mul_384_body: +.byte 1,0,11,0 +.byte 0x00,0xc4,0x00,0x00 +.byte 0x00,0x34,0x01,0x00 +.byte 0x00,0x54,0x02,0x00 +.byte 0x00,0x74,0x04,0x00 +.byte 0x00,0x64,0x05,0x00 +.byte 0x00,0x22 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.LSEH_info_mul_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqr_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqr_384_body: +.byte 1,0,17,0 +.byte 
0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_sqr_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqr_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqr_mont_384_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x0f,0x00 +.byte 0x00,0xe4,0x10,0x00 +.byte 0x00,0xd4,0x11,0x00 +.byte 0x00,0xc4,0x12,0x00 +.byte 0x00,0x34,0x13,0x00 +.byte 0x00,0x54,0x14,0x00 +.byte 0x00,0x74,0x16,0x00 +.byte 0x00,0x64,0x17,0x00 +.byte 0x00,0x01,0x15,0x00 +.LSEH_info_sqr_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_redc_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_redc_mont_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_redc_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_from_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_from_mont_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_from_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sgn0_pty_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sgn0_pty_mont_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_sgn0_pty_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sgn0_pty_mont_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sgn0_pty_mont_384x_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_sgn0_pty_mont_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mul_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mul_mont_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 
+.LSEH_info_mul_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqr_n_mul_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqr_n_mul_mont_384_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x11,0x00 +.byte 0x00,0xe4,0x12,0x00 +.byte 0x00,0xd4,0x13,0x00 +.byte 0x00,0xc4,0x14,0x00 +.byte 0x00,0x34,0x15,0x00 +.byte 0x00,0x54,0x16,0x00 +.byte 0x00,0x74,0x18,0x00 +.byte 0x00,0x64,0x19,0x00 +.byte 0x00,0x01,0x17,0x00 +.LSEH_info_sqr_n_mul_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqr_n_mul_mont_383_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqr_n_mul_mont_383_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x11,0x00 +.byte 0x00,0xe4,0x12,0x00 +.byte 0x00,0xd4,0x13,0x00 +.byte 0x00,0xc4,0x14,0x00 +.byte 0x00,0x34,0x15,0x00 +.byte 0x00,0x54,0x16,0x00 +.byte 0x00,0x74,0x18,0x00 +.byte 0x00,0x64,0x19,0x00 +.byte 0x00,0x01,0x17,0x00 +.LSEH_info_sqr_n_mul_mont_383_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqr_mont_382x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqr_mont_382x_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x11,0x00 +.byte 0x00,0xe4,0x12,0x00 +.byte 0x00,0xd4,0x13,0x00 +.byte 0x00,0xc4,0x14,0x00 +.byte 0x00,0x34,0x15,0x00 +.byte 0x00,0x54,0x16,0x00 +.byte 0x00,0x74,0x18,0x00 +.byte 0x00,0x64,0x19,0x00 +.byte 0x00,0x01,0x17,0x00 +.LSEH_info_sqr_mont_382x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + diff --git a/build/coff/mulx_mont_256-x86_64.s b/build/coff/mulx_mont_256-x86_64.s new file mode 100644 index 00000000..75c7e82b --- /dev/null +++ b/build/coff/mulx_mont_256-x86_64.s @@ -0,0 +1,784 @@ +.text + +.globl mulx_mont_sparse_256 + +.def mulx_mont_sparse_256; .scl 2; .type 32; .endef +.p2align 5 +mulx_mont_sparse_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mulx_mont_sparse_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + movq 40(%rsp),%r8 + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_mulx_mont_sparse_256: + + + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rbp + movq 24(%rsi),%r9 + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%rax,%r11 + call __mulx_mont_sparse_256 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_mulx_mont_sparse_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mulx_mont_sparse_256: + +.globl sqrx_mont_sparse_256 + +.def sqrx_mont_sparse_256; .scl 2; .type 32; .endef +.p2align 5 +sqrx_mont_sparse_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqrx_mont_sparse_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_sqrx_mont_sparse_256: + + + movq %rsi,%rbx + movq %rcx,%r8 + movq %rdx,%rcx + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rbp + 
movq 24(%rsi),%r9 + leaq -128(%rbx),%rsi + leaq -128(%rcx),%rcx + + mulxq %rdx,%rax,%r11 + call __mulx_mont_sparse_256 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sqrx_mont_sparse_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqrx_mont_sparse_256: +.def __mulx_mont_sparse_256; .scl 3; .type 32; .endef +.p2align 5 +__mulx_mont_sparse_256: + .byte 0xf3,0x0f,0x1e,0xfa + + mulxq %r15,%r15,%r12 + mulxq %rbp,%rbp,%r13 + addq %r15,%r11 + mulxq %r9,%r9,%r14 + movq 8(%rbx),%rdx + adcq %rbp,%r12 + adcq %r9,%r13 + adcq $0,%r14 + + movq %rax,%r10 + imulq %r8,%rax + + + xorq %r15,%r15 + mulxq 0+128(%rsi),%rbp,%r9 + adoxq %rbp,%r11 + adcxq %r9,%r12 + + mulxq 8+128(%rsi),%rbp,%r9 + adoxq %rbp,%r12 + adcxq %r9,%r13 + + mulxq 16+128(%rsi),%rbp,%r9 + adoxq %rbp,%r13 + adcxq %r9,%r14 + + mulxq 24+128(%rsi),%rbp,%r9 + movq %rax,%rdx + adoxq %rbp,%r14 + adcxq %r15,%r9 + adoxq %r9,%r15 + + + mulxq 0+128(%rcx),%rbp,%rax + adcxq %rbp,%r10 + adoxq %r11,%rax + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%rax + adoxq %r9,%r12 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r12 + adoxq %r9,%r13 + + mulxq 24+128(%rcx),%rbp,%r9 + movq 16(%rbx),%rdx + adcxq %rbp,%r13 + adoxq %r9,%r14 + adcxq %r10,%r14 + adoxq %r10,%r15 + adcxq %r10,%r15 + adoxq %r10,%r10 + adcq $0,%r10 + movq %rax,%r11 + imulq %r8,%rax + + + xorq %rbp,%rbp + mulxq 0+128(%rsi),%rbp,%r9 + adoxq %rbp,%r12 + adcxq %r9,%r13 + + mulxq 8+128(%rsi),%rbp,%r9 + adoxq %rbp,%r13 + adcxq %r9,%r14 + + mulxq 16+128(%rsi),%rbp,%r9 + adoxq %rbp,%r14 + adcxq %r9,%r15 + + mulxq 24+128(%rsi),%rbp,%r9 + movq %rax,%rdx + adoxq %rbp,%r15 + adcxq %r10,%r9 + adoxq %r9,%r10 + + + mulxq 0+128(%rcx),%rbp,%rax + adcxq %rbp,%r11 + adoxq %r12,%rax + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%rax + adoxq %r9,%r13 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r13 + adoxq %r9,%r14 + + mulxq 24+128(%rcx),%rbp,%r9 + movq 24(%rbx),%rdx + adcxq %rbp,%r14 + adoxq %r9,%r15 + adcxq %r11,%r15 + adoxq %r11,%r10 + adcxq %r11,%r10 + adoxq %r11,%r11 + adcq $0,%r11 + movq %rax,%r12 + imulq %r8,%rax + + + xorq %rbp,%rbp + mulxq 0+128(%rsi),%rbp,%r9 + adoxq %rbp,%r13 + adcxq %r9,%r14 + + mulxq 8+128(%rsi),%rbp,%r9 + adoxq %rbp,%r14 + adcxq %r9,%r15 + + mulxq 16+128(%rsi),%rbp,%r9 + adoxq %rbp,%r15 + adcxq %r9,%r10 + + mulxq 24+128(%rsi),%rbp,%r9 + movq %rax,%rdx + adoxq %rbp,%r10 + adcxq %r11,%r9 + adoxq %r9,%r11 + + + mulxq 0+128(%rcx),%rbp,%rax + adcxq %rbp,%r12 + adoxq %r13,%rax + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%rax + adoxq %r9,%r14 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r14 + adoxq %r9,%r15 + + mulxq 24+128(%rcx),%rbp,%r9 + movq %rax,%rdx + adcxq %rbp,%r15 + adoxq %r9,%r10 + adcxq %r12,%r10 + adoxq %r12,%r11 + adcxq %r12,%r11 + adoxq %r12,%r12 + adcq $0,%r12 + imulq %r8,%rdx + + + xorq %rbp,%rbp + mulxq 0+128(%rcx),%r13,%r9 + adcxq %rax,%r13 + adoxq %r9,%r14 + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%r14 + adoxq %r9,%r15 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r15 + adoxq %r9,%r10 + + mulxq 24+128(%rcx),%rbp,%r9 + movq %r14,%rdx + leaq 128(%rcx),%rcx + adcxq %rbp,%r10 + adoxq %r9,%r11 + movq %r15,%rax + adcxq %r13,%r11 + adoxq %r13,%r12 + adcq $0,%r12 + + + + + movq %r10,%rbp + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + sbbq 16(%rcx),%r10 + movq %r11,%r9 + sbbq 24(%rcx),%r11 + sbbq $0,%r12 + + cmovcq %rdx,%r14 + cmovcq %rax,%r15 + cmovcq %rbp,%r10 + movq %r14,0(%rdi) + cmovcq %r9,%r11 + movq %r15,8(%rdi) + movq 
%r10,16(%rdi) + movq %r11,24(%rdi) + + .byte 0xf3,0xc3 + +.globl fromx_mont_256 + +.def fromx_mont_256; .scl 2; .type 32; .endef +.p2align 5 +fromx_mont_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_fromx_mont_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_fromx_mont_256: + + + movq %rdx,%rbx + call __mulx_by_1_mont_256 + + + + + + movq %r15,%rdx + movq %r10,%r12 + movq %r11,%r13 + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r10 + sbbq 24(%rbx),%r11 + + cmovncq %r14,%rax + cmovncq %r15,%rdx + cmovncq %r10,%r12 + movq %rax,0(%rdi) + cmovncq %r11,%r13 + movq %rdx,8(%rdi) + movq %r12,16(%rdi) + movq %r13,24(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_fromx_mont_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_fromx_mont_256: + +.globl redcx_mont_256 + +.def redcx_mont_256; .scl 2; .type 32; .endef +.p2align 5 +redcx_mont_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_redcx_mont_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_redcx_mont_256: + + + movq %rdx,%rbx + call __mulx_by_1_mont_256 + + addq 32(%rsi),%r14 + adcq 40(%rsi),%r15 + movq %r14,%rax + adcq 48(%rsi),%r10 + movq %r15,%rdx + adcq 56(%rsi),%r11 + sbbq %rsi,%rsi + + + + + movq %r10,%r12 + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r10 + movq %r11,%r13 + sbbq 24(%rbx),%r11 + sbbq $0,%rsi + + cmovncq %r14,%rax + cmovncq %r15,%rdx + cmovncq %r10,%r12 + movq %rax,0(%rdi) + cmovncq %r11,%r13 + movq %rdx,8(%rdi) + movq %r12,16(%rdi) + movq %r13,24(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_redcx_mont_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_redcx_mont_256: +.def __mulx_by_1_mont_256; .scl 3; .type 32; .endef +.p2align 5 +__mulx_by_1_mont_256: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r11 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + + movq %rax,%r14 + imulq %rcx,%rax + movq %rax,%r10 + + mulq 0(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq %rdx,%r14 + + mulq 8(%rbx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r11 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 16(%rbx) + movq %r11,%r15 + imulq %rcx,%r11 + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r12 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 24(%rbx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r14,%r13 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq %rdx,%r15 + + mulq 8(%rbx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rbx) + movq %r12,%r10 + imulq %rcx,%r12 + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r15,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rbx) + addq %rax,%r10 + movq %r12,%rax + adcq %rdx,%r10 + + mulq 8(%rbx) + 
addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rbx) + movq %r13,%r11 + imulq %rcx,%r13 + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 24(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 0(%rbx) + addq %rax,%r11 + movq %r13,%rax + adcq %rdx,%r11 + + mulq 8(%rbx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 24(%rbx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + .byte 0xf3,0xc3 + +.section .pdata +.p2align 2 +.rva .LSEH_begin_mulx_mont_sparse_256 +.rva .LSEH_body_mulx_mont_sparse_256 +.rva .LSEH_info_mulx_mont_sparse_256_prologue + +.rva .LSEH_body_mulx_mont_sparse_256 +.rva .LSEH_epilogue_mulx_mont_sparse_256 +.rva .LSEH_info_mulx_mont_sparse_256_body + +.rva .LSEH_epilogue_mulx_mont_sparse_256 +.rva .LSEH_end_mulx_mont_sparse_256 +.rva .LSEH_info_mulx_mont_sparse_256_epilogue + +.rva .LSEH_begin_sqrx_mont_sparse_256 +.rva .LSEH_body_sqrx_mont_sparse_256 +.rva .LSEH_info_sqrx_mont_sparse_256_prologue + +.rva .LSEH_body_sqrx_mont_sparse_256 +.rva .LSEH_epilogue_sqrx_mont_sparse_256 +.rva .LSEH_info_sqrx_mont_sparse_256_body + +.rva .LSEH_epilogue_sqrx_mont_sparse_256 +.rva .LSEH_end_sqrx_mont_sparse_256 +.rva .LSEH_info_sqrx_mont_sparse_256_epilogue + +.rva .LSEH_begin_fromx_mont_256 +.rva .LSEH_body_fromx_mont_256 +.rva .LSEH_info_fromx_mont_256_prologue + +.rva .LSEH_body_fromx_mont_256 +.rva .LSEH_epilogue_fromx_mont_256 +.rva .LSEH_info_fromx_mont_256_body + +.rva .LSEH_epilogue_fromx_mont_256 +.rva .LSEH_end_fromx_mont_256 +.rva .LSEH_info_fromx_mont_256_epilogue + +.rva .LSEH_begin_redcx_mont_256 +.rva .LSEH_body_redcx_mont_256 +.rva .LSEH_info_redcx_mont_256_prologue + +.rva .LSEH_body_redcx_mont_256 +.rva .LSEH_epilogue_redcx_mont_256 +.rva .LSEH_info_redcx_mont_256_body + +.rva .LSEH_epilogue_redcx_mont_256 +.rva .LSEH_end_redcx_mont_256 +.rva .LSEH_info_redcx_mont_256_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_mulx_mont_sparse_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mulx_mont_sparse_256_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_mulx_mont_sparse_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqrx_mont_sparse_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqrx_mont_sparse_256_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_sqrx_mont_sparse_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_fromx_mont_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_fromx_mont_256_body: 
+.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_fromx_mont_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_redcx_mont_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_redcx_mont_256_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_redcx_mont_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + diff --git a/build/coff/mulx_mont_384-x86_64.s b/build/coff/mulx_mont_384-x86_64.s new file mode 100644 index 00000000..defd0bdd --- /dev/null +++ b/build/coff/mulx_mont_384-x86_64.s @@ -0,0 +1,3560 @@ +.text + + + + + + + +.def __sub_mod_384x384; .scl 3; .type 32; .endef +.p2align 5 +__sub_mod_384x384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + subq 0(%rdx),%r8 + movq 56(%rsi),%r15 + sbbq 8(%rdx),%r9 + movq 64(%rsi),%rax + sbbq 16(%rdx),%r10 + movq 72(%rsi),%rbx + sbbq 24(%rdx),%r11 + movq 80(%rsi),%rbp + sbbq 32(%rdx),%r12 + movq 88(%rsi),%rsi + sbbq 40(%rdx),%r13 + movq %r8,0(%rdi) + sbbq 48(%rdx),%r14 + movq 0(%rcx),%r8 + movq %r9,8(%rdi) + sbbq 56(%rdx),%r15 + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + sbbq 64(%rdx),%rax + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + sbbq 72(%rdx),%rbx + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + sbbq 80(%rdx),%rbp + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + sbbq 88(%rdx),%rsi + movq 40(%rcx),%r13 + sbbq %rdx,%rdx + + andq %rdx,%r8 + andq %rdx,%r9 + andq %rdx,%r10 + andq %rdx,%r11 + andq %rdx,%r12 + andq %rdx,%r13 + + addq %r8,%r14 + adcq %r9,%r15 + movq %r14,48(%rdi) + adcq %r10,%rax + movq %r15,56(%rdi) + adcq %r11,%rbx + movq %rax,64(%rdi) + adcq %r12,%rbp + movq %rbx,72(%rdi) + adcq %r13,%rsi + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 + + +.def __add_mod_384; .scl 3; .type 32; .endef +.p2align 5 +__add_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + movq %r8,%r14 + adcq 24(%rdx),%r11 + movq %r9,%r15 + adcq 32(%rdx),%r12 + movq %r10,%rax + adcq 40(%rdx),%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,0(%rdi) + cmovcq %rbx,%r11 + movq %r9,8(%rdi) + cmovcq %rbp,%r12 + movq %r10,16(%rdi) + cmovcq %rsi,%r13 + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 + + +.def __sub_mod_384; .scl 3; .type 32; .endef +.p2align 5 +__sub_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +__sub_mod_384_a_is_loaded: + subq 0(%rdx),%r8 + movq 
0(%rcx),%r14 + sbbq 8(%rdx),%r9 + movq 8(%rcx),%r15 + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rax + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbx + sbbq 32(%rdx),%r12 + movq 32(%rcx),%rbp + sbbq 40(%rdx),%r13 + movq 40(%rcx),%rsi + sbbq %rdx,%rdx + + andq %rdx,%r14 + andq %rdx,%r15 + andq %rdx,%rax + andq %rdx,%rbx + andq %rdx,%rbp + andq %rdx,%rsi + + addq %r14,%r8 + adcq %r15,%r9 + movq %r8,0(%rdi) + adcq %rax,%r10 + movq %r9,8(%rdi) + adcq %rbx,%r11 + movq %r10,16(%rdi) + adcq %rbp,%r12 + movq %r11,24(%rdi) + adcq %rsi,%r13 + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 + +.globl mulx_mont_384x + +.def mulx_mont_384x; .scl 2; .type 32; .endef +.p2align 5 +mulx_mont_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mulx_mont_384x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + movq 40(%rsp),%r8 + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $328,%rsp + +.LSEH_body_mulx_mont_384x: + + + movq %rdx,%rbx + movq %rdi,32(%rsp) + movq %rsi,24(%rsp) + movq %rdx,16(%rsp) + movq %rcx,8(%rsp) + movq %r8,0(%rsp) + + + + + leaq 40(%rsp),%rdi + call __mulx_384 + + + leaq 48(%rbx),%rbx + leaq 128+48(%rsi),%rsi + leaq 96(%rdi),%rdi + call __mulx_384 + + + movq 8(%rsp),%rcx + leaq (%rbx),%rsi + leaq -48(%rbx),%rdx + leaq 40+192+48(%rsp),%rdi + call __add_mod_384 + + movq 24(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq -48(%rdi),%rdi + call __add_mod_384 + + leaq (%rdi),%rbx + leaq 48(%rdi),%rsi + call __mulx_384 + + + leaq (%rdi),%rsi + leaq 40(%rsp),%rdx + movq 8(%rsp),%rcx + call __sub_mod_384x384 + + leaq (%rdi),%rsi + leaq -96(%rdi),%rdx + call __sub_mod_384x384 + + + leaq 40(%rsp),%rsi + leaq 40+96(%rsp),%rdx + leaq 40(%rsp),%rdi + call __sub_mod_384x384 + + leaq (%rcx),%rbx + + + leaq 40(%rsp),%rsi + movq 0(%rsp),%rcx + movq 32(%rsp),%rdi + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + + leaq 40+192(%rsp),%rsi + movq 0(%rsp),%rcx + leaq 48(%rdi),%rdi + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + leaq 328(%rsp),%r8 + movq 0(%r8),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_mulx_mont_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mulx_mont_384x: +.globl sqrx_mont_384x + +.def sqrx_mont_384x; .scl 2; .type 32; .endef +.p2align 5 +sqrx_mont_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqrx_mont_384x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $136,%rsp + +.LSEH_body_sqrx_mont_384x: + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + + movq %rsi,16(%rsp) +.byte 102,72,15,110,199 + + + leaq 48(%rsi),%rdx + leaq 32(%rsp),%rdi + call __add_mod_384 + + + movq 16(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq 32+48(%rsp),%rdi + call __sub_mod_384 + + + movq 16(%rsp),%rsi + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + addq %rdx,%rdx + adcq %r15,%r15 + adcq %rax,%rax + movq %rdx,%r8 + adcq %r12,%r12 + movq %r15,%r9 + adcq %rdi,%rdi + movq %rax,%r10 + adcq %rbp,%rbp + movq %r12,%r11 + sbbq %rsi,%rsi + + subq 0(%rcx),%rdx + sbbq 8(%rcx),%r15 + movq 
%rdi,%r13 + sbbq 16(%rcx),%rax + sbbq 24(%rcx),%r12 + sbbq 32(%rcx),%rdi + movq %rbp,%r14 + sbbq 40(%rcx),%rbp + sbbq $0,%rsi + + cmovcq %r8,%rdx + cmovcq %r9,%r15 + cmovcq %r10,%rax + movq %rdx,48(%rbx) + cmovcq %r11,%r12 + movq %r15,56(%rbx) + cmovcq %r13,%rdi + movq %rax,64(%rbx) + cmovcq %r14,%rbp + movq %r12,72(%rbx) + movq %rdi,80(%rbx) + movq %rbp,88(%rbx) + + leaq 32(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rdx + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%rax + movq 32+24(%rsp),%r12 + movq 32+32(%rsp),%rdi + movq 32+40(%rsp),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_sqrx_mont_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqrx_mont_384x: + +.globl mulx_382x + +.def mulx_382x; .scl 2; .type 32; .endef +.p2align 5 +mulx_382x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mulx_382x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $136,%rsp + +.LSEH_body_mulx_382x: + + + leaq 96(%rdi),%rdi + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + movq %rdi,16(%rsp) + movq %rcx,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 48(%rsi),%r8 + adcq 56(%rsi),%r9 + adcq 64(%rsi),%r10 + adcq 72(%rsi),%r11 + adcq 80(%rsi),%r12 + adcq 88(%rsi),%r13 + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + movq 32(%rdx),%r12 + movq 40(%rdx),%r13 + + addq 48(%rdx),%r8 + adcq 56(%rdx),%r9 + adcq 64(%rdx),%r10 + adcq 72(%rdx),%r11 + adcq 80(%rdx),%r12 + adcq 88(%rdx),%r13 + + movq %r8,32+48(%rsp) + movq %r9,32+56(%rsp) + movq %r10,32+64(%rsp) + movq %r11,32+72(%rsp) + movq %r12,32+80(%rsp) + movq %r13,32+88(%rsp) + + + leaq 32+0(%rsp),%rsi + leaq 32+48(%rsp),%rbx + call __mulx_384 + + + movq 0(%rsp),%rsi + movq 8(%rsp),%rbx + leaq -96(%rdi),%rdi + call __mulx_384 + + + leaq 48+128(%rsi),%rsi + leaq 48(%rbx),%rbx + leaq 32(%rsp),%rdi + call __mulx_384 + + + movq 16(%rsp),%rsi + leaq 32(%rsp),%rdx + movq 24(%rsp),%rcx + movq %rsi,%rdi + call __sub_mod_384x384 + + + leaq 0(%rdi),%rsi + leaq -96(%rdi),%rdx + call __sub_mod_384x384 + + + leaq -96(%rdi),%rsi + leaq 32(%rsp),%rdx + leaq -96(%rdi),%rdi + call __sub_mod_384x384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_mulx_382x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mulx_382x: +.globl sqrx_382x + +.def sqrx_382x; .scl 2; .type 32; .endef +.p2align 5 +sqrx_382x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqrx_382x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rsi + +.LSEH_body_sqrx_382x: + + + movq %rdx,%rcx + + + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%rbx + 
movq 32(%rsi),%rbp + movq 40(%rsi),%rdx + + movq %r14,%r8 + addq 48(%rsi),%r14 + movq %r15,%r9 + adcq 56(%rsi),%r15 + movq %rax,%r10 + adcq 64(%rsi),%rax + movq %rbx,%r11 + adcq 72(%rsi),%rbx + movq %rbp,%r12 + adcq 80(%rsi),%rbp + movq %rdx,%r13 + adcq 88(%rsi),%rdx + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %rax,16(%rdi) + movq %rbx,24(%rdi) + movq %rbp,32(%rdi) + movq %rdx,40(%rdi) + + + leaq 48(%rsi),%rdx + leaq 48(%rdi),%rdi + call __sub_mod_384_a_is_loaded + + + leaq (%rdi),%rsi + leaq -48(%rdi),%rbx + leaq -48(%rdi),%rdi + call __mulx_384 + + + movq (%rsp),%rsi + leaq 48(%rsi),%rbx + leaq 96(%rdi),%rdi + call __mulx_384 + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq 40(%rdi),%r13 + movq 48(%rdi),%r14 + movq 56(%rdi),%r15 + movq 64(%rdi),%rax + movq 72(%rdi),%rbx + movq 80(%rdi),%rbp + addq %r8,%r8 + movq 88(%rdi),%rdx + adcq %r9,%r9 + movq %r8,0(%rdi) + adcq %r10,%r10 + movq %r9,8(%rdi) + adcq %r11,%r11 + movq %r10,16(%rdi) + adcq %r12,%r12 + movq %r11,24(%rdi) + adcq %r13,%r13 + movq %r12,32(%rdi) + adcq %r14,%r14 + movq %r13,40(%rdi) + adcq %r15,%r15 + movq %r14,48(%rdi) + adcq %rax,%rax + movq %r15,56(%rdi) + adcq %rbx,%rbx + movq %rax,64(%rdi) + adcq %rbp,%rbp + movq %rbx,72(%rdi) + adcq %rdx,%rdx + movq %rbp,80(%rdi) + movq %rdx,88(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sqrx_382x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqrx_382x: +.globl mulx_384 + +.def mulx_384; .scl 2; .type 32; .endef +.p2align 5 +mulx_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mulx_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +.LSEH_body_mulx_384: + + + movq %rdx,%rbx + call __mulx_384 + + movq 0(%rsp),%r15 + + movq 8(%rsp),%r14 + + movq 16(%rsp),%r13 + + movq 24(%rsp),%r12 + + movq 32(%rsp),%rbx + + movq 40(%rsp),%rbp + + leaq 48(%rsp),%rsp + +.LSEH_epilogue_mulx_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mulx_384: + +.def __mulx_384; .scl 3; .type 32; .endef +.p2align 5 +__mulx_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rbx),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + leaq -128(%rsi),%rsi + + mulxq %r14,%r9,%rcx + xorq %rbp,%rbp + + mulxq %r15,%r8,%rax + adcxq %rcx,%r8 + movq %r9,0(%rdi) + + mulxq %r10,%r9,%rcx + adcxq %rax,%r9 + + mulxq %r11,%r10,%rax + adcxq %rcx,%r10 + + mulxq %r12,%r11,%rcx + adcxq %rax,%r11 + + mulxq %r13,%r12,%r13 + movq 8(%rbx),%rdx + adcxq %rcx,%r12 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,8(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 16(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,16(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 
+ adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 24(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,24(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 32(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,32(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 40(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,40(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq %rax,%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + movq %r8,48(%rdi) + movq %r9,56(%rdi) + movq %r10,64(%rdi) + movq %r11,72(%rdi) + movq %r12,80(%rdi) + movq %r13,88(%rdi) + + .byte 0xf3,0xc3 + +.globl sqrx_384 + +.def sqrx_384; .scl 2; .type 32; .endef +.p2align 5 +sqrx_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqrx_384: + movq %rcx,%rdi + movq %rdx,%rsi + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rdi + +.LSEH_body_sqrx_384: + + + call __sqrx_384 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sqrx_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqrx_384: +.def __sqrx_384; .scl 3; .type 32; .endef +.p2align 5 +__sqrx_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rdx + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%rcx + movq 32(%rsi),%rbx +.byte 102,72,15,110,199 + + + mulxq %r14,%r8,%rdi + movq 40(%rsi),%rbp + mulxq %r15,%r9,%rax + addq %rdi,%r9 + mulxq %rcx,%r10,%rdi + adcq %rax,%r10 + mulxq %rbx,%r11,%rax + adcq %rdi,%r11 + mulxq %rbp,%r12,%r13 + movq %r14,%rdx + adcq %rax,%r12 + adcq $0,%r13 + + + xorq %r14,%r14 + mulxq %r15,%rdi,%rax + adcxq %rdi,%r10 + adoxq %rax,%r11 + + mulxq %rcx,%rdi,%rax + adcxq %rdi,%r11 + adoxq %rax,%r12 + + mulxq %rbx,%rdi,%rax + adcxq %rdi,%r12 + adoxq %rax,%r13 + + mulxq %rbp,%rdi,%rax + movq %r15,%rdx + adcxq %rdi,%r13 + adoxq %r14,%rax + adcxq %rax,%r14 + + + xorq %r15,%r15 + mulxq %rcx,%rdi,%rax + adcxq %rdi,%r12 + adoxq %rax,%r13 + + mulxq %rbx,%rdi,%rax + adcxq %rdi,%r13 + adoxq %rax,%r14 + + mulxq %rbp,%rdi,%rax + movq %rcx,%rdx + adcxq %rdi,%r14 + adoxq %r15,%rax + adcxq %rax,%r15 + + + xorq %rcx,%rcx + mulxq %rbx,%rdi,%rax + adcxq %rdi,%r14 + adoxq %rax,%r15 + + mulxq %rbp,%rdi,%rax + movq %rbx,%rdx + adcxq %rdi,%r15 + adoxq %rcx,%rax + adcxq %rax,%rcx + + + mulxq %rbp,%rdi,%rbx + movq 0(%rsi),%rdx 
+ addq %rdi,%rcx +.byte 102,72,15,126,199 + adcq $0,%rbx + + + xorq %rbp,%rbp + adcxq %r8,%r8 + adcxq %r9,%r9 + adcxq %r10,%r10 + adcxq %r11,%r11 + adcxq %r12,%r12 + + + mulxq %rdx,%rdx,%rax + movq %rdx,0(%rdi) + movq 8(%rsi),%rdx + adoxq %rax,%r8 + movq %r8,8(%rdi) + + mulxq %rdx,%r8,%rax + movq 16(%rsi),%rdx + adoxq %r8,%r9 + adoxq %rax,%r10 + movq %r9,16(%rdi) + movq %r10,24(%rdi) + + mulxq %rdx,%r8,%r9 + movq 24(%rsi),%rdx + adoxq %r8,%r11 + adoxq %r9,%r12 + adcxq %r13,%r13 + adcxq %r14,%r14 + movq %r11,32(%rdi) + movq %r12,40(%rdi) + + mulxq %rdx,%r8,%r9 + movq 32(%rsi),%rdx + adoxq %r8,%r13 + adoxq %r9,%r14 + adcxq %r15,%r15 + adcxq %rcx,%rcx + movq %r13,48(%rdi) + movq %r14,56(%rdi) + + mulxq %rdx,%r8,%r9 + movq 40(%rsi),%rdx + adoxq %r8,%r15 + adoxq %r9,%rcx + adcxq %rbx,%rbx + adcxq %rbp,%rbp + movq %r15,64(%rdi) + movq %rcx,72(%rdi) + + mulxq %rdx,%r8,%r9 + adoxq %r8,%rbx + adoxq %r9,%rbp + + movq %rbx,80(%rdi) + movq %rbp,88(%rdi) + + .byte 0xf3,0xc3 + + + + +.globl redcx_mont_384 + +.def redcx_mont_384; .scl 2; .type 32; .endef +.p2align 5 +redcx_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_redcx_mont_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_redcx_mont_384: + + + movq %rdx,%rbx + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_redcx_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_redcx_mont_384: + + + + +.globl fromx_mont_384 + +.def fromx_mont_384; .scl 2; .type 32; .endef +.p2align 5 +fromx_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_fromx_mont_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_fromx_mont_384: + + + movq %rdx,%rbx + call __mulx_by_1_mont_384 + + + + + movq %r14,%rax + movq %r15,%rcx + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_fromx_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_fromx_mont_384: +.def __mulx_by_1_mont_384; .scl 3; .type 32; .endef +.p2align 5 +__mulx_by_1_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq %rcx,%rdx + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + imulq %r8,%rdx + + + xorq %r14,%r14 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r8 + adoxq %rbp,%r9 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r9 + adoxq %rbp,%r10 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r10 + adoxq %rbp,%r11 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 
32(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r13 + adoxq %r14,%rbp + adcxq %rbp,%r14 + imulq %r9,%rdx + + + xorq %r15,%r15 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r9 + adoxq %rbp,%r10 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r10 + adoxq %rbp,%r11 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r14 + adoxq %r15,%rbp + adcxq %rbp,%r15 + imulq %r10,%rdx + + + xorq %r8,%r8 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r10 + adoxq %rbp,%r11 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r15 + adoxq %r8,%rbp + adcxq %rbp,%r8 + imulq %r11,%rdx + + + xorq %r9,%r9 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r15 + adoxq %rbp,%r8 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r8 + adoxq %r9,%rbp + adcxq %rbp,%r9 + imulq %r12,%rdx + + + xorq %r10,%r10 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r15 + adoxq %rbp,%r8 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r8 + adoxq %rbp,%r9 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r9 + adoxq %r10,%rbp + adcxq %rbp,%r10 + imulq %r13,%rdx + + + xorq %r11,%r11 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r15 + adoxq %rbp,%r8 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r8 + adoxq %rbp,%r9 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r9 + adoxq %rbp,%r10 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r10 + adoxq %r11,%rbp + adcxq %rbp,%r11 + .byte 0xf3,0xc3 + + +.def __redc_tail_mont_384; .scl 3; .type 32; .endef +.p2align 5 +__redc_tail_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + + addq 48(%rsi),%r14 + movq %r14,%rax + adcq 56(%rsi),%r15 + adcq 64(%rsi),%r8 + adcq 72(%rsi),%r9 + movq %r15,%rcx + adcq 80(%rsi),%r10 + adcq 88(%rsi),%r11 + sbbq %r12,%r12 + + + + + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + .byte 0xf3,0xc3 + + +.globl sgn0x_pty_mont_384 + +.def sgn0x_pty_mont_384; .scl 2; .type 32; .endef +.p2align 5 +sgn0x_pty_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sgn0x_pty_mont_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + 
+ pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_sgn0x_pty_mont_384: + + + movq %rsi,%rbx + leaq 0(%rdi),%rsi + movq %rdx,%rcx + call __mulx_by_1_mont_384 + + xorq %rax,%rax + movq %r14,%r13 + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + notq %rax + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sgn0x_pty_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sgn0x_pty_mont_384: + +.globl sgn0x_pty_mont_384x + +.def sgn0x_pty_mont_384x; .scl 2; .type 32; .endef +.p2align 5 +sgn0x_pty_mont_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sgn0x_pty_mont_384x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_sgn0x_pty_mont_384x: + + + movq %rsi,%rbx + leaq 48(%rdi),%rsi + movq %rdx,%rcx + call __mulx_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + leaq 0(%rdi),%rsi + xorq %rdi,%rdi + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rdi + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rdi + + movq %r14,0(%rsp) + notq %rdi + andq $1,%r13 + andq $2,%rdi + orq %r13,%rdi + + call __mulx_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + xorq %rax,%rax + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + movq 0(%rsp),%r12 + + notq %rax + + testq %r14,%r14 + cmovzq %rdi,%r13 + + testq %r12,%r12 + cmovnzq %rdi,%rax + + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sgn0x_pty_mont_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sgn0x_pty_mont_384x: +.globl mulx_mont_384 + +.def mulx_mont_384; .scl 2; .type 32; .endef +.p2align 5 +mulx_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mulx_mont_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + movq 40(%rsp),%r8 + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + leaq -24(%rsp),%rsp + +.LSEH_body_mulx_mont_384: + + + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 +.byte 102,72,15,110,199 + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + movq %r8,(%rsp) + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + movq 24(%rsp),%r15 + + movq 32(%rsp),%r14 + + movq 40(%rsp),%r13 + + movq 48(%rsp),%r12 + + movq 56(%rsp),%rbx + + 
movq 64(%rsp),%rbp + + leaq 72(%rsp),%rsp + +.LSEH_epilogue_mulx_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mulx_mont_384: +.def __mulx_mont_384; .scl 3; .type 32; .endef +.p2align 5 +__mulx_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + + + mulxq %r15,%r14,%r10 + mulxq %rax,%r15,%r11 + addq %r14,%r9 + mulxq %r12,%rax,%r12 + adcq %r15,%r10 + mulxq %rdi,%rdi,%r13 + adcq %rax,%r11 + mulxq %rbp,%rbp,%r14 + movq 8(%rbx),%rdx + adcq %rdi,%r12 + adcq %rbp,%r13 + adcq $0,%r14 + xorq %r15,%r15 + + movq %r8,16(%rsp) + imulq 8(%rsp),%r8 + + + xorq %rax,%rax + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r9 + adcxq %rbp,%r10 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r8,%rdx + adoxq %rdi,%r14 + adcxq %rbp,%r15 + adoxq %rax,%r15 + adoxq %rax,%rax + + + xorq %r8,%r8 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r9 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r10 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 16(%rbx),%rdx + adcxq %rdi,%r13 + adoxq %rbp,%r14 + adcxq %r8,%r14 + adoxq %r8,%r15 + adcxq %r8,%r15 + adoxq %r8,%rax + adcxq %r8,%rax + movq %r9,16(%rsp) + imulq 8(%rsp),%r9 + + + xorq %r8,%r8 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r9,%rdx + adoxq %rdi,%r15 + adcxq %rbp,%rax + adoxq %r8,%rax + adoxq %r8,%r8 + + + xorq %r9,%r9 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r10 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 24(%rbx),%rdx + adcxq %rdi,%r14 + adoxq %rbp,%r15 + adcxq %r9,%r15 + adoxq %r9,%rax + adcxq %r9,%rax + adoxq %r9,%r8 + adcxq %r9,%r8 + movq %r10,16(%rsp) + imulq 8(%rsp),%r10 + + + xorq %r9,%r9 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r10,%rdx + adoxq %rdi,%rax + adcxq %rbp,%r8 + adoxq %r9,%r8 + adoxq %r9,%r9 + + + xorq %r10,%r10 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r11 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 
40+128(%rcx),%rdi,%rbp + movq 32(%rbx),%rdx + adcxq %rdi,%r15 + adoxq %rbp,%rax + adcxq %r10,%rax + adoxq %r10,%r8 + adcxq %r10,%r8 + adoxq %r10,%r9 + adcxq %r10,%r9 + movq %r11,16(%rsp) + imulq 8(%rsp),%r11 + + + xorq %r10,%r10 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r11,%rdx + adoxq %rdi,%r8 + adcxq %rbp,%r9 + adoxq %r10,%r9 + adoxq %r10,%r10 + + + xorq %r11,%r11 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r12 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 40+128(%rcx),%rdi,%rbp + movq 40(%rbx),%rdx + adcxq %rdi,%rax + adoxq %rbp,%r8 + adcxq %r11,%r8 + adoxq %r11,%r9 + adcxq %r11,%r9 + adoxq %r11,%r10 + adcxq %r11,%r10 + movq %r12,16(%rsp) + imulq 8(%rsp),%r12 + + + xorq %r11,%r11 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r8 + adcxq %rbp,%r9 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r12,%rdx + adoxq %rdi,%r9 + adcxq %rbp,%r10 + adoxq %r11,%r10 + adoxq %r11,%r11 + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r13 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + + mulxq 40+128(%rcx),%rdi,%rbp + movq %r13,%rdx + adcxq %rdi,%r8 + adoxq %rbp,%r9 + adcxq %r12,%r9 + adoxq %r12,%r10 + adcxq %r12,%r10 + adoxq %r12,%r11 + adcxq %r12,%r11 + imulq 8(%rsp),%rdx +.byte 102,72,15,126,195 + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + movq %r15,%r13 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r8 + adoxq %rbp,%r9 + movq %rax,%rsi + + mulxq 40+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r10 + movq %r14,%rdx + adcxq %r12,%r10 + adoxq %r12,%r11 + leaq 128(%rcx),%rcx + movq %r8,%r12 + adcq $0,%r11 + + + + + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + movq %r9,%rdi + sbbq 16(%rcx),%rax + sbbq 24(%rcx),%r8 + sbbq 32(%rcx),%r9 + movq %r10,%rbp + sbbq 40(%rcx),%r10 + sbbq $0,%r11 + + cmovncq %r14,%rdx + cmovcq %r13,%r15 + cmovcq %rsi,%rax + cmovncq %r8,%r12 + movq %rdx,0(%rbx) + cmovncq %r9,%rdi + movq %r15,8(%rbx) + cmovncq %r10,%rbp + movq %rax,16(%rbx) + movq %r12,24(%rbx) + movq %rdi,32(%rbx) + movq %rbp,40(%rbx) + + .byte 0xf3,0xc3 + + +.globl sqrx_mont_384 + +.def sqrx_mont_384; .scl 2; .type 32; .endef +.p2align 5 +sqrx_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 
+.LSEH_begin_sqrx_mont_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + leaq -24(%rsp),%rsp + +.LSEH_body_sqrx_mont_384: + + + movq %rcx,%r8 + leaq -128(%rdx),%rcx + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 +.byte 102,72,15,110,199 + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + + leaq (%rsi),%rbx + movq %r8,(%rsp) + leaq -128(%rsi),%rsi + + mulxq %rdx,%r8,%r9 + call __mulx_mont_384 + + movq 24(%rsp),%r15 + + movq 32(%rsp),%r14 + + movq 40(%rsp),%r13 + + movq 48(%rsp),%r12 + + movq 56(%rsp),%rbx + + movq 64(%rsp),%rbp + + leaq 72(%rsp),%rsp + +.LSEH_epilogue_sqrx_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqrx_mont_384: + +.globl sqrx_n_mul_mont_384 + +.def sqrx_n_mul_mont_384; .scl 2; .type 32; .endef +.p2align 5 +sqrx_n_mul_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqrx_n_mul_mont_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + movq 40(%rsp),%r8 + movq 48(%rsp),%r9 + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + leaq -24(%rsp),%rsp + +.LSEH_body_sqrx_n_mul_mont_384: + + + movq %rdx,%r10 + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq %rsi,%rbx + movq 24(%rsi),%r12 +.byte 102,72,15,110,199 + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + + movq %r8,(%rsp) + movq %r9,16(%rsp) + movq 0(%r9),%xmm2 + +.Loop_sqrx_384: + movd %r10d,%xmm1 + leaq -128(%rbx),%rsi + leaq -128(%rcx),%rcx + + mulxq %rdx,%r8,%r9 + call __mulx_mont_384 + + movd %xmm1,%r10d + decl %r10d + jnz .Loop_sqrx_384 + + movq %rdx,%r14 +.byte 102,72,15,126,210 + leaq -128(%rbx),%rsi + movq 16(%rsp),%rbx + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + movq 24(%rsp),%r15 + + movq 32(%rsp),%r14 + + movq 40(%rsp),%r13 + + movq 48(%rsp),%r12 + + movq 56(%rsp),%rbx + + movq 64(%rsp),%rbp + + leaq 72(%rsp),%rsp + +.LSEH_epilogue_sqrx_n_mul_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqrx_n_mul_mont_384: + +.globl sqrx_n_mul_mont_383 + +.def sqrx_n_mul_mont_383; .scl 2; .type 32; .endef +.p2align 5 +sqrx_n_mul_mont_383: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqrx_n_mul_mont_383: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + movq 40(%rsp),%r8 + movq 48(%rsp),%r9 + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + leaq -24(%rsp),%rsp + +.LSEH_body_sqrx_n_mul_mont_383: + + + movq %rdx,%r10 + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq %rsi,%rbx + movq 24(%rsi),%r12 +.byte 102,72,15,110,199 + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + + movq %r8,(%rsp) + movq %r9,16(%rsp) + movq 0(%r9),%xmm2 + leaq -128(%rcx),%rcx + +.Loop_sqrx_383: + movd %r10d,%xmm1 + leaq -128(%rbx),%rsi + + mulxq %rdx,%r8,%r9 + call __mulx_mont_383_nonred + + movd %xmm1,%r10d + decl %r10d + jnz .Loop_sqrx_383 + + movq %rdx,%r14 +.byte 102,72,15,126,210 + leaq -128(%rbx),%rsi + movq 16(%rsp),%rbx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + movq 24(%rsp),%r15 + + movq 32(%rsp),%r14 + + movq 40(%rsp),%r13 + + movq 48(%rsp),%r12 + + movq 56(%rsp),%rbx + + movq 64(%rsp),%rbp + + leaq 72(%rsp),%rsp + +.LSEH_epilogue_sqrx_n_mul_mont_383: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + 
+.LSEH_end_sqrx_n_mul_mont_383: +.def __mulx_mont_383_nonred; .scl 3; .type 32; .endef +.p2align 5 +__mulx_mont_383_nonred: + .byte 0xf3,0x0f,0x1e,0xfa + + + mulxq %r15,%r14,%r10 + mulxq %rax,%r15,%r11 + addq %r14,%r9 + mulxq %r12,%rax,%r12 + adcq %r15,%r10 + mulxq %rdi,%rdi,%r13 + adcq %rax,%r11 + mulxq %rbp,%rbp,%r14 + movq 8(%rbx),%rdx + adcq %rdi,%r12 + adcq %rbp,%r13 + adcq $0,%r14 + movq %r8,%rax + imulq 8(%rsp),%r8 + + + xorq %r15,%r15 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r9 + adcxq %rbp,%r10 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r8,%rdx + adoxq %rdi,%r14 + adcxq %r15,%rbp + adoxq %rbp,%r15 + + + xorq %r8,%r8 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r9 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r10 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 16(%rbx),%rdx + adcxq %rdi,%r13 + adoxq %rbp,%r14 + adcxq %rax,%r14 + adoxq %rax,%r15 + adcxq %rax,%r15 + movq %r9,%r8 + imulq 8(%rsp),%r9 + + + xorq %rax,%rax + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r9,%rdx + adoxq %rdi,%r15 + adcxq %rax,%rbp + adoxq %rbp,%rax + + + xorq %r9,%r9 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r8 + adoxq %rbp,%r10 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 24(%rbx),%rdx + adcxq %rdi,%r14 + adoxq %rbp,%r15 + adcxq %r8,%r15 + adoxq %r8,%rax + adcxq %r8,%rax + movq %r10,%r9 + imulq 8(%rsp),%r10 + + + xorq %r8,%r8 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r10,%rdx + adoxq %rdi,%rax + adcxq %r8,%rbp + adoxq %rbp,%r8 + + + xorq %r10,%r10 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r11 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 32(%rbx),%rdx + adcxq %rdi,%r15 + adoxq %rbp,%rax + adcxq %r9,%rax + adoxq %r9,%r8 + adcxq %r9,%r8 + movq %r11,%r10 + imulq 8(%rsp),%r11 + + + xorq %r9,%r9 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq 
%rdi,%r13 + adcxq %rbp,%r14 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r11,%rdx + adoxq %rdi,%r8 + adcxq %r9,%rbp + adoxq %rbp,%r9 + + + xorq %r11,%r11 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r12 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 40+128(%rcx),%rdi,%rbp + movq 40(%rbx),%rdx + adcxq %rdi,%rax + adoxq %rbp,%r8 + adcxq %r10,%r8 + adoxq %r10,%r9 + adcxq %r10,%r9 + movq %r12,%r11 + imulq 8(%rsp),%r12 + + + xorq %r10,%r10 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r8 + adcxq %rbp,%r9 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r12,%rdx + adoxq %rdi,%r9 + adcxq %r10,%rbp + adoxq %rbp,%r10 + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r13 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + + mulxq 40+128(%rcx),%rdi,%rbp + movq %r13,%rdx + adcxq %rdi,%r8 + adoxq %rbp,%r9 + adcxq %r11,%r9 + adoxq %r11,%r10 + adcxq %r11,%r10 + imulq 8(%rsp),%rdx +.byte 102,72,15,126,195 + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r8 + adoxq %rbp,%r9 + + mulxq 40+128(%rcx),%rdi,%rbp + movq %r14,%rdx + adcxq %rdi,%r9 + adoxq %rbp,%r10 + adcq $0,%r10 + movq %r8,%r12 + + movq %r14,0(%rbx) + movq %r15,8(%rbx) + movq %rax,16(%rbx) + movq %r9,%rdi + movq %r8,24(%rbx) + movq %r9,32(%rbx) + movq %r10,40(%rbx) + movq %r10,%rbp + + .byte 0xf3,0xc3 + + +.globl sqrx_mont_382x + +.def sqrx_mont_382x; .scl 2; .type 32; .endef +.p2align 5 +sqrx_mont_382x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqrx_mont_382x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $136,%rsp + +.LSEH_body_sqrx_mont_382x: + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + movq %rsi,16(%rsp) + movq %rdi,%xmm0 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %r8,%r14 + addq 48(%rsi),%r8 + movq %r9,%r15 + adcq 56(%rsi),%r9 + movq %r10,%rax + adcq 64(%rsi),%r10 + movq %r11,%rdx + adcq 72(%rsi),%r11 + movq %r12,%rbx + adcq 80(%rsi),%r12 + movq %r13,%rbp + adcq 88(%rsi),%r13 + + subq 48(%rsi),%r14 + sbbq 56(%rsi),%r15 + sbbq 64(%rsi),%rax + sbbq 72(%rsi),%rdx + sbbq 80(%rsi),%rbx + sbbq 88(%rsi),%rbp + sbbq %rdi,%rdi + + movq 
%r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + movq %r14,32+48(%rsp) + movq %r15,32+56(%rsp) + movq %rax,32+64(%rsp) + movq %rdx,32+72(%rsp) + movq %rbx,32+80(%rsp) + movq %rbp,32+88(%rsp) + movq %rdi,32+96(%rsp) + + + + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_383_nonred + addq %rdx,%rdx + adcq %r15,%r15 + adcq %rax,%rax + adcq %r12,%r12 + adcq %rdi,%rdi + adcq %rbp,%rbp + + movq %rdx,48(%rbx) + movq %r15,56(%rbx) + movq %rax,64(%rbx) + movq %r12,72(%rbx) + movq %rdi,80(%rbx) + movq %rbp,88(%rbx) + + leaq 32-128(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rdx + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%rax + movq 32+24(%rsp),%r12 + movq 32+32(%rsp),%rdi + movq 32+40(%rsp),%rbp + + + + mulxq %r14,%r8,%r9 + call __mulx_mont_383_nonred + movq 32+96(%rsp),%r14 + leaq 128(%rcx),%rcx + movq 32+0(%rsp),%r8 + andq %r14,%r8 + movq 32+8(%rsp),%r9 + andq %r14,%r9 + movq 32+16(%rsp),%r10 + andq %r14,%r10 + movq 32+24(%rsp),%r11 + andq %r14,%r11 + movq 32+32(%rsp),%r13 + andq %r14,%r13 + andq 32+40(%rsp),%r14 + + subq %r8,%rdx + movq 0(%rcx),%r8 + sbbq %r9,%r15 + movq 8(%rcx),%r9 + sbbq %r10,%rax + movq 16(%rcx),%r10 + sbbq %r11,%r12 + movq 24(%rcx),%r11 + sbbq %r13,%rdi + movq 32(%rcx),%r13 + sbbq %r14,%rbp + sbbq %r14,%r14 + + andq %r14,%r8 + andq %r14,%r9 + andq %r14,%r10 + andq %r14,%r11 + andq %r14,%r13 + andq 40(%rcx),%r14 + + addq %r8,%rdx + adcq %r9,%r15 + adcq %r10,%rax + adcq %r11,%r12 + adcq %r13,%rdi + adcq %r14,%rbp + + movq %rdx,0(%rbx) + movq %r15,8(%rbx) + movq %rax,16(%rbx) + movq %r12,24(%rbx) + movq %rdi,32(%rbx) + movq %rbp,40(%rbx) + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_sqrx_mont_382x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqrx_mont_382x: +.section .pdata +.p2align 2 +.rva .LSEH_begin_mulx_mont_384x +.rva .LSEH_body_mulx_mont_384x +.rva .LSEH_info_mulx_mont_384x_prologue + +.rva .LSEH_body_mulx_mont_384x +.rva .LSEH_epilogue_mulx_mont_384x +.rva .LSEH_info_mulx_mont_384x_body + +.rva .LSEH_epilogue_mulx_mont_384x +.rva .LSEH_end_mulx_mont_384x +.rva .LSEH_info_mulx_mont_384x_epilogue + +.rva .LSEH_begin_sqrx_mont_384x +.rva .LSEH_body_sqrx_mont_384x +.rva .LSEH_info_sqrx_mont_384x_prologue + +.rva .LSEH_body_sqrx_mont_384x +.rva .LSEH_epilogue_sqrx_mont_384x +.rva .LSEH_info_sqrx_mont_384x_body + +.rva .LSEH_epilogue_sqrx_mont_384x +.rva .LSEH_end_sqrx_mont_384x +.rva .LSEH_info_sqrx_mont_384x_epilogue + +.rva .LSEH_begin_mulx_382x +.rva .LSEH_body_mulx_382x +.rva .LSEH_info_mulx_382x_prologue + +.rva .LSEH_body_mulx_382x +.rva .LSEH_epilogue_mulx_382x +.rva .LSEH_info_mulx_382x_body + +.rva .LSEH_epilogue_mulx_382x +.rva .LSEH_end_mulx_382x +.rva .LSEH_info_mulx_382x_epilogue + +.rva .LSEH_begin_sqrx_382x +.rva .LSEH_body_sqrx_382x +.rva .LSEH_info_sqrx_382x_prologue + +.rva .LSEH_body_sqrx_382x +.rva .LSEH_epilogue_sqrx_382x +.rva .LSEH_info_sqrx_382x_body + +.rva .LSEH_epilogue_sqrx_382x +.rva .LSEH_end_sqrx_382x +.rva .LSEH_info_sqrx_382x_epilogue + +.rva .LSEH_begin_mulx_384 +.rva .LSEH_body_mulx_384 +.rva .LSEH_info_mulx_384_prologue + +.rva .LSEH_body_mulx_384 
+.rva .LSEH_epilogue_mulx_384 +.rva .LSEH_info_mulx_384_body + +.rva .LSEH_epilogue_mulx_384 +.rva .LSEH_end_mulx_384 +.rva .LSEH_info_mulx_384_epilogue + +.rva .LSEH_begin_sqrx_384 +.rva .LSEH_body_sqrx_384 +.rva .LSEH_info_sqrx_384_prologue + +.rva .LSEH_body_sqrx_384 +.rva .LSEH_epilogue_sqrx_384 +.rva .LSEH_info_sqrx_384_body + +.rva .LSEH_epilogue_sqrx_384 +.rva .LSEH_end_sqrx_384 +.rva .LSEH_info_sqrx_384_epilogue + +.rva .LSEH_begin_redcx_mont_384 +.rva .LSEH_body_redcx_mont_384 +.rva .LSEH_info_redcx_mont_384_prologue + +.rva .LSEH_body_redcx_mont_384 +.rva .LSEH_epilogue_redcx_mont_384 +.rva .LSEH_info_redcx_mont_384_body + +.rva .LSEH_epilogue_redcx_mont_384 +.rva .LSEH_end_redcx_mont_384 +.rva .LSEH_info_redcx_mont_384_epilogue + +.rva .LSEH_begin_fromx_mont_384 +.rva .LSEH_body_fromx_mont_384 +.rva .LSEH_info_fromx_mont_384_prologue + +.rva .LSEH_body_fromx_mont_384 +.rva .LSEH_epilogue_fromx_mont_384 +.rva .LSEH_info_fromx_mont_384_body + +.rva .LSEH_epilogue_fromx_mont_384 +.rva .LSEH_end_fromx_mont_384 +.rva .LSEH_info_fromx_mont_384_epilogue + +.rva .LSEH_begin_sgn0x_pty_mont_384 +.rva .LSEH_body_sgn0x_pty_mont_384 +.rva .LSEH_info_sgn0x_pty_mont_384_prologue + +.rva .LSEH_body_sgn0x_pty_mont_384 +.rva .LSEH_epilogue_sgn0x_pty_mont_384 +.rva .LSEH_info_sgn0x_pty_mont_384_body + +.rva .LSEH_epilogue_sgn0x_pty_mont_384 +.rva .LSEH_end_sgn0x_pty_mont_384 +.rva .LSEH_info_sgn0x_pty_mont_384_epilogue + +.rva .LSEH_begin_sgn0x_pty_mont_384x +.rva .LSEH_body_sgn0x_pty_mont_384x +.rva .LSEH_info_sgn0x_pty_mont_384x_prologue + +.rva .LSEH_body_sgn0x_pty_mont_384x +.rva .LSEH_epilogue_sgn0x_pty_mont_384x +.rva .LSEH_info_sgn0x_pty_mont_384x_body + +.rva .LSEH_epilogue_sgn0x_pty_mont_384x +.rva .LSEH_end_sgn0x_pty_mont_384x +.rva .LSEH_info_sgn0x_pty_mont_384x_epilogue + +.rva .LSEH_begin_mulx_mont_384 +.rva .LSEH_body_mulx_mont_384 +.rva .LSEH_info_mulx_mont_384_prologue + +.rva .LSEH_body_mulx_mont_384 +.rva .LSEH_epilogue_mulx_mont_384 +.rva .LSEH_info_mulx_mont_384_body + +.rva .LSEH_epilogue_mulx_mont_384 +.rva .LSEH_end_mulx_mont_384 +.rva .LSEH_info_mulx_mont_384_epilogue + +.rva .LSEH_begin_sqrx_mont_384 +.rva .LSEH_body_sqrx_mont_384 +.rva .LSEH_info_sqrx_mont_384_prologue + +.rva .LSEH_body_sqrx_mont_384 +.rva .LSEH_epilogue_sqrx_mont_384 +.rva .LSEH_info_sqrx_mont_384_body + +.rva .LSEH_epilogue_sqrx_mont_384 +.rva .LSEH_end_sqrx_mont_384 +.rva .LSEH_info_sqrx_mont_384_epilogue + +.rva .LSEH_begin_sqrx_n_mul_mont_384 +.rva .LSEH_body_sqrx_n_mul_mont_384 +.rva .LSEH_info_sqrx_n_mul_mont_384_prologue + +.rva .LSEH_body_sqrx_n_mul_mont_384 +.rva .LSEH_epilogue_sqrx_n_mul_mont_384 +.rva .LSEH_info_sqrx_n_mul_mont_384_body + +.rva .LSEH_epilogue_sqrx_n_mul_mont_384 +.rva .LSEH_end_sqrx_n_mul_mont_384 +.rva .LSEH_info_sqrx_n_mul_mont_384_epilogue + +.rva .LSEH_begin_sqrx_n_mul_mont_383 +.rva .LSEH_body_sqrx_n_mul_mont_383 +.rva .LSEH_info_sqrx_n_mul_mont_383_prologue + +.rva .LSEH_body_sqrx_n_mul_mont_383 +.rva .LSEH_epilogue_sqrx_n_mul_mont_383 +.rva .LSEH_info_sqrx_n_mul_mont_383_body + +.rva .LSEH_epilogue_sqrx_n_mul_mont_383 +.rva .LSEH_end_sqrx_n_mul_mont_383 +.rva .LSEH_info_sqrx_n_mul_mont_383_epilogue + +.rva .LSEH_begin_sqrx_mont_382x +.rva .LSEH_body_sqrx_mont_382x +.rva .LSEH_info_sqrx_mont_382x_prologue + +.rva .LSEH_body_sqrx_mont_382x +.rva .LSEH_epilogue_sqrx_mont_382x +.rva .LSEH_info_sqrx_mont_382x_body + +.rva .LSEH_epilogue_sqrx_mont_382x +.rva .LSEH_end_sqrx_mont_382x +.rva .LSEH_info_sqrx_mont_382x_epilogue + +.section .xdata +.p2align 3 
+.LSEH_info_mulx_mont_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mulx_mont_384x_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x29,0x00 +.byte 0x00,0xe4,0x2a,0x00 +.byte 0x00,0xd4,0x2b,0x00 +.byte 0x00,0xc4,0x2c,0x00 +.byte 0x00,0x34,0x2d,0x00 +.byte 0x00,0x54,0x2e,0x00 +.byte 0x00,0x74,0x30,0x00 +.byte 0x00,0x64,0x31,0x00 +.byte 0x00,0x01,0x2f,0x00 +.LSEH_info_mulx_mont_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqrx_mont_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqrx_mont_384x_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x11,0x00 +.byte 0x00,0xe4,0x12,0x00 +.byte 0x00,0xd4,0x13,0x00 +.byte 0x00,0xc4,0x14,0x00 +.byte 0x00,0x34,0x15,0x00 +.byte 0x00,0x54,0x16,0x00 +.byte 0x00,0x74,0x18,0x00 +.byte 0x00,0x64,0x19,0x00 +.byte 0x00,0x01,0x17,0x00 +.LSEH_info_sqrx_mont_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mulx_382x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mulx_382x_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x11,0x00 +.byte 0x00,0xe4,0x12,0x00 +.byte 0x00,0xd4,0x13,0x00 +.byte 0x00,0xc4,0x14,0x00 +.byte 0x00,0x34,0x15,0x00 +.byte 0x00,0x54,0x16,0x00 +.byte 0x00,0x74,0x18,0x00 +.byte 0x00,0x64,0x19,0x00 +.byte 0x00,0x01,0x17,0x00 +.LSEH_info_mulx_382x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqrx_382x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqrx_382x_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_sqrx_382x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mulx_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mulx_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x00,0x00 +.byte 0x00,0xe4,0x01,0x00 +.byte 0x00,0xd4,0x02,0x00 +.byte 0x00,0xc4,0x03,0x00 +.byte 0x00,0x34,0x04,0x00 +.byte 0x00,0x54,0x05,0x00 +.byte 0x00,0x74,0x07,0x00 +.byte 0x00,0x64,0x08,0x00 +.byte 0x00,0x52 +.byte 0x00,0x00 +.LSEH_info_mulx_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqrx_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqrx_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_sqrx_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_redcx_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_redcx_mont_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 
0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_redcx_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_fromx_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_fromx_mont_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_fromx_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sgn0x_pty_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sgn0x_pty_mont_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_sgn0x_pty_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sgn0x_pty_mont_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sgn0x_pty_mont_384x_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_sgn0x_pty_mont_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mulx_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mulx_mont_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x03,0x00 +.byte 0x00,0xe4,0x04,0x00 +.byte 0x00,0xd4,0x05,0x00 +.byte 0x00,0xc4,0x06,0x00 +.byte 0x00,0x34,0x07,0x00 +.byte 0x00,0x54,0x08,0x00 +.byte 0x00,0x74,0x0a,0x00 +.byte 0x00,0x64,0x0b,0x00 +.byte 0x00,0x82 +.byte 0x00,0x00 +.LSEH_info_mulx_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqrx_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqrx_mont_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x03,0x00 +.byte 0x00,0xe4,0x04,0x00 +.byte 0x00,0xd4,0x05,0x00 +.byte 0x00,0xc4,0x06,0x00 +.byte 0x00,0x34,0x07,0x00 +.byte 0x00,0x54,0x08,0x00 +.byte 0x00,0x74,0x0a,0x00 +.byte 0x00,0x64,0x0b,0x00 +.byte 0x00,0x82 +.byte 0x00,0x00 +.LSEH_info_sqrx_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqrx_n_mul_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqrx_n_mul_mont_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x03,0x00 +.byte 0x00,0xe4,0x04,0x00 +.byte 0x00,0xd4,0x05,0x00 +.byte 0x00,0xc4,0x06,0x00 +.byte 0x00,0x34,0x07,0x00 +.byte 0x00,0x54,0x08,0x00 +.byte 0x00,0x74,0x0a,0x00 +.byte 0x00,0x64,0x0b,0x00 +.byte 0x00,0x82 +.byte 0x00,0x00 +.LSEH_info_sqrx_n_mul_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + 
+.LSEH_info_sqrx_n_mul_mont_383_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqrx_n_mul_mont_383_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x03,0x00 +.byte 0x00,0xe4,0x04,0x00 +.byte 0x00,0xd4,0x05,0x00 +.byte 0x00,0xc4,0x06,0x00 +.byte 0x00,0x34,0x07,0x00 +.byte 0x00,0x54,0x08,0x00 +.byte 0x00,0x74,0x0a,0x00 +.byte 0x00,0x64,0x0b,0x00 +.byte 0x00,0x82 +.byte 0x00,0x00 +.LSEH_info_sqrx_n_mul_mont_383_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqrx_mont_382x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqrx_mont_382x_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x11,0x00 +.byte 0x00,0xe4,0x12,0x00 +.byte 0x00,0xd4,0x13,0x00 +.byte 0x00,0xc4,0x14,0x00 +.byte 0x00,0x34,0x15,0x00 +.byte 0x00,0x54,0x16,0x00 +.byte 0x00,0x74,0x18,0x00 +.byte 0x00,0x64,0x19,0x00 +.byte 0x00,0x01,0x17,0x00 +.LSEH_info_sqrx_mont_382x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + diff --git a/build/coff/sha256-x86_64.s b/build/coff/sha256-x86_64.s new file mode 100644 index 00000000..c0f07044 --- /dev/null +++ b/build/coff/sha256-x86_64.s @@ -0,0 +1,1560 @@ +.text + +.p2align 6 + +K256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff +.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 +.globl sha256_block_data_order_shaext + +.def sha256_block_data_order_shaext; .scl 2; .type 32; .endef +.p2align 6 +sha256_block_data_order_shaext: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sha256_block_data_order_shaext: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + subq $0x58,%rsp + + movaps %xmm6,-88(%r11) + + movaps %xmm7,-72(%r11) + + movaps %xmm8,-56(%r11) + + movaps %xmm9,-40(%r11) + + movaps %xmm10,-24(%r11) + +.LSEH_body_sha256_block_data_order_shaext: + + leaq K256+128(%rip),%rcx + movdqu (%rdi),%xmm1 + movdqu 16(%rdi),%xmm2 + movdqa 256-128(%rcx),%xmm7 + + pshufd $0x1b,%xmm1,%xmm0 + pshufd $0xb1,%xmm1,%xmm1 + pshufd $0x1b,%xmm2,%xmm2 + movdqa %xmm7,%xmm8 +.byte 102,15,58,15,202,8 + punpcklqdq %xmm0,%xmm2 + jmp .Loop_shaext + +.p2align 4 +.Loop_shaext: + movdqu (%rsi),%xmm3 + movdqu 16(%rsi),%xmm4 + movdqu 32(%rsi),%xmm5 +.byte 102,15,56,0,223 + movdqu 48(%rsi),%xmm6 + + movdqa 0-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 102,15,56,0,231 + movdqa %xmm2,%xmm10 +.byte 
15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + nop + movdqa %xmm1,%xmm9 +.byte 15,56,203,202 + + movdqa 16-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 102,15,56,0,239 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + leaq 64(%rsi),%rsi +.byte 15,56,204,220 +.byte 15,56,203,202 + + movdqa 32-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 102,15,56,0,247 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + + movdqa 48-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 64-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 80-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 96-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 112-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 128-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 144-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 160-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 176-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 192-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 208-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 +.byte 15,56,203,202 + paddd %xmm7,%xmm6 + + movdqa 224-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 +.byte 15,56,205,245 + movdqa %xmm8,%xmm7 +.byte 15,56,203,202 + + movdqa 240-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 + nop +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + decq %rdx + nop +.byte 15,56,203,202 + + paddd %xmm10,%xmm2 + paddd %xmm9,%xmm1 + jnz .Loop_shaext + + pshufd $0xb1,%xmm2,%xmm2 + pshufd $0x1b,%xmm1,%xmm7 + pshufd $0xb1,%xmm1,%xmm1 + punpckhqdq %xmm2,%xmm1 +.byte 102,15,58,15,215,8 + + 
movdqu %xmm1,(%rdi) + movdqu %xmm2,16(%rdi) + movaps -88(%r11),%xmm6 + movaps -72(%r11),%xmm7 + movaps -56(%r11),%xmm8 + movaps -40(%r11),%xmm9 + movaps -24(%r11),%xmm10 + movq %r11,%rsp + +.LSEH_epilogue_sha256_block_data_order_shaext: + mov 8(%r11),%rdi + mov 16(%r11),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sha256_block_data_order_shaext: +.globl sha256_block_data_order + +.def sha256_block_data_order; .scl 2; .type 32; .endef +.p2align 6 +sha256_block_data_order: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sha256_block_data_order: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + shlq $4,%rdx + subq $104,%rsp + + leaq (%rsi,%rdx,4),%rdx + movq %rdi,0(%rsp) + + movq %rdx,16(%rsp) + movaps %xmm6,32(%rsp) + + movaps %xmm7,48(%rsp) + + movaps %xmm8,64(%rsp) + + movaps %xmm9,80(%rsp) + + movq %rsp,%rbp + +.LSEH_body_sha256_block_data_order: + + + leaq -64(%rsp),%rsp + movl 0(%rdi),%eax + andq $-64,%rsp + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + + + jmp .Lloop_ssse3 +.p2align 4 +.Lloop_ssse3: + movdqa K256+256(%rip),%xmm7 + movq %rsi,8(%rbp) + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 +.byte 102,15,56,0,199 + movdqu 48(%rsi),%xmm3 + leaq K256(%rip),%rsi +.byte 102,15,56,0,207 + movdqa 0(%rsi),%xmm4 + movdqa 16(%rsi),%xmm5 +.byte 102,15,56,0,215 + paddd %xmm0,%xmm4 + movdqa 32(%rsi),%xmm6 +.byte 102,15,56,0,223 + movdqa 48(%rsi),%xmm7 + paddd %xmm1,%xmm5 + paddd %xmm2,%xmm6 + paddd %xmm3,%xmm7 + movdqa %xmm4,0(%rsp) + movl %eax,%r14d + movdqa %xmm5,16(%rsp) + movl %ebx,%edi + movdqa %xmm6,32(%rsp) + xorl %ecx,%edi + movdqa %xmm7,48(%rsp) + movl %r8d,%r13d + jmp .Lssse3_00_47 + +.p2align 4 +.Lssse3_00_47: + subq $-64,%rsi + rorl $14,%r13d + movdqa %xmm1,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm3,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,224,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,250,4 + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm3,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 4(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm0 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 
8(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm0 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm0,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 0(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm0,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,0(%rsp) + rorl $14,%r13d + movdqa %xmm2,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm0,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,225,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,251,4 + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm0,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 20(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm1 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm1 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm1,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 16(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd 
%xmm7,%xmm1 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm1,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,16(%rsp) + rorl $14,%r13d + movdqa %xmm3,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm1,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,226,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,248,4 + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm1,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 36(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm2 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm2 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm2,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 32(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm2,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,32(%rsp) + rorl $14,%r13d + movdqa %xmm0,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm2,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,227,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,249,4 + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm2,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl 
%eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 52(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm3 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm3 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm3,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 48(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm3,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,48(%rsp) + cmpb $0,67(%rsi) + jne .Lssse3_00_47 + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + 
movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl 
%ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + movq 0(%rbp),%rdi + movl %r14d,%eax + movq 8(%rbp),%rsi + + addl 0(%rdi),%eax + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + leaq 64(%rsi),%rsi + cmpq 16(%rbp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb .Lloop_ssse3 + + xorps %xmm0,%xmm0 + leaq 104+48(%rbp),%r11 + + movaps %xmm0,0(%rsp) + movaps %xmm0,16(%rsp) + movaps %xmm0,32(%rsp) + movaps %xmm0,48(%rsp) + movaps 32(%rbp),%xmm6 + movaps 48(%rbp),%xmm7 + movaps 64(%rbp),%xmm8 + movaps 80(%rbp),%xmm9 + movq 104(%rbp),%r15 + + movq -40(%r11),%r14 + + movq -32(%r11),%r13 + + movq -24(%r11),%r12 + + movq -16(%r11),%rbx + + movq -8(%r11),%rbp + +.LSEH_epilogue_sha256_block_data_order: + mov 8(%r11),%rdi + mov 16(%r11),%rsi + + leaq (%r11),%rsp + .byte 0xf3,0xc3 + +.LSEH_end_sha256_block_data_order: +.globl sha256_emit + +.def sha256_emit; .scl 2; .type 32; .endef +.p2align 4 +sha256_emit: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + bswapq %r8 + movq 24(%rdx),%r11 + bswapq %r9 + movl %r8d,4(%rcx) + bswapq %r10 + movl %r9d,12(%rcx) + bswapq 
%r11 + movl %r10d,20(%rcx) + shrq $32,%r8 + movl %r11d,28(%rcx) + shrq $32,%r9 + movl %r8d,0(%rcx) + shrq $32,%r10 + movl %r9d,8(%rcx) + shrq $32,%r11 + movl %r10d,16(%rcx) + movl %r11d,24(%rcx) + .byte 0xf3,0xc3 + + +.globl sha256_bcopy + +.def sha256_bcopy; .scl 2; .type 32; .endef +.p2align 4 +sha256_bcopy: + .byte 0xf3,0x0f,0x1e,0xfa + + subq %rdx,%rcx +.Loop_bcopy: + movzbl (%rdx),%eax + leaq 1(%rdx),%rdx + movb %al,-1(%rcx,%rdx,1) + decq %r8 + jnz .Loop_bcopy + .byte 0xf3,0xc3 + + +.globl sha256_hcopy + +.def sha256_hcopy; .scl 2; .type 32; .endef +.p2align 4 +sha256_hcopy: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + movq %r8,0(%rcx) + movq %r9,8(%rcx) + movq %r10,16(%rcx) + movq %r11,24(%rcx) + .byte 0xf3,0xc3 + +.section .pdata +.p2align 2 +.rva .LSEH_begin_sha256_block_data_order_shaext +.rva .LSEH_body_sha256_block_data_order_shaext +.rva .LSEH_info_sha256_block_data_order_shaext_prologue + +.rva .LSEH_body_sha256_block_data_order_shaext +.rva .LSEH_epilogue_sha256_block_data_order_shaext +.rva .LSEH_info_sha256_block_data_order_shaext_body + +.rva .LSEH_epilogue_sha256_block_data_order_shaext +.rva .LSEH_end_sha256_block_data_order_shaext +.rva .LSEH_info_sha256_block_data_order_shaext_epilogue + +.rva .LSEH_begin_sha256_block_data_order +.rva .LSEH_body_sha256_block_data_order +.rva .LSEH_info_sha256_block_data_order_prologue + +.rva .LSEH_body_sha256_block_data_order +.rva .LSEH_epilogue_sha256_block_data_order +.rva .LSEH_info_sha256_block_data_order_body + +.rva .LSEH_epilogue_sha256_block_data_order +.rva .LSEH_end_sha256_block_data_order +.rva .LSEH_info_sha256_block_data_order_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_sha256_block_data_order_shaext_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sha256_block_data_order_shaext_body: +.byte 1,0,15,0 +.byte 0x00,0x68,0x00,0x00 +.byte 0x00,0x78,0x01,0x00 +.byte 0x00,0x88,0x02,0x00 +.byte 0x00,0x98,0x03,0x00 +.byte 0x00,0xa8,0x04,0x00 +.byte 0x00,0x74,0x0c,0x00 +.byte 0x00,0x64,0x0d,0x00 +.byte 0x00,0xa2 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.LSEH_info_sha256_block_data_order_shaext_epilogue: +.byte 1,0,5,11 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x03 +.byte 0x00,0x00 + +.LSEH_info_sha256_block_data_order_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sha256_block_data_order_body: +.byte 1,0,26,5 +.byte 0x00,0x68,0x02,0x00 +.byte 0x00,0x78,0x03,0x00 +.byte 0x00,0x88,0x04,0x00 +.byte 0x00,0x98,0x05,0x00 +.byte 0x00,0xf4,0x0d,0x00 +.byte 0x00,0xe4,0x0e,0x00 +.byte 0x00,0xd4,0x0f,0x00 +.byte 0x00,0xc4,0x10,0x00 +.byte 0x00,0x34,0x11,0x00 +.byte 0x00,0x74,0x14,0x00 +.byte 0x00,0x64,0x15,0x00 +.byte 0x00,0x03 +.byte 0x00,0x01,0x12,0x00 +.byte 0x00,0x50 +.LSEH_info_sha256_block_data_order_epilogue: +.byte 1,0,5,11 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x03 +.byte 0x00,0x00 + diff --git a/build/elf/add_mod_256-x86_64.s b/build/elf/add_mod_256-x86_64.s new file mode 100644 index 00000000..30b99e75 --- /dev/null +++ b/build/elf/add_mod_256-x86_64.s @@ -0,0 +1,404 @@ +.text + +.globl add_mod_256 +.hidden add_mod_256 +.type add_mod_256,@function +.align 32 +add_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + 
movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + +.Loaded_a_add_mod_256: + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + movq %r8,%rax + adcq 16(%rdx),%r10 + movq %r9,%rsi + adcq 24(%rdx),%r11 + sbbq %rdx,%rdx + + movq %r10,%rbx + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + sbbq 16(%rcx),%r10 + movq %r11,%rbp + sbbq 24(%rcx),%r11 + sbbq $0,%rdx + + cmovcq %rax,%r8 + cmovcq %rsi,%r9 + movq %r8,0(%rdi) + cmovcq %rbx,%r10 + movq %r9,8(%rdi) + cmovcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size add_mod_256,.-add_mod_256 + + +.globl mul_by_3_mod_256 +.hidden mul_by_3_mod_256 +.type mul_by_3_mod_256,@function +.align 32 +mul_by_3_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + + + movq %rdx,%rcx + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq %rsi,%rdx + movq 24(%rsi),%r11 + + call __lshift_mod_256 + movq 0(%rsp),%r12 +.cfi_restore %r12 + jmp .Loaded_a_add_mod_256 + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_by_3_mod_256,.-mul_by_3_mod_256 + +.type __lshift_mod_256,@function +.align 32 +__lshift_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + addq %r8,%r8 + adcq %r9,%r9 + movq %r8,%rax + adcq %r10,%r10 + movq %r9,%rsi + adcq %r11,%r11 + sbbq %r12,%r12 + + movq %r10,%rbx + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + sbbq 16(%rcx),%r10 + movq %r11,%rbp + sbbq 24(%rcx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r8 + cmovcq %rsi,%r9 + cmovcq %rbx,%r10 + cmovcq %rbp,%r11 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __lshift_mod_256,.-__lshift_mod_256 + + +.globl lshift_mod_256 +.hidden lshift_mod_256 +.type lshift_mod_256,@function +.align 32 +lshift_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + +.Loop_lshift_mod_256: + call __lshift_mod_256 + decl %edx + jnz .Loop_lshift_mod_256 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 0(%rsp),%r12 +.cfi_restore %r12 + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size lshift_mod_256,.-lshift_mod_256 + + +.globl rshift_mod_256 +.hidden rshift_mod_256 +.type rshift_mod_256,@function +.align 32 +rshift_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%rbp + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + +.Loop_rshift_mod_256: + movq %rbp,%r8 + andq $1,%rbp + movq 0(%rcx),%rax + negq %rbp + movq 8(%rcx),%rsi + movq 16(%rcx),%rbx + + andq %rbp,%rax + andq %rbp,%rsi + andq %rbp,%rbx + andq 24(%rcx),%rbp + + addq %rax,%r8 + adcq %rsi,%r9 + adcq %rbx,%r10 + adcq %rbp,%r11 + sbbq %rax,%rax + + shrq $1,%r8 + movq %r9,%rbp + shrq $1,%r9 
+ movq %r10,%rbx + shrq $1,%r10 + movq %r11,%rsi + shrq $1,%r11 + + shlq $63,%rbp + shlq $63,%rbx + orq %r8,%rbp + shlq $63,%rsi + orq %rbx,%r9 + shlq $63,%rax + orq %rsi,%r10 + orq %rax,%r11 + + decl %edx + jnz .Loop_rshift_mod_256 + + movq %rbp,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size rshift_mod_256,.-rshift_mod_256 + + +.globl cneg_mod_256 +.hidden cneg_mod_256 +.type cneg_mod_256,@function +.align 32 +cneg_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + + + movq 0(%rsi),%r12 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq %r12,%r8 + movq 24(%rsi),%r11 + orq %r9,%r12 + orq %r10,%r12 + orq %r11,%r12 + movq $-1,%rbp + + movq 0(%rcx),%rax + cmovnzq %rbp,%r12 + movq 8(%rcx),%rsi + movq 16(%rcx),%rbx + andq %r12,%rax + movq 24(%rcx),%rbp + andq %r12,%rsi + andq %r12,%rbx + andq %r12,%rbp + + subq %r8,%rax + sbbq %r9,%rsi + sbbq %r10,%rbx + sbbq %r11,%rbp + + orq %rdx,%rdx + + cmovzq %r8,%rax + cmovzq %r9,%rsi + movq %rax,0(%rdi) + cmovzq %r10,%rbx + movq %rsi,8(%rdi) + cmovzq %r11,%rbp + movq %rbx,16(%rdi) + movq %rbp,24(%rdi) + + movq 0(%rsp),%r12 +.cfi_restore %r12 + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size cneg_mod_256,.-cneg_mod_256 + + +.globl sub_mod_256 +.hidden sub_mod_256 +.type sub_mod_256,@function +.align 32 +sub_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + subq 0(%rdx),%r8 + movq 0(%rcx),%rax + sbbq 8(%rdx),%r9 + movq 8(%rcx),%rsi + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rbx + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbp + sbbq %rdx,%rdx + + andq %rdx,%rax + andq %rdx,%rsi + andq %rdx,%rbx + andq %rdx,%rbp + + addq %rax,%r8 + adcq %rsi,%r9 + movq %r8,0(%rdi) + adcq %rbx,%r10 + movq %r9,8(%rdi) + adcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sub_mod_256,.-sub_mod_256 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/build/elf/add_mod_384-x86_64.s b/build/elf/add_mod_384-x86_64.s new file mode 100644 index 00000000..a1c59a4b --- /dev/null +++ b/build/elf/add_mod_384-x86_64.s @@ -0,0 +1,1485 @@ +.text + +.hidden BLS12_381_P + +.globl add_mod_384 +.hidden add_mod_384 +.type add_mod_384,@function +.align 32 +add_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset 
%r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __add_mod_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size add_mod_384,.-add_mod_384 + +.type __add_mod_384,@function +.align 32 +__add_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +__add_mod_384_a_is_loaded: + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + movq %r8,%r14 + adcq 24(%rdx),%r11 + movq %r9,%r15 + adcq 32(%rdx),%r12 + movq %r10,%rax + adcq 40(%rdx),%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,0(%rdi) + cmovcq %rbx,%r11 + movq %r9,8(%rdi) + cmovcq %rbp,%r12 + movq %r10,16(%rdi) + cmovcq %rsi,%r13 + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __add_mod_384,.-__add_mod_384 + +.globl add_mod_384x +.hidden add_mod_384x +.type add_mod_384x,@function +.align 32 +add_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $24,%rsp +.cfi_adjust_cfa_offset 24 + + + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + leaq 48(%rsi),%rsi + leaq 48(%rdx),%rdx + leaq 48(%rdi),%rdi + call __add_mod_384 + + movq 0(%rsp),%rsi + movq 8(%rsp),%rdx + leaq -48(%rdi),%rdi + call __add_mod_384 + + movq 24+0(%rsp),%r15 +.cfi_restore %r15 + movq 24+8(%rsp),%r14 +.cfi_restore %r14 + movq 24+16(%rsp),%r13 +.cfi_restore %r13 + movq 24+24(%rsp),%r12 +.cfi_restore %r12 + movq 24+32(%rsp),%rbx +.cfi_restore %rbx + movq 24+40(%rsp),%rbp +.cfi_restore %rbp + leaq 24+48(%rsp),%rsp +.cfi_adjust_cfa_offset -24-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size add_mod_384x,.-add_mod_384x + + +.globl lshift_mod_384 +.hidden lshift_mod_384 +.type lshift_mod_384,@function +.align 32 +lshift_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +.Loop_lshift_mod_384: + addq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + movq %r8,%r14 + adcq %r11,%r11 + movq %r9,%r15 + adcq %r12,%r12 + movq %r10,%rax + adcq %r13,%r13 + movq %r11,%rbx + sbbq %rdi,%rdi + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq 
%r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdi + + movq (%rsp),%rdi + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + cmovcq %rbx,%r11 + cmovcq %rbp,%r12 + cmovcq %rsi,%r13 + + decl %edx + jnz .Loop_lshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size lshift_mod_384,.-lshift_mod_384 + +.type __lshift_mod_384,@function +.align 32 +__lshift_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + addq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + movq %r8,%r14 + adcq %r11,%r11 + movq %r9,%r15 + adcq %r12,%r12 + movq %r10,%rax + adcq %r13,%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + cmovcq %rbx,%r11 + cmovcq %rbp,%r12 + cmovcq %rsi,%r13 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __lshift_mod_384,.-__lshift_mod_384 + + +.globl mul_by_3_mod_384 +.hidden mul_by_3_mod_384 +.type mul_by_3_mod_384,@function +.align 32 +mul_by_3_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + + movq (%rsp),%rdx + call __add_mod_384_a_is_loaded + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_by_3_mod_384,.-mul_by_3_mod_384 + +.globl mul_by_8_mod_384 +.hidden mul_by_8_mod_384 +.type mul_by_8_mod_384,@function +.align 32 +mul_by_8_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore 
%r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_by_8_mod_384,.-mul_by_8_mod_384 + +.globl mul_by_b_onE1 +.hidden mul_by_b_onE1 +.type mul_by_b_onE1,@function +.align 32 +mul_by_b_onE1: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + leaq BLS12_381_P(%rip),%rcx + + call __lshift_mod_384 + call __lshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_by_b_onE1,.-mul_by_b_onE1 + +.globl mul_by_4b_onE1 +.hidden mul_by_4b_onE1 +.type mul_by_4b_onE1,@function +.align 32 +mul_by_4b_onE1: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + leaq BLS12_381_P(%rip),%rcx + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_by_4b_onE1,.-mul_by_4b_onE1 + + +.globl mul_by_3_mod_384x +.hidden mul_by_3_mod_384x +.type mul_by_3_mod_384x,@function +.align 32 +mul_by_3_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + 
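+# 3*a is assembled as (a<<1) + a: __lshift_mod_384 has just doubled the value
+# held in %r8-%r13 mod p, and the operand pointer saved on the stack at entry
+# is reloaded next so the original value can be added back in.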
+ movq (%rsp),%rdx + call __add_mod_384_a_is_loaded + + movq (%rsp),%rsi + leaq 48(%rdi),%rdi + + movq 48(%rsi),%r8 + movq 56(%rsi),%r9 + movq 64(%rsi),%r10 + movq 72(%rsi),%r11 + movq 80(%rsi),%r12 + movq 88(%rsi),%r13 + + call __lshift_mod_384 + + movq $48,%rdx + addq (%rsp),%rdx + call __add_mod_384_a_is_loaded + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_by_3_mod_384x,.-mul_by_3_mod_384x + +.globl mul_by_8_mod_384x +.hidden mul_by_8_mod_384x +.type mul_by_8_mod_384x,@function +.align 32 +mul_by_8_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq (%rsp),%rsi + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 48+0(%rsi),%r8 + movq 48+8(%rsi),%r9 + movq 48+16(%rsi),%r10 + movq 48+24(%rsi),%r11 + movq 48+32(%rsi),%r12 + movq 48+40(%rsi),%r13 + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq %r8,48+0(%rdi) + movq %r9,48+8(%rdi) + movq %r10,48+16(%rdi) + movq %r11,48+24(%rdi) + movq %r12,48+32(%rdi) + movq %r13,48+40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_by_8_mod_384x,.-mul_by_8_mod_384x + +.globl mul_by_b_onE2 +.hidden mul_by_b_onE2 +.type mul_by_b_onE2,@function +.align 32 +mul_by_b_onE2: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + leaq BLS12_381_P(%rip),%rcx + leaq 48(%rsi),%rdx + call __sub_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq 0(%rsp),%rsi + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + leaq 48(%rsi),%rdx + leaq 48(%rdi),%rdi + call __add_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 
+.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -8*7 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_by_b_onE2,.-mul_by_b_onE2 + +.globl mul_by_4b_onE2 +.hidden mul_by_4b_onE2 +.type mul_by_4b_onE2,@function +.align 32 +mul_by_4b_onE2: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + leaq BLS12_381_P(%rip),%rcx + leaq 48(%rsi),%rdx + call __sub_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq 0(%rsp),%rsi + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + leaq 48(%rsi),%rdx + leaq 48(%rdi),%rdi + call __add_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -8*7 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_by_4b_onE2,.-mul_by_4b_onE2 + + +.globl cneg_mod_384 +.hidden cneg_mod_384 +.type cneg_mod_384,@function +.align 32 +cneg_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdx +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%rdx + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq %rdx,%r8 + movq 24(%rsi),%r11 + orq %r9,%rdx + movq 32(%rsi),%r12 + orq %r10,%rdx + movq 40(%rsi),%r13 + orq %r11,%rdx + movq $-1,%rsi + orq %r12,%rdx + orq %r13,%rdx + + movq 0(%rcx),%r14 + cmovnzq %rsi,%rdx + movq 8(%rcx),%r15 + movq 16(%rcx),%rax + andq %rdx,%r14 + movq 24(%rcx),%rbx + andq %rdx,%r15 + movq 32(%rcx),%rbp + andq %rdx,%rax + movq 40(%rcx),%rsi + andq %rdx,%rbx + movq 0(%rsp),%rcx + andq %rdx,%rbp + andq %rdx,%rsi + + subq %r8,%r14 + sbbq %r9,%r15 + sbbq %r10,%rax + sbbq %r11,%rbx + sbbq %r12,%rbp + sbbq %r13,%rsi + + orq %rcx,%rcx + + cmovzq %r8,%r14 + cmovzq %r9,%r15 + cmovzq %r10,%rax + movq %r14,0(%rdi) + cmovzq %r11,%rbx + movq %r15,8(%rdi) + cmovzq %r12,%rbp + movq %rax,16(%rdi) + cmovzq %r13,%rsi + movq %rbx,24(%rdi) + movq %rbp,32(%rdi) + movq %rsi,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size cneg_mod_384,.-cneg_mod_384 + + +.globl sub_mod_384 
+.hidden sub_mod_384 +.type sub_mod_384,@function +.align 32 +sub_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __sub_mod_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sub_mod_384,.-sub_mod_384 + +.type __sub_mod_384,@function +.align 32 +__sub_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + subq 0(%rdx),%r8 + movq 0(%rcx),%r14 + sbbq 8(%rdx),%r9 + movq 8(%rcx),%r15 + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rax + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbx + sbbq 32(%rdx),%r12 + movq 32(%rcx),%rbp + sbbq 40(%rdx),%r13 + movq 40(%rcx),%rsi + sbbq %rdx,%rdx + + andq %rdx,%r14 + andq %rdx,%r15 + andq %rdx,%rax + andq %rdx,%rbx + andq %rdx,%rbp + andq %rdx,%rsi + + addq %r14,%r8 + adcq %r15,%r9 + movq %r8,0(%rdi) + adcq %rax,%r10 + movq %r9,8(%rdi) + adcq %rbx,%r11 + movq %r10,16(%rdi) + adcq %rbp,%r12 + movq %r11,24(%rdi) + adcq %rsi,%r13 + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __sub_mod_384,.-__sub_mod_384 + +.globl sub_mod_384x +.hidden sub_mod_384x +.type sub_mod_384x,@function +.align 32 +sub_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $24,%rsp +.cfi_adjust_cfa_offset 24 + + + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + leaq 48(%rsi),%rsi + leaq 48(%rdx),%rdx + leaq 48(%rdi),%rdi + call __sub_mod_384 + + movq 0(%rsp),%rsi + movq 8(%rsp),%rdx + leaq -48(%rdi),%rdi + call __sub_mod_384 + + movq 24+0(%rsp),%r15 +.cfi_restore %r15 + movq 24+8(%rsp),%r14 +.cfi_restore %r14 + movq 24+16(%rsp),%r13 +.cfi_restore %r13 + movq 24+24(%rsp),%r12 +.cfi_restore %r12 + movq 24+32(%rsp),%rbx +.cfi_restore %rbx + movq 24+40(%rsp),%rbp +.cfi_restore %rbp + leaq 24+48(%rsp),%rsp +.cfi_adjust_cfa_offset -24-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sub_mod_384x,.-sub_mod_384x +.globl mul_by_1_plus_i_mod_384x +.hidden mul_by_1_plus_i_mod_384x +.type mul_by_1_plus_i_mod_384x,@function +.align 32 +mul_by_1_plus_i_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $56,%rsp 
+.cfi_adjust_cfa_offset 56 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %r8,%r14 + addq 48(%rsi),%r8 + movq %r9,%r15 + adcq 56(%rsi),%r9 + movq %r10,%rax + adcq 64(%rsi),%r10 + movq %r11,%rbx + adcq 72(%rsi),%r11 + movq %r12,%rcx + adcq 80(%rsi),%r12 + movq %r13,%rbp + adcq 88(%rsi),%r13 + movq %rdi,48(%rsp) + sbbq %rdi,%rdi + + subq 48(%rsi),%r14 + sbbq 56(%rsi),%r15 + sbbq 64(%rsi),%rax + sbbq 72(%rsi),%rbx + sbbq 80(%rsi),%rcx + sbbq 88(%rsi),%rbp + sbbq %rsi,%rsi + + movq %r8,0(%rsp) + movq 0(%rdx),%r8 + movq %r9,8(%rsp) + movq 8(%rdx),%r9 + movq %r10,16(%rsp) + movq 16(%rdx),%r10 + movq %r11,24(%rsp) + movq 24(%rdx),%r11 + movq %r12,32(%rsp) + andq %rsi,%r8 + movq 32(%rdx),%r12 + movq %r13,40(%rsp) + andq %rsi,%r9 + movq 40(%rdx),%r13 + andq %rsi,%r10 + andq %rsi,%r11 + andq %rsi,%r12 + andq %rsi,%r13 + movq 48(%rsp),%rsi + + addq %r8,%r14 + movq 0(%rsp),%r8 + adcq %r9,%r15 + movq 8(%rsp),%r9 + adcq %r10,%rax + movq 16(%rsp),%r10 + adcq %r11,%rbx + movq 24(%rsp),%r11 + adcq %r12,%rcx + movq 32(%rsp),%r12 + adcq %r13,%rbp + movq 40(%rsp),%r13 + + movq %r14,0(%rsi) + movq %r8,%r14 + movq %r15,8(%rsi) + movq %rax,16(%rsi) + movq %r9,%r15 + movq %rbx,24(%rsi) + movq %rcx,32(%rsi) + movq %r10,%rax + movq %rbp,40(%rsi) + + subq 0(%rdx),%r8 + movq %r11,%rbx + sbbq 8(%rdx),%r9 + sbbq 16(%rdx),%r10 + movq %r12,%rcx + sbbq 24(%rdx),%r11 + sbbq 32(%rdx),%r12 + movq %r13,%rbp + sbbq 40(%rdx),%r13 + sbbq $0,%rdi + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,48(%rsi) + cmovcq %rbx,%r11 + movq %r9,56(%rsi) + cmovcq %rcx,%r12 + movq %r10,64(%rsi) + cmovcq %rbp,%r13 + movq %r11,72(%rsi) + movq %r12,80(%rsi) + movq %r13,88(%rsi) + + movq 56+0(%rsp),%r15 +.cfi_restore %r15 + movq 56+8(%rsp),%r14 +.cfi_restore %r14 + movq 56+16(%rsp),%r13 +.cfi_restore %r13 + movq 56+24(%rsp),%r12 +.cfi_restore %r12 + movq 56+32(%rsp),%rbx +.cfi_restore %rbx + movq 56+40(%rsp),%rbp +.cfi_restore %rbp + leaq 56+48(%rsp),%rsp +.cfi_adjust_cfa_offset -56-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_by_1_plus_i_mod_384x,.-mul_by_1_plus_i_mod_384x +.globl sgn0_pty_mod_384 +.hidden sgn0_pty_mod_384 +.type sgn0_pty_mod_384,@function +.align 32 +sgn0_pty_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%rcx + movq 40(%rdi),%rdx + + xorq %rax,%rax + movq %r8,%rdi + addq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %rcx,%rcx + adcq %rdx,%rdx + adcq $0,%rax + + subq 0(%rsi),%r8 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%rcx + sbbq 40(%rsi),%rdx + sbbq $0,%rax + + notq %rax + andq $1,%rdi + andq $2,%rax + orq %rdi,%rax + + + .byte 0xf3,0xc3 +.cfi_endproc +.size sgn0_pty_mod_384,.-sgn0_pty_mod_384 + +.globl sgn0_pty_mod_384x +.hidden sgn0_pty_mod_384x +.type sgn0_pty_mod_384x,@function +.align 32 +sgn0_pty_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%rcx + movq 40(%rdi),%rdx + + movq %r8,%rbx + orq %r9,%r8 + orq %r10,%r8 + orq %r11,%r8 + orq %rcx,%r8 + orq %rdx,%r8 + + xorq %rax,%rax + movq %rbx,%rbp + addq %rbx,%rbx + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %rcx,%rcx + adcq 
%rdx,%rdx + adcq $0,%rax + + subq 0(%rsi),%rbx + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%rcx + sbbq 40(%rsi),%rdx + sbbq $0,%rax + + movq %r8,0(%rsp) + notq %rax + andq $1,%rbp + andq $2,%rax + orq %rbp,%rax + + movq 48(%rdi),%r8 + movq 56(%rdi),%r9 + movq 64(%rdi),%r10 + movq 72(%rdi),%r11 + movq 80(%rdi),%rcx + movq 88(%rdi),%rdx + + movq %r8,%rbx + orq %r9,%r8 + orq %r10,%r8 + orq %r11,%r8 + orq %rcx,%r8 + orq %rdx,%r8 + + xorq %rdi,%rdi + movq %rbx,%rbp + addq %rbx,%rbx + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %rcx,%rcx + adcq %rdx,%rdx + adcq $0,%rdi + + subq 0(%rsi),%rbx + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%rcx + sbbq 40(%rsi),%rdx + sbbq $0,%rdi + + movq 0(%rsp),%rbx + + notq %rdi + + testq %r8,%r8 + cmovnzq %rdi,%rax + + testq %rbx,%rbx + cmovzq %rdi,%rbp + + andq $1,%rbp + andq $2,%rax + orq %rbp,%rax + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sgn0_pty_mod_384x,.-sgn0_pty_mod_384x + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/build/elf/add_mod_384x384-x86_64.s b/build/elf/add_mod_384x384-x86_64.s new file mode 100644 index 00000000..084f3d82 --- /dev/null +++ b/build/elf/add_mod_384x384-x86_64.s @@ -0,0 +1,252 @@ +.text + +.type __add_mod_384x384,@function +.align 32 +__add_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + addq 0(%rdx),%r8 + movq 56(%rsi),%r15 + adcq 8(%rdx),%r9 + movq 64(%rsi),%rax + adcq 16(%rdx),%r10 + movq 72(%rsi),%rbx + adcq 24(%rdx),%r11 + movq 80(%rsi),%rbp + adcq 32(%rdx),%r12 + movq 88(%rsi),%rsi + adcq 40(%rdx),%r13 + movq %r8,0(%rdi) + adcq 48(%rdx),%r14 + movq %r9,8(%rdi) + adcq 56(%rdx),%r15 + movq %r10,16(%rdi) + adcq 64(%rdx),%rax + movq %r12,32(%rdi) + movq %r14,%r8 + adcq 72(%rdx),%rbx + movq %r11,24(%rdi) + movq %r15,%r9 + adcq 80(%rdx),%rbp + movq %r13,40(%rdi) + movq %rax,%r10 + adcq 88(%rdx),%rsi + movq %rbx,%r11 + sbbq %rdx,%rdx + + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + movq %rbp,%r12 + sbbq 16(%rcx),%rax + sbbq 24(%rcx),%rbx + sbbq 32(%rcx),%rbp + movq %rsi,%r13 + sbbq 40(%rcx),%rsi + sbbq $0,%rdx + + cmovcq %r8,%r14 + cmovcq %r9,%r15 + cmovcq %r10,%rax + movq %r14,48(%rdi) + cmovcq %r11,%rbx + movq %r15,56(%rdi) + cmovcq %r12,%rbp + movq %rax,64(%rdi) + cmovcq %r13,%rsi + movq %rbx,72(%rdi) + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __add_mod_384x384,.-__add_mod_384x384 + +.type __sub_mod_384x384,@function +.align 32 +__sub_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + subq 0(%rdx),%r8 + movq 56(%rsi),%r15 + sbbq 8(%rdx),%r9 + movq 64(%rsi),%rax + sbbq 16(%rdx),%r10 + movq 72(%rsi),%rbx + sbbq 24(%rdx),%r11 + movq 80(%rsi),%rbp + sbbq 32(%rdx),%r12 + movq 88(%rsi),%rsi + sbbq 40(%rdx),%r13 + movq %r8,0(%rdi) + sbbq 48(%rdx),%r14 + movq 0(%rcx),%r8 + movq %r9,8(%rdi) + sbbq 56(%rdx),%r15 + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + sbbq 64(%rdx),%rax + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + sbbq 72(%rdx),%rbx + movq 24(%rcx),%r11 + movq 
%r12,32(%rdi) + sbbq 80(%rdx),%rbp + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + sbbq 88(%rdx),%rsi + movq 40(%rcx),%r13 + sbbq %rdx,%rdx + + andq %rdx,%r8 + andq %rdx,%r9 + andq %rdx,%r10 + andq %rdx,%r11 + andq %rdx,%r12 + andq %rdx,%r13 + + addq %r8,%r14 + adcq %r9,%r15 + movq %r14,48(%rdi) + adcq %r10,%rax + movq %r15,56(%rdi) + adcq %r11,%rbx + movq %rax,64(%rdi) + adcq %r12,%rbp + movq %rbx,72(%rdi) + adcq %r13,%rsi + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __sub_mod_384x384,.-__sub_mod_384x384 + +.globl add_mod_384x384 +.hidden add_mod_384x384 +.type add_mod_384x384,@function +.align 32 +add_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __add_mod_384x384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size add_mod_384x384,.-add_mod_384x384 + +.globl sub_mod_384x384 +.hidden sub_mod_384x384 +.type sub_mod_384x384,@function +.align 32 +sub_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __sub_mod_384x384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sub_mod_384x384,.-sub_mod_384x384 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/build/elf/inverse_mod_384-x86_64.s b/build/elf/inverse_mod_384-x86_64.s new file mode 100644 index 00000000..02d2c94b --- /dev/null +++ b/build/elf/inverse_mod_384-x86_64.s @@ -0,0 +1,378 @@ +.text + +.align 32 +.Lone: +.quad 1,0,0,0,0,0,0,0 + +.globl eucl_inverse_mod_384 +.hidden eucl_inverse_mod_384 +.type eucl_inverse_mod_384,@function +.align 32 +eucl_inverse_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $216,%rsp +.cfi_adjust_cfa_offset 216 + + + movq %rdi,0(%rsp) + leaq .Lone(%rip),%rbp + cmpq $0,%rcx + cmoveq 
%rbp,%rcx + + movq 0(%rsi),%rax + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rax,%r8 + orq %r9,%rax + orq %r10,%rax + orq %r11,%rax + orq %r12,%rax + orq %r13,%rax + jz .Labort + + leaq 16(%rsp),%rsi + movq 0(%rcx),%r14 + movq 8(%rcx),%r15 + movq 16(%rcx),%rax + movq 24(%rcx),%rbx + movq 32(%rcx),%rbp + movq 40(%rcx),%rdi + + movq %r8,0(%rsi) + movq %r9,8(%rsi) + movq %r10,16(%rsi) + movq %r11,24(%rsi) + movq %r12,32(%rsi) + movq %r13,40(%rsi) + + leaq 112(%rsp),%rcx + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + movq 32(%rdx),%r12 + movq 40(%rdx),%r13 + + movq %r14,48(%rsi) + movq %r15,56(%rsi) + movq %rax,64(%rsi) + movq %rbx,72(%rsi) + movq %rbp,80(%rsi) + movq %rdi,88(%rsi) + + movq %r8,0(%rcx) + movq %r9,8(%rcx) + movq %r10,16(%rcx) + movq %r11,24(%rcx) + movq %r12,32(%rcx) + movq %r13,40(%rcx) + + xorl %eax,%eax + movq %rax,48(%rcx) + movq %rax,56(%rcx) + movq %rax,64(%rcx) + movq %rax,72(%rcx) + movq %rax,80(%rcx) + movq %rax,88(%rcx) + jmp .Loop_inv + +.align 32 +.Loop_inv: + leaq 112(%rsp),%rsi + call __remove_powers_of_2 + + leaq 16(%rsp),%rsi + call __remove_powers_of_2 + + leaq 112(%rsp),%rcx + subq 112+0(%rsp),%r8 + sbbq 8(%rcx),%r9 + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + sbbq 40(%rcx),%r13 + jae .Lu_greater_than_v + + + xchgq %rcx,%rsi + + notq %r8 + notq %r9 + notq %r10 + notq %r11 + notq %r12 + notq %r13 + + addq $1,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + +.Lu_greater_than_v: + movq 48(%rsi),%r14 + movq 56(%rsi),%r15 + movq 64(%rsi),%rax + movq 72(%rsi),%rbx + movq 80(%rsi),%rbp + movq 88(%rsi),%rdi + + subq 48(%rcx),%r14 + sbbq 56(%rcx),%r15 + sbbq 64(%rcx),%rax + sbbq 72(%rcx),%rbx + sbbq 80(%rcx),%rbp + sbbq 88(%rcx),%rdi + + movq %r8,0(%rsi) + sbbq %r8,%r8 + movq %r9,8(%rsi) + movq %r8,%r9 + movq %r10,16(%rsi) + movq %r8,%r10 + movq %r11,24(%rsi) + movq %r8,%r11 + movq %r12,32(%rsi) + movq %r8,%r12 + movq %r13,40(%rsi) + movq %r8,%r13 + + andq 0(%rdx),%r8 + andq 8(%rdx),%r9 + andq 16(%rdx),%r10 + andq 24(%rdx),%r11 + andq 32(%rdx),%r12 + andq 40(%rdx),%r13 + + addq %r8,%r14 + adcq %r9,%r15 + adcq %r10,%rax + adcq %r11,%rbx + adcq %r12,%rbp + adcq %r13,%rdi + + movq %r14,48(%rsi) + movq %r15,56(%rsi) + movq %rax,64(%rsi) + movq %rbx,72(%rsi) + movq %rbp,80(%rsi) + movq %rdi,88(%rsi) + + movq 16+0(%rsp),%r8 + movq 16+8(%rsp),%r9 + movq 16+16(%rsp),%r10 + movq 16+24(%rsp),%r11 + orq %r9,%r8 + orq 16+32(%rsp),%r10 + orq 16+40(%rsp),%r11 +.byte 0x67 + orq %r10,%r8 + orq %r11,%r8 + jnz .Loop_inv + + leaq 112(%rsp),%rsi + movq 0(%rsp),%rdi + movl $1,%eax + + movq 48(%rsi),%r8 + movq 56(%rsi),%r9 + movq 64(%rsi),%r10 + movq 72(%rsi),%r11 + movq 80(%rsi),%r12 + movq 88(%rsi),%r13 + +.Labort: + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + leaq 216(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -216-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size eucl_inverse_mod_384,.-eucl_inverse_mod_384 + +.type __remove_powers_of_2,@function +.align 32 +__remove_powers_of_2: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 
+ movq 40(%rsi),%r13 + +.Loop_of_2: + bsfq %r8,%rcx + movl $63,%eax + cmovzl %eax,%ecx + + cmpl $0,%ecx + je .Loop_of_2_done + + shrq %cl,%r8 + movq %r9,%r14 + shrq %cl,%r9 + movq %r10,%r15 + shrq %cl,%r10 + movq %r11,%rax + shrq %cl,%r11 + movq %r12,%rbx + shrq %cl,%r12 + movq %r13,%rbp + shrq %cl,%r13 + negb %cl + shlq %cl,%r14 + shlq %cl,%r15 + orq %r14,%r8 + movq 48(%rsi),%r14 + shlq %cl,%rax + orq %r15,%r9 + movq 56(%rsi),%r15 + shlq %cl,%rbx + orq %rax,%r10 + movq 64(%rsi),%rax + shlq %cl,%rbp + orq %rbx,%r11 + movq 72(%rsi),%rbx + orq %rbp,%r12 + movq 80(%rsi),%rbp + negb %cl + movq 88(%rsi),%rdi + + movq %r8,0(%rsi) + movq %r9,8(%rsi) + movq %r10,16(%rsi) + movq %r11,24(%rsi) + movq %r12,32(%rsi) + movq %r13,40(%rsi) + jmp .Loop_div_by_2 + +.align 32 +.Loop_div_by_2: + movq $1,%r13 + movq 0(%rdx),%r8 + andq %r14,%r13 + movq 8(%rdx),%r9 + negq %r13 + movq 16(%rdx),%r10 + andq %r13,%r8 + movq 24(%rdx),%r11 + andq %r13,%r9 + movq 32(%rdx),%r12 + andq %r13,%r10 + andq %r13,%r11 + andq %r13,%r12 + andq 40(%rdx),%r13 + + addq %r8,%r14 + adcq %r9,%r15 + adcq %r10,%rax + adcq %r11,%rbx + adcq %r12,%rbp + adcq %r13,%rdi + sbbq %r13,%r13 + + shrq $1,%r14 + movq %r15,%r8 + shrq $1,%r15 + movq %rax,%r9 + shrq $1,%rax + movq %rbx,%r10 + shrq $1,%rbx + movq %rbp,%r11 + shrq $1,%rbp + movq %rdi,%r12 + shrq $1,%rdi + shlq $63,%r8 + shlq $63,%r9 + orq %r8,%r14 + shlq $63,%r10 + orq %r9,%r15 + shlq $63,%r11 + orq %r10,%rax + shlq $63,%r12 + orq %r11,%rbx + shlq $63,%r13 + orq %r12,%rbp + orq %r13,%rdi + + decl %ecx + jnz .Loop_div_by_2 + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %r14,48(%rsi) + movq %r15,56(%rsi) + movq %rax,64(%rsi) + movq %rbx,72(%rsi) + movq %rbp,80(%rsi) + movq %rdi,88(%rsi) + + testq $1,%r8 +.byte 0x2e + jz .Loop_of_2 + +.Loop_of_2_done: + .byte 0xf3,0xc3 +.cfi_endproc +.size __remove_powers_of_2,.-__remove_powers_of_2 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/build/elf/mulq_mont_256-x86_64.s b/build/elf/mulq_mont_256-x86_64.s new file mode 100644 index 00000000..37abd439 --- /dev/null +++ b/build/elf/mulq_mont_256-x86_64.s @@ -0,0 +1,714 @@ +.text + +.globl mul_mont_sparse_256 +.hidden mul_mont_sparse_256 +.type mul_mont_sparse_256,@function +.align 32 +mul_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rdx),%rax + movq 0(%rsi),%r13 + movq 8(%rsi),%r14 + movq 16(%rsi),%r12 + movq 24(%rsi),%rbp + movq %rdx,%rbx + + movq %rax,%r15 + mulq %r13 + movq %rax,%r9 + movq %r15,%rax + movq %rdx,%r10 + call __mulq_mont_sparse_256 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_mont_sparse_256,.-mul_mont_sparse_256 + +.globl sqr_mont_sparse_256 +.hidden 
sqr_mont_sparse_256 +.type sqr_mont_sparse_256,@function +.align 32 +sqr_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%rax + movq %rcx,%r8 + movq 8(%rsi),%r14 + movq %rdx,%rcx + movq 16(%rsi),%r12 + leaq (%rsi),%rbx + movq 24(%rsi),%rbp + + movq %rax,%r15 + mulq %rax + movq %rax,%r9 + movq %r15,%rax + movq %rdx,%r10 + call __mulq_mont_sparse_256 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqr_mont_sparse_256,.-sqr_mont_sparse_256 +.type __mulq_mont_sparse_256,@function +.align 32 +__mulq_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + mulq %r14 + addq %rax,%r10 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %r12 + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq %rbp + addq %rax,%r12 + movq 8(%rbx),%rax + adcq $0,%rdx + xorq %r14,%r14 + movq %rdx,%r13 + + movq %r9,%rdi + imulq %r8,%r9 + + + movq %rax,%r15 + mulq 0(%rsi) + addq %rax,%r10 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rax,%r12 + movq %r15,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rsi) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq %rdx,%r14 + xorq %r15,%r15 + + + mulq 0(%rcx) + addq %rax,%rdi + movq %r9,%rax + adcq %rdx,%rdi + + mulq 8(%rcx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %rdi,%r10 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rax,%r12 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + addq %rdx,%r13 + adcq $0,%r14 + adcq $0,%r15 + movq %r10,%rdi + imulq %r8,%r10 + + + movq %rax,%r9 + mulq 0(%rsi) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rsi) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq %rdx,%r15 + xorq %r9,%r9 + + + mulq 0(%rcx) + addq %rax,%rdi + movq %r10,%rax + adcq %rdx,%rdi + + mulq 8(%rcx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %rdi,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rax,%r13 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + addq %rdx,%r14 + adcq $0,%r15 + adcq $0,%r9 + movq %r11,%rdi + imulq %r8,%r11 + + + movq %rax,%r10 + mulq 0(%rsi) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 
8(%rsi) + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rsi) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq %rdx,%r9 + xorq %r10,%r10 + + + mulq 0(%rcx) + addq %rax,%rdi + movq %r11,%rax + adcq %rdx,%rdi + + mulq 8(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %rdi,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + addq %rdx,%r15 + adcq $0,%r9 + adcq $0,%r10 + imulq %r8,%rax + movq 8(%rsp),%rsi + + + movq %rax,%r11 + mulq 0(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq %rdx,%r12 + + mulq 8(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r12,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + movq %r14,%rbx + addq %rbp,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %rdx,%r9 + adcq $0,%r10 + + + + + movq %r15,%r12 + subq 0(%rcx),%r13 + sbbq 8(%rcx),%r14 + sbbq 16(%rcx),%r15 + movq %r9,%rbp + sbbq 24(%rcx),%r9 + sbbq $0,%r10 + + cmovcq %rax,%r13 + cmovcq %rbx,%r14 + cmovcq %r12,%r15 + movq %r13,0(%rsi) + cmovcq %rbp,%r9 + movq %r14,8(%rsi) + movq %r15,16(%rsi) + movq %r9,24(%rsi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulq_mont_sparse_256,.-__mulq_mont_sparse_256 +.globl from_mont_256 +.hidden from_mont_256 +.type from_mont_256,@function +.align 32 +from_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulq_by_1_mont_256 + + + + + + movq %r14,%r10 + movq %r15,%r11 + movq %r9,%r12 + + subq 0(%rbx),%r13 + sbbq 8(%rbx),%r14 + sbbq 16(%rbx),%r15 + sbbq 24(%rbx),%r9 + + cmovncq %r13,%rax + cmovncq %r14,%r10 + cmovncq %r15,%r11 + movq %rax,0(%rdi) + cmovncq %r9,%r12 + movq %r10,8(%rdi) + movq %r11,16(%rdi) + movq %r12,24(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size from_mont_256,.-from_mont_256 + +.globl redc_mont_256 +.hidden redc_mont_256 +.type redc_mont_256,@function +.align 32 +redc_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulq_by_1_mont_256 
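+# __mulq_by_1_mont_256 has reduced the low 256 bits; fold in the upper half of
+# the 512-bit input, then finish with a conditional subtraction of the modulus.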
+ + addq 32(%rsi),%r13 + adcq 40(%rsi),%r14 + movq %r13,%rax + adcq 48(%rsi),%r15 + movq %r14,%r10 + adcq 56(%rsi),%r9 + sbbq %rsi,%rsi + + + + + movq %r15,%r11 + subq 0(%rbx),%r13 + sbbq 8(%rbx),%r14 + sbbq 16(%rbx),%r15 + movq %r9,%r12 + sbbq 24(%rbx),%r9 + sbbq $0,%rsi + + cmovncq %r13,%rax + cmovncq %r14,%r10 + cmovncq %r15,%r11 + movq %rax,0(%rdi) + cmovncq %r9,%r12 + movq %r10,8(%rdi) + movq %r11,16(%rdi) + movq %r12,24(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size redc_mont_256,.-redc_mont_256 +.type __mulq_by_1_mont_256,@function +.align 32 +__mulq_by_1_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + + movq %rax,%r13 + imulq %rcx,%rax + movq %rax,%r9 + + mulq 0(%rbx) + addq %rax,%r13 + movq %r9,%rax + adcq %rdx,%r13 + + mulq 8(%rbx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %r13,%r10 + adcq $0,%rdx + movq %rdx,%r13 + + mulq 16(%rbx) + movq %r10,%r14 + imulq %rcx,%r10 + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %r13,%r11 + adcq $0,%rdx + movq %rdx,%r13 + + mulq 24(%rbx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r13,%r12 + adcq $0,%rdx + movq %rdx,%r13 + + mulq 0(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq %rdx,%r14 + + mulq 8(%rbx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r11 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 16(%rbx) + movq %r11,%r15 + imulq %rcx,%r11 + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r12 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 24(%rbx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r14,%r13 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq %rdx,%r15 + + mulq 8(%rbx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rbx) + movq %r12,%r9 + imulq %rcx,%r12 + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r15,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rbx) + addq %rax,%r9 + movq %r12,%rax + adcq %rdx,%r9 + + mulq 8(%rbx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r9,%r15 + adcq $0,%rdx + movq %rdx,%r9 + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulq_by_1_mont_256,.-__mulq_by_1_mont_256 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/build/elf/mulq_mont_384-x86_64.s b/build/elf/mulq_mont_384-x86_64.s new file mode 100644 index 00000000..dabf3ff5 --- /dev/null +++ b/build/elf/mulq_mont_384-x86_64.s @@ -0,0 +1,3619 @@ +.text + + + + + + + +.type __sub_mod_384x384,@function +.align 32 +__sub_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + subq 0(%rdx),%r8 + movq 
56(%rsi),%r15 + sbbq 8(%rdx),%r9 + movq 64(%rsi),%rax + sbbq 16(%rdx),%r10 + movq 72(%rsi),%rbx + sbbq 24(%rdx),%r11 + movq 80(%rsi),%rbp + sbbq 32(%rdx),%r12 + movq 88(%rsi),%rsi + sbbq 40(%rdx),%r13 + movq %r8,0(%rdi) + sbbq 48(%rdx),%r14 + movq 0(%rcx),%r8 + movq %r9,8(%rdi) + sbbq 56(%rdx),%r15 + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + sbbq 64(%rdx),%rax + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + sbbq 72(%rdx),%rbx + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + sbbq 80(%rdx),%rbp + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + sbbq 88(%rdx),%rsi + movq 40(%rcx),%r13 + sbbq %rdx,%rdx + + andq %rdx,%r8 + andq %rdx,%r9 + andq %rdx,%r10 + andq %rdx,%r11 + andq %rdx,%r12 + andq %rdx,%r13 + + addq %r8,%r14 + adcq %r9,%r15 + movq %r14,48(%rdi) + adcq %r10,%rax + movq %r15,56(%rdi) + adcq %r11,%rbx + movq %rax,64(%rdi) + adcq %r12,%rbp + movq %rbx,72(%rdi) + adcq %r13,%rsi + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __sub_mod_384x384,.-__sub_mod_384x384 + +.type __add_mod_384,@function +.align 32 +__add_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + movq %r8,%r14 + adcq 24(%rdx),%r11 + movq %r9,%r15 + adcq 32(%rdx),%r12 + movq %r10,%rax + adcq 40(%rdx),%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,0(%rdi) + cmovcq %rbx,%r11 + movq %r9,8(%rdi) + cmovcq %rbp,%r12 + movq %r10,16(%rdi) + cmovcq %rsi,%r13 + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __add_mod_384,.-__add_mod_384 + +.type __sub_mod_384,@function +.align 32 +__sub_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +__sub_mod_384_a_is_loaded: + subq 0(%rdx),%r8 + movq 0(%rcx),%r14 + sbbq 8(%rdx),%r9 + movq 8(%rcx),%r15 + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rax + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbx + sbbq 32(%rdx),%r12 + movq 32(%rcx),%rbp + sbbq 40(%rdx),%r13 + movq 40(%rcx),%rsi + sbbq %rdx,%rdx + + andq %rdx,%r14 + andq %rdx,%r15 + andq %rdx,%rax + andq %rdx,%rbx + andq %rdx,%rbp + andq %rdx,%rsi + + addq %r14,%r8 + adcq %r15,%r9 + movq %r8,0(%rdi) + adcq %rax,%r10 + movq %r9,8(%rdi) + adcq %rbx,%r11 + movq %r10,16(%rdi) + adcq %rbp,%r12 + movq %r11,24(%rdi) + adcq %rsi,%r13 + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __sub_mod_384,.-__sub_mod_384 +.globl mul_mont_384x +.hidden mul_mont_384x +.type mul_mont_384x,@function +.align 32 +mul_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $328,%rsp +.cfi_adjust_cfa_offset 328 + + + movq %rdx,%rbx + movq %rdi,32(%rsp) + movq %rsi,24(%rsp) + movq %rdx,16(%rsp) + movq %rcx,8(%rsp) + movq %r8,0(%rsp) + + + + + leaq 40(%rsp),%rdi + call 
__mulq_384 + + + leaq 48(%rbx),%rbx + leaq 48(%rsi),%rsi + leaq 40+96(%rsp),%rdi + call __mulq_384 + + + movq 8(%rsp),%rcx + leaq -48(%rsi),%rdx + leaq 40+192+48(%rsp),%rdi + call __add_mod_384 + + movq 16(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq -48(%rdi),%rdi + call __add_mod_384 + + leaq (%rdi),%rbx + leaq 48(%rdi),%rsi + call __mulq_384 + + + leaq (%rdi),%rsi + leaq 40(%rsp),%rdx + movq 8(%rsp),%rcx + call __sub_mod_384x384 + + leaq (%rdi),%rsi + leaq -96(%rdi),%rdx + call __sub_mod_384x384 + + + leaq 40(%rsp),%rsi + leaq 40+96(%rsp),%rdx + leaq 40(%rsp),%rdi + call __sub_mod_384x384 + + movq %rcx,%rbx + + + leaq 40(%rsp),%rsi + movq 0(%rsp),%rcx + movq 32(%rsp),%rdi + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + + leaq 40+192(%rsp),%rsi + movq 0(%rsp),%rcx + leaq 48(%rdi),%rdi + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + leaq 328(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -328-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_mont_384x,.-mul_mont_384x +.globl sqr_mont_384x +.hidden sqr_mont_384x +.type sqr_mont_384x,@function +.align 32 +sqr_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + movq %rsi,16(%rsp) +.byte 102,72,15,110,199 + + + leaq 48(%rsi),%rdx + leaq 32(%rsp),%rdi + call __add_mod_384 + + + movq 16(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq 32+48(%rsp),%rdi + call __sub_mod_384 + + + movq 16(%rsp),%rsi + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rax + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + + call __mulq_mont_384 + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + movq %r14,%r12 + adcq %r9,%r9 + movq %r15,%r13 + adcq %r10,%r10 + movq %r8,%rax + adcq %r11,%r11 + movq %r9,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + movq %r10,%rbp + sbbq 16(%rcx),%r8 + sbbq 24(%rcx),%r9 + sbbq 32(%rcx),%r10 + movq %r11,%rsi + sbbq 40(%rcx),%r11 + sbbq $0,%rdx + + cmovcq %r12,%r14 + cmovcq %r13,%r15 + cmovcq %rax,%r8 + movq %r14,48(%rdi) + cmovcq %rbx,%r9 + movq %r15,56(%rdi) + cmovcq %rbp,%r10 + movq %r8,64(%rdi) + cmovcq %rsi,%r11 + movq %r9,72(%rdi) + movq %r10,80(%rdi) + movq %r11,88(%rdi) + + leaq 32(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rax + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%r12 + movq 32+24(%rsp),%r13 + + call __mulq_mont_384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqr_mont_384x,.-sqr_mont_384x + +.globl mul_382x +.hidden mul_382x +.type mul_382x,@function +.align 32 +mul_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 
8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + leaq 96(%rdi),%rdi + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + movq %rdi,16(%rsp) + movq %rcx,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 48(%rsi),%r8 + adcq 56(%rsi),%r9 + adcq 64(%rsi),%r10 + adcq 72(%rsi),%r11 + adcq 80(%rsi),%r12 + adcq 88(%rsi),%r13 + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + movq 32(%rdx),%r12 + movq 40(%rdx),%r13 + + addq 48(%rdx),%r8 + adcq 56(%rdx),%r9 + adcq 64(%rdx),%r10 + adcq 72(%rdx),%r11 + adcq 80(%rdx),%r12 + adcq 88(%rdx),%r13 + + movq %r8,32+48(%rsp) + movq %r9,32+56(%rsp) + movq %r10,32+64(%rsp) + movq %r11,32+72(%rsp) + movq %r12,32+80(%rsp) + movq %r13,32+88(%rsp) + + + leaq 32+0(%rsp),%rsi + leaq 32+48(%rsp),%rbx + call __mulq_384 + + + movq 0(%rsp),%rsi + movq 8(%rsp),%rbx + leaq -96(%rdi),%rdi + call __mulq_384 + + + leaq 48(%rsi),%rsi + leaq 48(%rbx),%rbx + leaq 32(%rsp),%rdi + call __mulq_384 + + + movq 16(%rsp),%rsi + leaq 32(%rsp),%rdx + movq 24(%rsp),%rcx + movq %rsi,%rdi + call __sub_mod_384x384 + + + leaq 0(%rdi),%rsi + leaq -96(%rdi),%rdx + call __sub_mod_384x384 + + + leaq -96(%rdi),%rsi + leaq 32(%rsp),%rdx + leaq -96(%rdi),%rdi + call __sub_mod_384x384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_382x,.-mul_382x +.globl sqr_382x +.hidden sqr_382x +.type sqr_382x,@function +.align 32 +sqr_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rcx + + + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%rbx + movq 32(%rsi),%rbp + movq 40(%rsi),%rdx + + movq %r14,%r8 + addq 48(%rsi),%r14 + movq %r15,%r9 + adcq 56(%rsi),%r15 + movq %rax,%r10 + adcq 64(%rsi),%rax + movq %rbx,%r11 + adcq 72(%rsi),%rbx + movq %rbp,%r12 + adcq 80(%rsi),%rbp + movq %rdx,%r13 + adcq 88(%rsi),%rdx + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %rax,16(%rdi) + movq %rbx,24(%rdi) + movq %rbp,32(%rdi) + movq %rdx,40(%rdi) + + + leaq 48(%rsi),%rdx + leaq 48(%rdi),%rdi + call __sub_mod_384_a_is_loaded + + + leaq (%rdi),%rsi + leaq -48(%rdi),%rbx + leaq -48(%rdi),%rdi + call __mulq_384 + + + movq (%rsp),%rsi + leaq 48(%rsi),%rbx + leaq 96(%rdi),%rdi + call __mulq_384 + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq 
40(%rdi),%r13 + movq 48(%rdi),%r14 + movq 56(%rdi),%r15 + movq 64(%rdi),%rax + movq 72(%rdi),%rbx + movq 80(%rdi),%rbp + addq %r8,%r8 + movq 88(%rdi),%rdx + adcq %r9,%r9 + movq %r8,0(%rdi) + adcq %r10,%r10 + movq %r9,8(%rdi) + adcq %r11,%r11 + movq %r10,16(%rdi) + adcq %r12,%r12 + movq %r11,24(%rdi) + adcq %r13,%r13 + movq %r12,32(%rdi) + adcq %r14,%r14 + movq %r13,40(%rdi) + adcq %r15,%r15 + movq %r14,48(%rdi) + adcq %rax,%rax + movq %r15,56(%rdi) + adcq %rbx,%rbx + movq %rax,64(%rdi) + adcq %rbp,%rbp + movq %rbx,72(%rdi) + adcq %rdx,%rdx + movq %rbp,80(%rdi) + movq %rdx,88(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -8*7 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqr_382x,.-sqr_382x +.globl mul_384 +.hidden mul_384 +.type mul_384,@function +.align 32 +mul_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + + + movq %rdx,%rbx + call __mulq_384 + + movq 0(%rsp),%r12 +.cfi_restore %r12 + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_384,.-mul_384 + +.type __mulq_384,@function +.align 32 +__mulq_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rbx),%rax + + movq %rax,%rbp + mulq 0(%rsi) + movq %rax,0(%rdi) + movq %rbp,%rax + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r11 + movq 8(%rbx),%rax + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,8(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,16(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 
0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,24(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 32(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,32(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 40(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,40(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq %rax,%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rcx,48(%rdi) + movq %r8,56(%rdi) + movq %r9,64(%rdi) + movq %r10,72(%rdi) + movq %r11,80(%rdi) + movq %r12,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulq_384,.-__mulq_384 +.globl sqr_384 +.hidden sqr_384 +.type sqr_384,@function +.align 32 +sqr_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __sqrq_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqr_384,.-sqr_384 + +.type __sqrq_384,@function +.align 32 +__sqrq_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r15 + movq 16(%rsi),%rcx + movq 24(%rsi),%rbx + + + movq %rax,%r14 + mulq %r15 + movq %rax,%r9 + movq %r14,%rax + movq 32(%rsi),%rbp + movq %rdx,%r10 + + mulq %rcx + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + movq 40(%rsi),%rsi + movq %rdx,%r11 + + mulq %rbx + 
addq %rax,%r11 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq %rbp + addq %rax,%r12 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r13 + + mulq %rsi + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + + mulq %rax + xorq %r8,%r8 + movq %rax,0(%rdi) + movq %r15,%rax + addq %r9,%r9 + adcq $0,%r8 + addq %rdx,%r9 + adcq $0,%r8 + movq %r9,8(%rdi) + + mulq %rcx + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq %rbx + addq %rax,%r12 + movq %r15,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq %rbp + addq %rax,%r13 + movq %r15,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq %rsi + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq %rax + xorq %r9,%r9 + addq %rax,%r8 + movq %rcx,%rax + addq %r10,%r10 + adcq %r11,%r11 + adcq $0,%r9 + addq %r8,%r10 + adcq %rdx,%r11 + adcq $0,%r9 + movq %r10,16(%rdi) + + mulq %rbx + addq %rax,%r13 + movq %rcx,%rax + adcq $0,%rdx + movq %r11,24(%rdi) + movq %rdx,%r8 + + mulq %rbp + addq %rax,%r14 + movq %rcx,%rax + adcq $0,%rdx + addq %r8,%r14 + adcq $0,%rdx + movq %rdx,%r8 + + mulq %rsi + addq %rax,%r15 + movq %rcx,%rax + adcq $0,%rdx + addq %r8,%r15 + adcq $0,%rdx + movq %rdx,%rcx + + mulq %rax + xorq %r11,%r11 + addq %rax,%r9 + movq %rbx,%rax + addq %r12,%r12 + adcq %r13,%r13 + adcq $0,%r11 + addq %r9,%r12 + adcq %rdx,%r13 + adcq $0,%r11 + movq %r12,32(%rdi) + + + mulq %rbp + addq %rax,%r15 + movq %rbx,%rax + adcq $0,%rdx + movq %r13,40(%rdi) + movq %rdx,%r8 + + mulq %rsi + addq %rax,%rcx + movq %rbx,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%rbx + + mulq %rax + xorq %r12,%r12 + addq %rax,%r11 + movq %rbp,%rax + addq %r14,%r14 + adcq %r15,%r15 + adcq $0,%r12 + addq %r11,%r14 + adcq %rdx,%r15 + movq %r14,48(%rdi) + adcq $0,%r12 + movq %r15,56(%rdi) + + + mulq %rsi + addq %rax,%rbx + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq %rax + xorq %r13,%r13 + addq %rax,%r12 + movq %rsi,%rax + addq %rcx,%rcx + adcq %rbx,%rbx + adcq $0,%r13 + addq %r12,%rcx + adcq %rdx,%rbx + movq %rcx,64(%rdi) + adcq $0,%r13 + movq %rbx,72(%rdi) + + + mulq %rax + addq %r13,%rax + addq %rbp,%rbp + adcq $0,%rdx + addq %rbp,%rax + adcq $0,%rdx + movq %rax,80(%rdi) + movq %rdx,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __sqrq_384,.-__sqrq_384 + +.globl sqr_mont_384 +.hidden sqr_mont_384 +.type sqr_mont_384,@function +.align 32 +sqr_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $120,%rsp +.cfi_adjust_cfa_offset 8*15 + + + movq %rcx,96(%rsp) + movq %rdx,104(%rsp) + movq %rdi,112(%rsp) + + movq %rsp,%rdi + call __sqrq_384 + + leaq 0(%rsp),%rsi + movq 96(%rsp),%rcx + movq 104(%rsp),%rbx + movq 112(%rsp),%rdi + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + leaq 120(%rsp),%r8 + movq 120(%rsp),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -8*21 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqr_mont_384,.-sqr_mont_384 
+ + + +.globl redc_mont_384 +.hidden redc_mont_384 +.type redc_mont_384,@function +.align 32 +redc_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size redc_mont_384,.-redc_mont_384 + + + + +.globl from_mont_384 +.hidden from_mont_384 +.type from_mont_384,@function +.align 32 +from_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulq_by_1_mont_384 + + + + + + movq %r15,%rcx + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size from_mont_384,.-from_mont_384 +.type __mulq_by_1_mont_384,@function +.align 32 +__mulq_by_1_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rax,%r14 + imulq %rcx,%rax + movq %rax,%r8 + + mulq 0(%rbx) + addq %rax,%r14 + movq %r8,%rax + adcq %rdx,%r14 + + mulq 8(%rbx) + addq %rax,%r9 + movq %r8,%rax + adcq $0,%rdx + addq %r14,%r9 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 16(%rbx) + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + addq %r14,%r10 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 24(%rbx) + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %r9,%r15 + imulq %rcx,%r9 + addq %r14,%r11 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 32(%rbx) + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %r14,%r12 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 40(%rbx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %r14,%r13 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rbx) + addq %rax,%r15 + movq %r9,%rax + adcq %rdx,%r15 + + mulq 8(%rbx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %r15,%r10 + adcq 
$0,%rdx + movq %rdx,%r15 + + mulq 16(%rbx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %r15,%r11 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rbx) + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + movq %r10,%r8 + imulq %rcx,%r10 + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 32(%rbx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 40(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %r15,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rbx) + addq %rax,%r8 + movq %r10,%rax + adcq %rdx,%r8 + + mulq 8(%rbx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rbx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r8,%r12 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 24(%rbx) + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + movq %r11,%r9 + imulq %rcx,%r11 + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %r8,%r14 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %r8,%r15 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 0(%rbx) + addq %rax,%r9 + movq %r11,%rax + adcq %rdx,%r9 + + mulq 8(%rbx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rbx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rbx) + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + movq %r12,%r10 + imulq %rcx,%r12 + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %r9,%r15 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rbx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 0(%rbx) + addq %rax,%r10 + movq %r12,%rax + adcq %rdx,%r10 + + mulq 8(%rbx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 24(%rbx) + addq %rax,%r15 + movq %r12,%rax + adcq $0,%rdx + movq %r13,%r11 + imulq %rcx,%r13 + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rbx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r8 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rbx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 0(%rbx) + addq %rax,%r11 + movq %r13,%rax + adcq %rdx,%r11 + + mulq 8(%rbx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 24(%rbx) + addq %rax,%r8 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rbx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r9 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rbx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulq_by_1_mont_384,.-__mulq_by_1_mont_384 + +.type __redc_tail_mont_384,@function +.align 32 +__redc_tail_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + addq 48(%rsi),%r14 + movq %r14,%rax + adcq 56(%rsi),%r15 + adcq 64(%rsi),%r8 + adcq 72(%rsi),%r9 + movq %r15,%rcx + adcq 80(%rsi),%r10 + adcq 88(%rsi),%r11 + sbbq 
%r12,%r12 + + + + + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __redc_tail_mont_384,.-__redc_tail_mont_384 + +.globl sgn0_pty_mont_384 +.hidden sgn0_pty_mont_384 +.type sgn0_pty_mont_384,@function +.align 32 +sgn0_pty_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + leaq 0(%rdi),%rsi + movq %rdx,%rcx + call __mulq_by_1_mont_384 + + xorq %rax,%rax + movq %r14,%r13 + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + notq %rax + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sgn0_pty_mont_384,.-sgn0_pty_mont_384 + +.globl sgn0_pty_mont_384x +.hidden sgn0_pty_mont_384x +.type sgn0_pty_mont_384x,@function +.align 32 +sgn0_pty_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + leaq 48(%rdi),%rsi + movq %rdx,%rcx + call __mulq_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + leaq 0(%rdi),%rsi + xorq %rdi,%rdi + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rdi + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rdi + + movq %r14,0(%rsp) + notq %rdi + andq $1,%r13 + andq $2,%rdi + orq %r13,%rdi + + call __mulq_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + xorq %rax,%rax + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + movq 0(%rsp),%r12 + + notq %rax + + testq %r14,%r14 + cmovzq 
%rdi,%r13 + + testq %r12,%r12 + cmovnzq %rdi,%rax + + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sgn0_pty_mont_384x,.-sgn0_pty_mont_384x +.globl mul_mont_384 +.hidden mul_mont_384 +.type mul_mont_384,@function +.align 32 +mul_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %r8 +.cfi_adjust_cfa_offset 8 + + + movq 0(%rdx),%rax + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + movq %rdx,%rbx +.byte 102,72,15,110,199 + + call __mulq_mont_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_mont_384,.-mul_mont_384 +.type __mulq_mont_384,@function +.align 32 +__mulq_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rax,%rdi + mulq %r14 + movq %rax,%r8 + movq %rdi,%rax + movq %rdx,%r9 + + mulq %r15 + addq %rax,%r9 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %r12 + addq %rax,%r10 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r11 + + movq %r8,%rbp + imulq 8(%rsp),%r8 + + mulq %r13 + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r13 + + mulq 40(%rsi) + addq %rax,%r13 + movq %r8,%rax + adcq $0,%rdx + xorq %r15,%r15 + movq %rdx,%r14 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r8,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r9 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r9 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r10 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r13 + movq 8(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq %rdx,%r14 + adcq $0,%r15 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r9 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 8(%rsi) + addq %rax,%r10 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r10 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + movq %r9,%rbp + imulq 8(%rsp),%r9 + + mulq 24(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r12 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rsi) + addq %r8,%r14 + adcq $0,%rdx + xorq %r8,%r8 + addq %rax,%r14 + movq %r9,%rax + adcq %rdx,%r15 + adcq 
$0,%r8 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r9,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r10 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r14 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq %rdx,%r15 + adcq $0,%r8 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r10 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 8(%rsi) + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r11 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + movq %r10,%rbp + imulq 8(%rsp),%r10 + + mulq 24(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rsi) + addq %r9,%r15 + adcq $0,%rdx + xorq %r9,%r9 + addq %rax,%r15 + movq %r10,%rax + adcq %rdx,%r8 + adcq $0,%r9 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r10,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r15 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq %rdx,%r8 + adcq $0,%r9 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 8(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r12 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + movq %r11,%rbp + imulq 8(%rsp),%r11 + + mulq 24(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r15 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rsi) + addq %r10,%r8 + adcq $0,%rdx + xorq %r10,%r10 + addq %rax,%r8 + movq %r11,%rax + adcq %rdx,%r9 + adcq $0,%r10 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r11,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r14 + adcq $0,%rdx + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r8 + movq 32(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r8 + adcq %rdx,%r9 + adcq $0,%r10 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 8(%rsi) + addq %rax,%r13 + movq %rdi,%rax 
+ adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + movq %r12,%rbp + imulq 8(%rsp),%r12 + + mulq 24(%rsi) + addq %rax,%r15 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rsi) + addq %rax,%r8 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %r11,%r9 + adcq $0,%rdx + xorq %r11,%r11 + addq %rax,%r9 + movq %r12,%rax + adcq %rdx,%r10 + adcq $0,%r11 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r12,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r8 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r9 + movq 40(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r9 + adcq %rdx,%r10 + adcq $0,%r11 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 8(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r14 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 16(%rsi) + addq %rax,%r15 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r15 + adcq $0,%rdx + movq %rdx,%r12 + + movq %r13,%rbp + imulq 8(%rsp),%r13 + + mulq 24(%rsi) + addq %rax,%r8 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r8 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rsi) + addq %rax,%r9 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r9 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 40(%rsi) + addq %r12,%r10 + adcq $0,%rdx + xorq %r12,%r12 + addq %rax,%r10 + movq %r13,%rax + adcq %rdx,%r11 + adcq $0,%r12 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r13,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %rbp,%r9 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %rbp,%r10 + adcq %rdx,%r11 + adcq $0,%r12 + + + + +.byte 102,72,15,126,199 + subq 0(%rcx),%r14 + movq %r15,%rdx + sbbq 8(%rcx),%r15 + movq %r8,%rbx + sbbq 16(%rcx),%r8 + movq %r9,%rsi + sbbq 24(%rcx),%r9 + movq %r10,%rbp + sbbq 32(%rcx),%r10 + movq %r11,%r13 + sbbq 40(%rcx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r14 + cmovcq %rdx,%r15 + cmovcq %rbx,%r8 + movq %r14,0(%rdi) + cmovcq %rsi,%r9 + movq %r15,8(%rdi) + cmovcq %rbp,%r10 + movq %r8,16(%rdi) + cmovcq %r13,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulq_mont_384,.-__mulq_mont_384 +.globl sqr_n_mul_mont_384 +.hidden sqr_n_mul_mont_384 +.type sqr_n_mul_mont_384,@function +.align 32 +sqr_n_mul_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 
+.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 8*17 + + + movq %r8,0(%rsp) + movq %rcx,8(%rsp) +.byte 102,72,15,110,199 + leaq 32(%rsp),%rdi + movq %r9,24(%rsp) + movq (%r9),%xmm2 + +.Loop_sqr_384: + movd %edx,%xmm1 + + call __sqrq_384 + + leaq 0(%rdi),%rsi + movq 0(%rsp),%rcx + movq 8(%rsp),%rbx + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + movd %xmm1,%edx + leaq 0(%rdi),%rsi + decl %edx + jnz .Loop_sqr_384 + +.byte 102,72,15,126,208 + movq %rbx,%rcx + movq 24(%rsp),%rbx + + + + + + + movq %r8,%r12 + movq %r9,%r13 + + call __mulq_mont_384 + + leaq 136(%rsp),%r8 + movq 136(%rsp),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -8*23 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqr_n_mul_mont_384,.-sqr_n_mul_mont_384 + +.globl sqr_n_mul_mont_383 +.hidden sqr_n_mul_mont_383 +.type sqr_n_mul_mont_383,@function +.align 32 +sqr_n_mul_mont_383: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 8*17 + + + movq %r8,0(%rsp) + movq %rcx,8(%rsp) +.byte 102,72,15,110,199 + leaq 32(%rsp),%rdi + movq %r9,24(%rsp) + movq (%r9),%xmm2 + +.Loop_sqr_383: + movd %edx,%xmm1 + + call __sqrq_384 + + leaq 0(%rdi),%rsi + movq 0(%rsp),%rcx + movq 8(%rsp),%rbx + call __mulq_by_1_mont_384 + + movd %xmm1,%edx + addq 48(%rsi),%r14 + adcq 56(%rsi),%r15 + adcq 64(%rsi),%r8 + adcq 72(%rsi),%r9 + adcq 80(%rsi),%r10 + adcq 88(%rsi),%r11 + leaq 0(%rdi),%rsi + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + decl %edx + jnz .Loop_sqr_383 + +.byte 102,72,15,126,208 + movq %rbx,%rcx + movq 24(%rsp),%rbx + + + + + + + movq %r8,%r12 + movq %r9,%r13 + + call __mulq_mont_384 + + leaq 136(%rsp),%r8 + movq 136(%rsp),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -8*23 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqr_n_mul_mont_383,.-sqr_n_mul_mont_383 +.type __mulq_mont_383_nonred,@function +.align 32 +__mulq_mont_383_nonred: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rax,%rbp + mulq %r14 + movq %rax,%r8 + movq %rbp,%rax + movq %rdx,%r9 + + mulq %r15 + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %r12 + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + movq %r8,%r15 + imulq 8(%rsp),%r8 + + mulq %r13 + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r13 + + mulq 40(%rsi) + addq %rax,%r13 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rcx) + addq %rax,%r15 + movq %r8,%rax + adcq %rdx,%r15 + + mulq 8(%rcx) + addq 
%rax,%r9 + movq %r8,%rax + adcq $0,%rdx + addq %r15,%r9 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rcx) + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + addq %r15,%r10 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rcx) + addq %r15,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq 32(%rcx) + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 40(%rcx) + addq %rax,%r13 + movq 8(%rbx),%rax + adcq $0,%rdx + addq %r15,%r13 + adcq %rdx,%r14 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq 8(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r10 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r11 + adcq $0,%rdx + movq %rdx,%r15 + + movq %r9,%r8 + imulq 8(%rsp),%r9 + + mulq 24(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 32(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 40(%rsi) + addq %r15,%r14 + adcq $0,%rdx + addq %rax,%r14 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rcx) + addq %rax,%r8 + movq %r9,%rax + adcq %rdx,%r8 + + mulq 8(%rcx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %r8,%r10 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rcx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 24(%rcx) + addq %r8,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rcx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rcx) + addq %rax,%r14 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %r8,%r14 + adcq %rdx,%r15 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 8(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r12 + adcq $0,%rdx + movq %rdx,%r8 + + movq %r10,%r9 + imulq 8(%rsp),%r10 + + mulq 24(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r14 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rsi) + addq %r8,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 0(%rcx) + addq %rax,%r9 + movq %r10,%rax + adcq %rdx,%r9 + + mulq 8(%rcx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r9,%r11 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rcx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rcx) + addq %r9,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rcx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rcx) + addq %rax,%r15 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %r9,%r15 + adcq %rdx,%r8 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 8(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + movq %r11,%r10 + imulq 
8(%rsp),%r11 + + mulq 24(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rsi) + addq %rax,%r15 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r15 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rsi) + addq %r9,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 0(%rcx) + addq %rax,%r10 + movq %r11,%rax + adcq %rdx,%r10 + + mulq 8(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r10,%r12 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 24(%rcx) + addq %r10,%r14 + adcq $0,%rdx + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rcx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rcx) + addq %rax,%r8 + movq 32(%rbx),%rax + adcq $0,%rdx + addq %r10,%r8 + adcq %rdx,%r9 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 8(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + movq %r12,%r11 + imulq 8(%rsp),%r12 + + mulq 24(%rsi) + addq %rax,%r15 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r8 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rsi) + addq %r10,%r9 + adcq $0,%rdx + addq %rax,%r9 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 0(%rcx) + addq %rax,%r11 + movq %r12,%rax + adcq %rdx,%r11 + + mulq 8(%rcx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rcx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 24(%rcx) + addq %r11,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rcx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rcx) + addq %rax,%r9 + movq 40(%rbx),%rax + adcq $0,%rdx + addq %r11,%r9 + adcq %rdx,%r10 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 8(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rsi) + addq %rax,%r15 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + movq %r13,%r12 + imulq 8(%rsp),%r13 + + mulq 24(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r9 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %r11,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 0(%rcx) + addq %rax,%r12 + movq %r13,%rax + adcq %rdx,%r12 + + mulq 8(%rcx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %r12,%r14 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 16(%rcx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r12,%r15 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 24(%rcx) + addq %r12,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rcx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %r12,%r9 + adcq 
$0,%rdx + movq %rdx,%r12 + + mulq 40(%rcx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %r12,%r10 + adcq %rdx,%r11 + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulq_mont_383_nonred,.-__mulq_mont_383_nonred +.globl sqr_mont_382x +.hidden sqr_mont_382x +.type sqr_mont_382x,@function +.align 32 +sqr_mont_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + movq %rsi,16(%rsp) + movq %rdi,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %r8,%r14 + addq 48(%rsi),%r8 + movq %r9,%r15 + adcq 56(%rsi),%r9 + movq %r10,%rax + adcq 64(%rsi),%r10 + movq %r11,%rdx + adcq 72(%rsi),%r11 + movq %r12,%rbx + adcq 80(%rsi),%r12 + movq %r13,%rbp + adcq 88(%rsi),%r13 + + subq 48(%rsi),%r14 + sbbq 56(%rsi),%r15 + sbbq 64(%rsi),%rax + sbbq 72(%rsi),%rdx + sbbq 80(%rsi),%rbx + sbbq 88(%rsi),%rbp + sbbq %rdi,%rdi + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + movq %r14,32+48(%rsp) + movq %r15,32+56(%rsp) + movq %rax,32+64(%rsp) + movq %rdx,32+72(%rsp) + movq %rbx,32+80(%rsp) + movq %rbp,32+88(%rsp) + movq %rdi,32+96(%rsp) + + + + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rax + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + + movq 24(%rsp),%rdi + call __mulq_mont_383_nonred + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + + movq %r14,48(%rdi) + movq %r15,56(%rdi) + movq %r8,64(%rdi) + movq %r9,72(%rdi) + movq %r10,80(%rdi) + movq %r11,88(%rdi) + + leaq 32(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rax + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%r12 + movq 32+24(%rsp),%r13 + + call __mulq_mont_383_nonred + movq 32+96(%rsp),%rsi + movq 32+0(%rsp),%r12 + movq 32+8(%rsp),%r13 + andq %rsi,%r12 + movq 32+16(%rsp),%rax + andq %rsi,%r13 + movq 32+24(%rsp),%rbx + andq %rsi,%rax + movq 32+32(%rsp),%rbp + andq %rsi,%rbx + andq %rsi,%rbp + andq 32+40(%rsp),%rsi + + subq %r12,%r14 + movq 0(%rcx),%r12 + sbbq %r13,%r15 + movq 8(%rcx),%r13 + sbbq %rax,%r8 + movq 16(%rcx),%rax + sbbq %rbx,%r9 + movq 24(%rcx),%rbx + sbbq %rbp,%r10 + movq 32(%rcx),%rbp + sbbq %rsi,%r11 + sbbq %rsi,%rsi + + andq %rsi,%r12 + andq %rsi,%r13 + andq %rsi,%rax + andq %rsi,%rbx + andq %rsi,%rbp + andq 40(%rcx),%rsi + + addq %r12,%r14 + adcq %r13,%r15 + adcq %rax,%r8 + adcq %rbx,%r9 + adcq %rbp,%r10 + adcq %rsi,%r11 + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqr_mont_382x,.-sqr_mont_382x + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 
4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/build/elf/mulx_mont_256-x86_64.s b/build/elf/mulx_mont_256-x86_64.s new file mode 100644 index 00000000..20a02073 --- /dev/null +++ b/build/elf/mulx_mont_256-x86_64.s @@ -0,0 +1,627 @@ +.text + +.globl mulx_mont_sparse_256 +.hidden mulx_mont_sparse_256 +.type mulx_mont_sparse_256,@function +.align 32 +mulx_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rbp + movq 24(%rsi),%r9 + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%rax,%r11 + call __mulx_mont_sparse_256 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mulx_mont_sparse_256,.-mulx_mont_sparse_256 + +.globl sqrx_mont_sparse_256 +.hidden sqrx_mont_sparse_256 +.type sqrx_mont_sparse_256,@function +.align 32 +sqrx_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + movq %rcx,%r8 + movq %rdx,%rcx + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rbp + movq 24(%rsi),%r9 + leaq -128(%rbx),%rsi + leaq -128(%rcx),%rcx + + mulxq %rdx,%rax,%r11 + call __mulx_mont_sparse_256 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqrx_mont_sparse_256,.-sqrx_mont_sparse_256 +.type __mulx_mont_sparse_256,@function +.align 32 +__mulx_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + mulxq %r15,%r15,%r12 + mulxq %rbp,%rbp,%r13 + addq %r15,%r11 + mulxq %r9,%r9,%r14 + movq 8(%rbx),%rdx + adcq %rbp,%r12 + adcq %r9,%r13 + adcq $0,%r14 + + movq %rax,%r10 + imulq %r8,%rax + + + xorq %r15,%r15 + mulxq 0+128(%rsi),%rbp,%r9 + adoxq %rbp,%r11 + adcxq %r9,%r12 + + mulxq 8+128(%rsi),%rbp,%r9 + adoxq %rbp,%r12 + adcxq %r9,%r13 + + mulxq 16+128(%rsi),%rbp,%r9 + adoxq %rbp,%r13 + adcxq %r9,%r14 + + mulxq 24+128(%rsi),%rbp,%r9 + movq %rax,%rdx + adoxq %rbp,%r14 + adcxq %r15,%r9 + adoxq %r9,%r15 + + + mulxq 0+128(%rcx),%rbp,%rax + adcxq %rbp,%r10 + adoxq %r11,%rax + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%rax + adoxq %r9,%r12 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r12 + adoxq %r9,%r13 + + mulxq 24+128(%rcx),%rbp,%r9 + movq 16(%rbx),%rdx + 
adcxq %rbp,%r13 + adoxq %r9,%r14 + adcxq %r10,%r14 + adoxq %r10,%r15 + adcxq %r10,%r15 + adoxq %r10,%r10 + adcq $0,%r10 + movq %rax,%r11 + imulq %r8,%rax + + + xorq %rbp,%rbp + mulxq 0+128(%rsi),%rbp,%r9 + adoxq %rbp,%r12 + adcxq %r9,%r13 + + mulxq 8+128(%rsi),%rbp,%r9 + adoxq %rbp,%r13 + adcxq %r9,%r14 + + mulxq 16+128(%rsi),%rbp,%r9 + adoxq %rbp,%r14 + adcxq %r9,%r15 + + mulxq 24+128(%rsi),%rbp,%r9 + movq %rax,%rdx + adoxq %rbp,%r15 + adcxq %r10,%r9 + adoxq %r9,%r10 + + + mulxq 0+128(%rcx),%rbp,%rax + adcxq %rbp,%r11 + adoxq %r12,%rax + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%rax + adoxq %r9,%r13 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r13 + adoxq %r9,%r14 + + mulxq 24+128(%rcx),%rbp,%r9 + movq 24(%rbx),%rdx + adcxq %rbp,%r14 + adoxq %r9,%r15 + adcxq %r11,%r15 + adoxq %r11,%r10 + adcxq %r11,%r10 + adoxq %r11,%r11 + adcq $0,%r11 + movq %rax,%r12 + imulq %r8,%rax + + + xorq %rbp,%rbp + mulxq 0+128(%rsi),%rbp,%r9 + adoxq %rbp,%r13 + adcxq %r9,%r14 + + mulxq 8+128(%rsi),%rbp,%r9 + adoxq %rbp,%r14 + adcxq %r9,%r15 + + mulxq 16+128(%rsi),%rbp,%r9 + adoxq %rbp,%r15 + adcxq %r9,%r10 + + mulxq 24+128(%rsi),%rbp,%r9 + movq %rax,%rdx + adoxq %rbp,%r10 + adcxq %r11,%r9 + adoxq %r9,%r11 + + + mulxq 0+128(%rcx),%rbp,%rax + adcxq %rbp,%r12 + adoxq %r13,%rax + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%rax + adoxq %r9,%r14 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r14 + adoxq %r9,%r15 + + mulxq 24+128(%rcx),%rbp,%r9 + movq %rax,%rdx + adcxq %rbp,%r15 + adoxq %r9,%r10 + adcxq %r12,%r10 + adoxq %r12,%r11 + adcxq %r12,%r11 + adoxq %r12,%r12 + adcq $0,%r12 + imulq %r8,%rdx + + + xorq %rbp,%rbp + mulxq 0+128(%rcx),%r13,%r9 + adcxq %rax,%r13 + adoxq %r9,%r14 + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%r14 + adoxq %r9,%r15 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r15 + adoxq %r9,%r10 + + mulxq 24+128(%rcx),%rbp,%r9 + movq %r14,%rdx + leaq 128(%rcx),%rcx + adcxq %rbp,%r10 + adoxq %r9,%r11 + movq %r15,%rax + adcxq %r13,%r11 + adoxq %r13,%r12 + adcq $0,%r12 + + + + + movq %r10,%rbp + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + sbbq 16(%rcx),%r10 + movq %r11,%r9 + sbbq 24(%rcx),%r11 + sbbq $0,%r12 + + cmovcq %rdx,%r14 + cmovcq %rax,%r15 + cmovcq %rbp,%r10 + movq %r14,0(%rdi) + cmovcq %r9,%r11 + movq %r15,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulx_mont_sparse_256,.-__mulx_mont_sparse_256 +.globl fromx_mont_256 +.hidden fromx_mont_256 +.type fromx_mont_256,@function +.align 32 +fromx_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulx_by_1_mont_256 + + + + + + movq %r15,%rdx + movq %r10,%r12 + movq %r11,%r13 + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r10 + sbbq 24(%rbx),%r11 + + cmovncq %r14,%rax + cmovncq %r15,%rdx + cmovncq %r10,%r12 + movq %rax,0(%rdi) + cmovncq %r11,%r13 + movq %rdx,8(%rdi) + movq %r12,16(%rdi) + movq %r13,24(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp 
+.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size fromx_mont_256,.-fromx_mont_256 + +.globl redcx_mont_256 +.hidden redcx_mont_256 +.type redcx_mont_256,@function +.align 32 +redcx_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulx_by_1_mont_256 + + addq 32(%rsi),%r14 + adcq 40(%rsi),%r15 + movq %r14,%rax + adcq 48(%rsi),%r10 + movq %r15,%rdx + adcq 56(%rsi),%r11 + sbbq %rsi,%rsi + + + + + movq %r10,%r12 + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r10 + movq %r11,%r13 + sbbq 24(%rbx),%r11 + sbbq $0,%rsi + + cmovncq %r14,%rax + cmovncq %r15,%rdx + cmovncq %r10,%r12 + movq %rax,0(%rdi) + cmovncq %r11,%r13 + movq %rdx,8(%rdi) + movq %r12,16(%rdi) + movq %r13,24(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size redcx_mont_256,.-redcx_mont_256 +.type __mulx_by_1_mont_256,@function +.align 32 +__mulx_by_1_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r11 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + + movq %rax,%r14 + imulq %rcx,%rax + movq %rax,%r10 + + mulq 0(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq %rdx,%r14 + + mulq 8(%rbx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r11 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 16(%rbx) + movq %r11,%r15 + imulq %rcx,%r11 + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r12 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 24(%rbx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r14,%r13 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq %rdx,%r15 + + mulq 8(%rbx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rbx) + movq %r12,%r10 + imulq %rcx,%r12 + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r15,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rbx) + addq %rax,%r10 + movq %r12,%rax + adcq %rdx,%r10 + + mulq 8(%rbx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rbx) + movq %r13,%r11 + imulq %rcx,%r13 + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 24(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 0(%rbx) + addq %rax,%r11 + movq %r13,%rax + adcq %rdx,%r11 + + mulq 8(%rbx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 24(%rbx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + .byte 0xf3,0xc3 
+.cfi_endproc +.size __mulx_by_1_mont_256,.-__mulx_by_1_mont_256 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/build/elf/mulx_mont_384-x86_64.s b/build/elf/mulx_mont_384-x86_64.s new file mode 100644 index 00000000..37489544 --- /dev/null +++ b/build/elf/mulx_mont_384-x86_64.s @@ -0,0 +1,2969 @@ +.text + + + + + + + +.type __sub_mod_384x384,@function +.align 32 +__sub_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + subq 0(%rdx),%r8 + movq 56(%rsi),%r15 + sbbq 8(%rdx),%r9 + movq 64(%rsi),%rax + sbbq 16(%rdx),%r10 + movq 72(%rsi),%rbx + sbbq 24(%rdx),%r11 + movq 80(%rsi),%rbp + sbbq 32(%rdx),%r12 + movq 88(%rsi),%rsi + sbbq 40(%rdx),%r13 + movq %r8,0(%rdi) + sbbq 48(%rdx),%r14 + movq 0(%rcx),%r8 + movq %r9,8(%rdi) + sbbq 56(%rdx),%r15 + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + sbbq 64(%rdx),%rax + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + sbbq 72(%rdx),%rbx + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + sbbq 80(%rdx),%rbp + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + sbbq 88(%rdx),%rsi + movq 40(%rcx),%r13 + sbbq %rdx,%rdx + + andq %rdx,%r8 + andq %rdx,%r9 + andq %rdx,%r10 + andq %rdx,%r11 + andq %rdx,%r12 + andq %rdx,%r13 + + addq %r8,%r14 + adcq %r9,%r15 + movq %r14,48(%rdi) + adcq %r10,%rax + movq %r15,56(%rdi) + adcq %r11,%rbx + movq %rax,64(%rdi) + adcq %r12,%rbp + movq %rbx,72(%rdi) + adcq %r13,%rsi + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __sub_mod_384x384,.-__sub_mod_384x384 + +.type __add_mod_384,@function +.align 32 +__add_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + movq %r8,%r14 + adcq 24(%rdx),%r11 + movq %r9,%r15 + adcq 32(%rdx),%r12 + movq %r10,%rax + adcq 40(%rdx),%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,0(%rdi) + cmovcq %rbx,%r11 + movq %r9,8(%rdi) + cmovcq %rbp,%r12 + movq %r10,16(%rdi) + cmovcq %rsi,%r13 + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __add_mod_384,.-__add_mod_384 + +.type __sub_mod_384,@function +.align 32 +__sub_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +__sub_mod_384_a_is_loaded: + subq 0(%rdx),%r8 + movq 0(%rcx),%r14 + sbbq 8(%rdx),%r9 + movq 8(%rcx),%r15 + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rax + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbx + sbbq 32(%rdx),%r12 + movq 32(%rcx),%rbp + sbbq 40(%rdx),%r13 + movq 40(%rcx),%rsi + sbbq %rdx,%rdx + + andq %rdx,%r14 + andq %rdx,%r15 + andq %rdx,%rax + andq %rdx,%rbx + andq %rdx,%rbp + andq %rdx,%rsi + + addq %r14,%r8 + adcq %r15,%r9 + movq %r8,0(%rdi) + adcq %rax,%r10 + movq %r9,8(%rdi) + adcq %rbx,%r11 + movq %r10,16(%rdi) + adcq %rbp,%r12 + movq %r11,24(%rdi) + adcq %rsi,%r13 + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size 
__sub_mod_384,.-__sub_mod_384 +.globl mulx_mont_384x +.hidden mulx_mont_384x +.type mulx_mont_384x,@function +.align 32 +mulx_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $328,%rsp +.cfi_adjust_cfa_offset 328 + + + movq %rdx,%rbx + movq %rdi,32(%rsp) + movq %rsi,24(%rsp) + movq %rdx,16(%rsp) + movq %rcx,8(%rsp) + movq %r8,0(%rsp) + + + + + leaq 40(%rsp),%rdi + call __mulx_384 + + + leaq 48(%rbx),%rbx + leaq 128+48(%rsi),%rsi + leaq 96(%rdi),%rdi + call __mulx_384 + + + movq 8(%rsp),%rcx + leaq (%rbx),%rsi + leaq -48(%rbx),%rdx + leaq 40+192+48(%rsp),%rdi + call __add_mod_384 + + movq 24(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq -48(%rdi),%rdi + call __add_mod_384 + + leaq (%rdi),%rbx + leaq 48(%rdi),%rsi + call __mulx_384 + + + leaq (%rdi),%rsi + leaq 40(%rsp),%rdx + movq 8(%rsp),%rcx + call __sub_mod_384x384 + + leaq (%rdi),%rsi + leaq -96(%rdi),%rdx + call __sub_mod_384x384 + + + leaq 40(%rsp),%rsi + leaq 40+96(%rsp),%rdx + leaq 40(%rsp),%rdi + call __sub_mod_384x384 + + leaq (%rcx),%rbx + + + leaq 40(%rsp),%rsi + movq 0(%rsp),%rcx + movq 32(%rsp),%rdi + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + + leaq 40+192(%rsp),%rsi + movq 0(%rsp),%rcx + leaq 48(%rdi),%rdi + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + leaq 328(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -328-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mulx_mont_384x,.-mulx_mont_384x +.globl sqrx_mont_384x +.hidden sqrx_mont_384x +.type sqrx_mont_384x,@function +.align 32 +sqrx_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + + movq %rsi,16(%rsp) +.byte 102,72,15,110,199 + + + leaq 48(%rsi),%rdx + leaq 32(%rsp),%rdi + call __add_mod_384 + + + movq 16(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq 32+48(%rsp),%rdi + call __sub_mod_384 + + + movq 16(%rsp),%rsi + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + addq %rdx,%rdx + adcq %r15,%r15 + adcq %rax,%rax + movq %rdx,%r8 + adcq %r12,%r12 + movq %r15,%r9 + adcq %rdi,%rdi + movq %rax,%r10 + adcq %rbp,%rbp + movq %r12,%r11 + sbbq %rsi,%rsi + + subq 0(%rcx),%rdx + sbbq 8(%rcx),%r15 + movq %rdi,%r13 + sbbq 16(%rcx),%rax + sbbq 24(%rcx),%r12 + sbbq 32(%rcx),%rdi + movq %rbp,%r14 + sbbq 40(%rcx),%rbp + sbbq $0,%rsi + + cmovcq %r8,%rdx + cmovcq %r9,%r15 + cmovcq %r10,%rax + movq %rdx,48(%rbx) 
+ cmovcq %r11,%r12 + movq %r15,56(%rbx) + cmovcq %r13,%rdi + movq %rax,64(%rbx) + cmovcq %r14,%rbp + movq %r12,72(%rbx) + movq %rdi,80(%rbx) + movq %rbp,88(%rbx) + + leaq 32(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rdx + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%rax + movq 32+24(%rsp),%r12 + movq 32+32(%rsp),%rdi + movq 32+40(%rsp),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqrx_mont_384x,.-sqrx_mont_384x + +.globl mulx_382x +.hidden mulx_382x +.type mulx_382x,@function +.align 32 +mulx_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + leaq 96(%rdi),%rdi + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + movq %rdi,16(%rsp) + movq %rcx,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 48(%rsi),%r8 + adcq 56(%rsi),%r9 + adcq 64(%rsi),%r10 + adcq 72(%rsi),%r11 + adcq 80(%rsi),%r12 + adcq 88(%rsi),%r13 + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + movq 32(%rdx),%r12 + movq 40(%rdx),%r13 + + addq 48(%rdx),%r8 + adcq 56(%rdx),%r9 + adcq 64(%rdx),%r10 + adcq 72(%rdx),%r11 + adcq 80(%rdx),%r12 + adcq 88(%rdx),%r13 + + movq %r8,32+48(%rsp) + movq %r9,32+56(%rsp) + movq %r10,32+64(%rsp) + movq %r11,32+72(%rsp) + movq %r12,32+80(%rsp) + movq %r13,32+88(%rsp) + + + leaq 32+0(%rsp),%rsi + leaq 32+48(%rsp),%rbx + call __mulx_384 + + + movq 0(%rsp),%rsi + movq 8(%rsp),%rbx + leaq -96(%rdi),%rdi + call __mulx_384 + + + leaq 48+128(%rsi),%rsi + leaq 48(%rbx),%rbx + leaq 32(%rsp),%rdi + call __mulx_384 + + + movq 16(%rsp),%rsi + leaq 32(%rsp),%rdx + movq 24(%rsp),%rcx + movq %rsi,%rdi + call __sub_mod_384x384 + + + leaq 0(%rdi),%rsi + leaq -96(%rdi),%rdx + call __sub_mod_384x384 + + + leaq -96(%rdi),%rsi + leaq 32(%rsp),%rdx + leaq -96(%rdi),%rdi + call __sub_mod_384x384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mulx_382x,.-mulx_382x +.globl sqrx_382x +.hidden sqrx_382x +.type sqrx_382x,@function +.align 32 +sqrx_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset 
%r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rcx + + + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%rbx + movq 32(%rsi),%rbp + movq 40(%rsi),%rdx + + movq %r14,%r8 + addq 48(%rsi),%r14 + movq %r15,%r9 + adcq 56(%rsi),%r15 + movq %rax,%r10 + adcq 64(%rsi),%rax + movq %rbx,%r11 + adcq 72(%rsi),%rbx + movq %rbp,%r12 + adcq 80(%rsi),%rbp + movq %rdx,%r13 + adcq 88(%rsi),%rdx + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %rax,16(%rdi) + movq %rbx,24(%rdi) + movq %rbp,32(%rdi) + movq %rdx,40(%rdi) + + + leaq 48(%rsi),%rdx + leaq 48(%rdi),%rdi + call __sub_mod_384_a_is_loaded + + + leaq (%rdi),%rsi + leaq -48(%rdi),%rbx + leaq -48(%rdi),%rdi + call __mulx_384 + + + movq (%rsp),%rsi + leaq 48(%rsi),%rbx + leaq 96(%rdi),%rdi + call __mulx_384 + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq 40(%rdi),%r13 + movq 48(%rdi),%r14 + movq 56(%rdi),%r15 + movq 64(%rdi),%rax + movq 72(%rdi),%rbx + movq 80(%rdi),%rbp + addq %r8,%r8 + movq 88(%rdi),%rdx + adcq %r9,%r9 + movq %r8,0(%rdi) + adcq %r10,%r10 + movq %r9,8(%rdi) + adcq %r11,%r11 + movq %r10,16(%rdi) + adcq %r12,%r12 + movq %r11,24(%rdi) + adcq %r13,%r13 + movq %r12,32(%rdi) + adcq %r14,%r14 + movq %r13,40(%rdi) + adcq %r15,%r15 + movq %r14,48(%rdi) + adcq %rax,%rax + movq %r15,56(%rdi) + adcq %rbx,%rbx + movq %rax,64(%rdi) + adcq %rbp,%rbp + movq %rbx,72(%rdi) + adcq %rdx,%rdx + movq %rbp,80(%rdi) + movq %rdx,88(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -8*7 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqrx_382x,.-sqrx_382x +.globl mulx_384 +.hidden mulx_384 +.type mulx_384,@function +.align 32 +mulx_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + + + movq %rdx,%rbx + call __mulx_384 + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mulx_384,.-mulx_384 + +.type __mulx_384,@function +.align 32 +__mulx_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rbx),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + leaq -128(%rsi),%rsi + + mulxq %r14,%r9,%rcx + xorq %rbp,%rbp + + mulxq %r15,%r8,%rax + adcxq %rcx,%r8 + movq %r9,0(%rdi) + + mulxq %r10,%r9,%rcx + adcxq %rax,%r9 + + mulxq %r11,%r10,%rax + adcxq %rcx,%r10 + + mulxq %r12,%r11,%rcx + adcxq %rax,%r11 + + mulxq %r13,%r12,%r13 + movq 8(%rbx),%rdx + adcxq %rcx,%r12 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,8(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + 
adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 16(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,16(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 24(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,24(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 32(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,32(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 40(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,40(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq %rax,%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + movq %r8,48(%rdi) + movq %r9,56(%rdi) + movq %r10,64(%rdi) + movq %r11,72(%rdi) + movq %r12,80(%rdi) + movq %r13,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulx_384,.-__mulx_384 +.globl sqrx_384 +.hidden sqrx_384 +.type sqrx_384,@function +.align 32 +sqrx_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + call __sqrx_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqrx_384,.-sqrx_384 +.type __sqrx_384,@function +.align 32 +__sqrx_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rdx + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%rcx + movq 32(%rsi),%rbx +.byte 102,72,15,110,199 + + + mulxq %r14,%r8,%rdi + movq 40(%rsi),%rbp + mulxq %r15,%r9,%rax + addq %rdi,%r9 + mulxq %rcx,%r10,%rdi + 
adcq %rax,%r10 + mulxq %rbx,%r11,%rax + adcq %rdi,%r11 + mulxq %rbp,%r12,%r13 + movq %r14,%rdx + adcq %rax,%r12 + adcq $0,%r13 + + + xorq %r14,%r14 + mulxq %r15,%rdi,%rax + adcxq %rdi,%r10 + adoxq %rax,%r11 + + mulxq %rcx,%rdi,%rax + adcxq %rdi,%r11 + adoxq %rax,%r12 + + mulxq %rbx,%rdi,%rax + adcxq %rdi,%r12 + adoxq %rax,%r13 + + mulxq %rbp,%rdi,%rax + movq %r15,%rdx + adcxq %rdi,%r13 + adoxq %r14,%rax + adcxq %rax,%r14 + + + xorq %r15,%r15 + mulxq %rcx,%rdi,%rax + adcxq %rdi,%r12 + adoxq %rax,%r13 + + mulxq %rbx,%rdi,%rax + adcxq %rdi,%r13 + adoxq %rax,%r14 + + mulxq %rbp,%rdi,%rax + movq %rcx,%rdx + adcxq %rdi,%r14 + adoxq %r15,%rax + adcxq %rax,%r15 + + + xorq %rcx,%rcx + mulxq %rbx,%rdi,%rax + adcxq %rdi,%r14 + adoxq %rax,%r15 + + mulxq %rbp,%rdi,%rax + movq %rbx,%rdx + adcxq %rdi,%r15 + adoxq %rcx,%rax + adcxq %rax,%rcx + + + mulxq %rbp,%rdi,%rbx + movq 0(%rsi),%rdx + addq %rdi,%rcx +.byte 102,72,15,126,199 + adcq $0,%rbx + + + xorq %rbp,%rbp + adcxq %r8,%r8 + adcxq %r9,%r9 + adcxq %r10,%r10 + adcxq %r11,%r11 + adcxq %r12,%r12 + + + mulxq %rdx,%rdx,%rax + movq %rdx,0(%rdi) + movq 8(%rsi),%rdx + adoxq %rax,%r8 + movq %r8,8(%rdi) + + mulxq %rdx,%r8,%rax + movq 16(%rsi),%rdx + adoxq %r8,%r9 + adoxq %rax,%r10 + movq %r9,16(%rdi) + movq %r10,24(%rdi) + + mulxq %rdx,%r8,%r9 + movq 24(%rsi),%rdx + adoxq %r8,%r11 + adoxq %r9,%r12 + adcxq %r13,%r13 + adcxq %r14,%r14 + movq %r11,32(%rdi) + movq %r12,40(%rdi) + + mulxq %rdx,%r8,%r9 + movq 32(%rsi),%rdx + adoxq %r8,%r13 + adoxq %r9,%r14 + adcxq %r15,%r15 + adcxq %rcx,%rcx + movq %r13,48(%rdi) + movq %r14,56(%rdi) + + mulxq %rdx,%r8,%r9 + movq 40(%rsi),%rdx + adoxq %r8,%r15 + adoxq %r9,%rcx + adcxq %rbx,%rbx + adcxq %rbp,%rbp + movq %r15,64(%rdi) + movq %rcx,72(%rdi) + + mulxq %rdx,%r8,%r9 + adoxq %r8,%rbx + adoxq %r9,%rbp + + movq %rbx,80(%rdi) + movq %rbp,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __sqrx_384,.-__sqrx_384 + + + +.globl redcx_mont_384 +.hidden redcx_mont_384 +.type redcx_mont_384,@function +.align 32 +redcx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size redcx_mont_384,.-redcx_mont_384 + + + + +.globl fromx_mont_384 +.hidden fromx_mont_384 +.type fromx_mont_384,@function +.align 32 +fromx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulx_by_1_mont_384 + + + + + movq 
%r14,%rax + movq %r15,%rcx + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size fromx_mont_384,.-fromx_mont_384 +.type __mulx_by_1_mont_384,@function +.align 32 +__mulx_by_1_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq %rcx,%rdx + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + imulq %r8,%rdx + + + xorq %r14,%r14 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r8 + adoxq %rbp,%r9 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r9 + adoxq %rbp,%r10 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r10 + adoxq %rbp,%r11 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r13 + adoxq %r14,%rbp + adcxq %rbp,%r14 + imulq %r9,%rdx + + + xorq %r15,%r15 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r9 + adoxq %rbp,%r10 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r10 + adoxq %rbp,%r11 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r14 + adoxq %r15,%rbp + adcxq %rbp,%r15 + imulq %r10,%rdx + + + xorq %r8,%r8 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r10 + adoxq %rbp,%r11 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r15 + adoxq %r8,%rbp + adcxq %rbp,%r8 + imulq %r11,%rdx + + + xorq %r9,%r9 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r15 + adoxq %rbp,%r8 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r8 + adoxq %r9,%rbp + adcxq %rbp,%r9 + imulq %r12,%rdx + + + xorq %r10,%r10 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r15 + adoxq %rbp,%r8 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r8 + adoxq %rbp,%r9 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r9 + adoxq %r10,%rbp + adcxq %rbp,%r10 + imulq %r13,%rdx + + + xorq %r11,%r11 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 16(%rbx),%rax,%rbp + adcxq 
%rax,%r15 + adoxq %rbp,%r8 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r8 + adoxq %rbp,%r9 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r9 + adoxq %rbp,%r10 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r10 + adoxq %r11,%rbp + adcxq %rbp,%r11 + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulx_by_1_mont_384,.-__mulx_by_1_mont_384 + +.type __redc_tail_mont_384,@function +.align 32 +__redc_tail_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + addq 48(%rsi),%r14 + movq %r14,%rax + adcq 56(%rsi),%r15 + adcq 64(%rsi),%r8 + adcq 72(%rsi),%r9 + movq %r15,%rcx + adcq 80(%rsi),%r10 + adcq 88(%rsi),%r11 + sbbq %r12,%r12 + + + + + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __redc_tail_mont_384,.-__redc_tail_mont_384 + +.globl sgn0x_pty_mont_384 +.hidden sgn0x_pty_mont_384 +.type sgn0x_pty_mont_384,@function +.align 32 +sgn0x_pty_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + leaq 0(%rdi),%rsi + movq %rdx,%rcx + call __mulx_by_1_mont_384 + + xorq %rax,%rax + movq %r14,%r13 + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + notq %rax + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sgn0x_pty_mont_384,.-sgn0x_pty_mont_384 + +.globl sgn0x_pty_mont_384x +.hidden sgn0x_pty_mont_384x +.type sgn0x_pty_mont_384x,@function +.align 32 +sgn0x_pty_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + leaq 48(%rdi),%rsi + movq %rdx,%rcx + call __mulx_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + leaq 0(%rdi),%rsi + xorq %rdi,%rdi + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rdi + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + 
sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rdi + + movq %r14,0(%rsp) + notq %rdi + andq $1,%r13 + andq $2,%rdi + orq %r13,%rdi + + call __mulx_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + xorq %rax,%rax + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + movq 0(%rsp),%r12 + + notq %rax + + testq %r14,%r14 + cmovzq %rdi,%r13 + + testq %r12,%r12 + cmovnzq %rdi,%rax + + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sgn0x_pty_mont_384x,.-sgn0x_pty_mont_384x +.globl mulx_mont_384 +.hidden mulx_mont_384 +.type mulx_mont_384,@function +.align 32 +mulx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + leaq -24(%rsp),%rsp +.cfi_adjust_cfa_offset 8*3 + + + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 +.byte 102,72,15,110,199 + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + movq %r8,(%rsp) + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + movq 24(%rsp),%r15 +.cfi_restore %r15 + movq 32(%rsp),%r14 +.cfi_restore %r14 + movq 40(%rsp),%r13 +.cfi_restore %r13 + movq 48(%rsp),%r12 +.cfi_restore %r12 + movq 56(%rsp),%rbx +.cfi_restore %rbx + movq 64(%rsp),%rbp +.cfi_restore %rbp + leaq 72(%rsp),%rsp +.cfi_adjust_cfa_offset -8*9 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mulx_mont_384,.-mulx_mont_384 +.type __mulx_mont_384,@function +.align 32 +__mulx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + mulxq %r15,%r14,%r10 + mulxq %rax,%r15,%r11 + addq %r14,%r9 + mulxq %r12,%rax,%r12 + adcq %r15,%r10 + mulxq %rdi,%rdi,%r13 + adcq %rax,%r11 + mulxq %rbp,%rbp,%r14 + movq 8(%rbx),%rdx + adcq %rdi,%r12 + adcq %rbp,%r13 + adcq $0,%r14 + xorq %r15,%r15 + + movq %r8,16(%rsp) + imulq 8(%rsp),%r8 + + + xorq %rax,%rax + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r9 + adcxq %rbp,%r10 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r8,%rdx + adoxq %rdi,%r14 + adcxq %rbp,%r15 + adoxq %rax,%r15 + adoxq %rax,%rax + + + xorq %r8,%r8 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r9 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r10 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq 
%rdi,%r12 + adoxq %rbp,%r13 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 16(%rbx),%rdx + adcxq %rdi,%r13 + adoxq %rbp,%r14 + adcxq %r8,%r14 + adoxq %r8,%r15 + adcxq %r8,%r15 + adoxq %r8,%rax + adcxq %r8,%rax + movq %r9,16(%rsp) + imulq 8(%rsp),%r9 + + + xorq %r8,%r8 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r9,%rdx + adoxq %rdi,%r15 + adcxq %rbp,%rax + adoxq %r8,%rax + adoxq %r8,%r8 + + + xorq %r9,%r9 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r10 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 24(%rbx),%rdx + adcxq %rdi,%r14 + adoxq %rbp,%r15 + adcxq %r9,%r15 + adoxq %r9,%rax + adcxq %r9,%rax + adoxq %r9,%r8 + adcxq %r9,%r8 + movq %r10,16(%rsp) + imulq 8(%rsp),%r10 + + + xorq %r9,%r9 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r10,%rdx + adoxq %rdi,%rax + adcxq %rbp,%r8 + adoxq %r9,%r8 + adoxq %r9,%r9 + + + xorq %r10,%r10 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r11 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 32(%rbx),%rdx + adcxq %rdi,%r15 + adoxq %rbp,%rax + adcxq %r10,%rax + adoxq %r10,%r8 + adcxq %r10,%r8 + adoxq %r10,%r9 + adcxq %r10,%r9 + movq %r11,16(%rsp) + imulq 8(%rsp),%r11 + + + xorq %r10,%r10 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r11,%rdx + adoxq %rdi,%r8 + adcxq %rbp,%r9 + adoxq %r10,%r9 + adoxq %r10,%r10 + + + xorq %r11,%r11 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r12 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 40+128(%rcx),%rdi,%rbp + movq 40(%rbx),%rdx + adcxq %rdi,%rax + adoxq %rbp,%r8 + adcxq %r11,%r8 + adoxq %r11,%r9 + adcxq %r11,%r9 + adoxq %r11,%r10 + adcxq %r11,%r10 + movq %r12,16(%rsp) + imulq 8(%rsp),%r12 + + + xorq %r11,%r11 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq 
%rdi,%r14 + adcxq %rbp,%r15 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r8 + adcxq %rbp,%r9 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r12,%rdx + adoxq %rdi,%r9 + adcxq %rbp,%r10 + adoxq %r11,%r10 + adoxq %r11,%r11 + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r13 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + + mulxq 40+128(%rcx),%rdi,%rbp + movq %r13,%rdx + adcxq %rdi,%r8 + adoxq %rbp,%r9 + adcxq %r12,%r9 + adoxq %r12,%r10 + adcxq %r12,%r10 + adoxq %r12,%r11 + adcxq %r12,%r11 + imulq 8(%rsp),%rdx +.byte 102,72,15,126,195 + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + movq %r15,%r13 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r8 + adoxq %rbp,%r9 + movq %rax,%rsi + + mulxq 40+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r10 + movq %r14,%rdx + adcxq %r12,%r10 + adoxq %r12,%r11 + leaq 128(%rcx),%rcx + movq %r8,%r12 + adcq $0,%r11 + + + + + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + movq %r9,%rdi + sbbq 16(%rcx),%rax + sbbq 24(%rcx),%r8 + sbbq 32(%rcx),%r9 + movq %r10,%rbp + sbbq 40(%rcx),%r10 + sbbq $0,%r11 + + cmovncq %r14,%rdx + cmovcq %r13,%r15 + cmovcq %rsi,%rax + cmovncq %r8,%r12 + movq %rdx,0(%rbx) + cmovncq %r9,%rdi + movq %r15,8(%rbx) + cmovncq %r10,%rbp + movq %rax,16(%rbx) + movq %r12,24(%rbx) + movq %rdi,32(%rbx) + movq %rbp,40(%rbx) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulx_mont_384,.-__mulx_mont_384 +.globl sqrx_mont_384 +.hidden sqrx_mont_384 +.type sqrx_mont_384,@function +.align 32 +sqrx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + leaq -24(%rsp),%rsp +.cfi_adjust_cfa_offset 8*3 + + + movq %rcx,%r8 + leaq -128(%rdx),%rcx + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 +.byte 102,72,15,110,199 + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + + leaq (%rsi),%rbx + movq %r8,(%rsp) + leaq -128(%rsi),%rsi + + mulxq %rdx,%r8,%r9 + call __mulx_mont_384 + + movq 24(%rsp),%r15 +.cfi_restore %r15 + movq 32(%rsp),%r14 +.cfi_restore %r14 + movq 40(%rsp),%r13 +.cfi_restore %r13 + movq 48(%rsp),%r12 +.cfi_restore %r12 + movq 56(%rsp),%rbx +.cfi_restore %rbx + movq 64(%rsp),%rbp +.cfi_restore %rbp + leaq 72(%rsp),%rsp +.cfi_adjust_cfa_offset -8*9 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqrx_mont_384,.-sqrx_mont_384 + +.globl sqrx_n_mul_mont_384 +.hidden sqrx_n_mul_mont_384 +.type sqrx_n_mul_mont_384,@function +.align 32 +sqrx_n_mul_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 
+.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + leaq -24(%rsp),%rsp +.cfi_adjust_cfa_offset 8*3 + + + movq %rdx,%r10 + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq %rsi,%rbx + movq 24(%rsi),%r12 +.byte 102,72,15,110,199 + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + + movq %r8,(%rsp) + movq %r9,16(%rsp) + movq 0(%r9),%xmm2 + +.Loop_sqrx_384: + movd %r10d,%xmm1 + leaq -128(%rbx),%rsi + leaq -128(%rcx),%rcx + + mulxq %rdx,%r8,%r9 + call __mulx_mont_384 + + movd %xmm1,%r10d + decl %r10d + jnz .Loop_sqrx_384 + + movq %rdx,%r14 +.byte 102,72,15,126,210 + leaq -128(%rbx),%rsi + movq 16(%rsp),%rbx + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + movq 24(%rsp),%r15 +.cfi_restore %r15 + movq 32(%rsp),%r14 +.cfi_restore %r14 + movq 40(%rsp),%r13 +.cfi_restore %r13 + movq 48(%rsp),%r12 +.cfi_restore %r12 + movq 56(%rsp),%rbx +.cfi_restore %rbx + movq 64(%rsp),%rbp +.cfi_restore %rbp + leaq 72(%rsp),%rsp +.cfi_adjust_cfa_offset -8*9 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqrx_n_mul_mont_384,.-sqrx_n_mul_mont_384 + +.globl sqrx_n_mul_mont_383 +.hidden sqrx_n_mul_mont_383 +.type sqrx_n_mul_mont_383,@function +.align 32 +sqrx_n_mul_mont_383: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + leaq -24(%rsp),%rsp +.cfi_adjust_cfa_offset 8*3 + + + movq %rdx,%r10 + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq %rsi,%rbx + movq 24(%rsi),%r12 +.byte 102,72,15,110,199 + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + + movq %r8,(%rsp) + movq %r9,16(%rsp) + movq 0(%r9),%xmm2 + leaq -128(%rcx),%rcx + +.Loop_sqrx_383: + movd %r10d,%xmm1 + leaq -128(%rbx),%rsi + + mulxq %rdx,%r8,%r9 + call __mulx_mont_383_nonred + + movd %xmm1,%r10d + decl %r10d + jnz .Loop_sqrx_383 + + movq %rdx,%r14 +.byte 102,72,15,126,210 + leaq -128(%rbx),%rsi + movq 16(%rsp),%rbx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + movq 24(%rsp),%r15 +.cfi_restore %r15 + movq 32(%rsp),%r14 +.cfi_restore %r14 + movq 40(%rsp),%r13 +.cfi_restore %r13 + movq 48(%rsp),%r12 +.cfi_restore %r12 + movq 56(%rsp),%rbx +.cfi_restore %rbx + movq 64(%rsp),%rbp +.cfi_restore %rbp + leaq 72(%rsp),%rsp +.cfi_adjust_cfa_offset -8*9 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqrx_n_mul_mont_383,.-sqrx_n_mul_mont_383 +.type __mulx_mont_383_nonred,@function +.align 32 +__mulx_mont_383_nonred: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + mulxq %r15,%r14,%r10 + mulxq %rax,%r15,%r11 + addq %r14,%r9 + mulxq %r12,%rax,%r12 + adcq %r15,%r10 + mulxq %rdi,%rdi,%r13 + adcq %rax,%r11 + mulxq %rbp,%rbp,%r14 + movq 8(%rbx),%rdx + adcq %rdi,%r12 + adcq %rbp,%r13 + adcq $0,%r14 + movq %r8,%rax + imulq 8(%rsp),%r8 + + + xorq %r15,%r15 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r9 + adcxq %rbp,%r10 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 
40+128(%rsi),%rdi,%rbp + movq %r8,%rdx + adoxq %rdi,%r14 + adcxq %r15,%rbp + adoxq %rbp,%r15 + + + xorq %r8,%r8 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r9 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r10 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 16(%rbx),%rdx + adcxq %rdi,%r13 + adoxq %rbp,%r14 + adcxq %rax,%r14 + adoxq %rax,%r15 + adcxq %rax,%r15 + movq %r9,%r8 + imulq 8(%rsp),%r9 + + + xorq %rax,%rax + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r9,%rdx + adoxq %rdi,%r15 + adcxq %rax,%rbp + adoxq %rbp,%rax + + + xorq %r9,%r9 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r8 + adoxq %rbp,%r10 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 24(%rbx),%rdx + adcxq %rdi,%r14 + adoxq %rbp,%r15 + adcxq %r8,%r15 + adoxq %r8,%rax + adcxq %r8,%rax + movq %r10,%r9 + imulq 8(%rsp),%r10 + + + xorq %r8,%r8 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r10,%rdx + adoxq %rdi,%rax + adcxq %r8,%rbp + adoxq %rbp,%r8 + + + xorq %r10,%r10 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r11 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 32(%rbx),%rdx + adcxq %rdi,%r15 + adoxq %rbp,%rax + adcxq %r9,%rax + adoxq %r9,%r8 + adcxq %r9,%r8 + movq %r11,%r10 + imulq 8(%rsp),%r11 + + + xorq %r9,%r9 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r11,%rdx + adoxq %rdi,%r8 + adcxq %r9,%rbp + adoxq %rbp,%r9 + + + xorq %r11,%r11 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r12 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 40+128(%rcx),%rdi,%rbp + movq 40(%rbx),%rdx + adcxq %rdi,%rax + adoxq %rbp,%r8 + adcxq 
%r10,%r8 + adoxq %r10,%r9 + adcxq %r10,%r9 + movq %r12,%r11 + imulq 8(%rsp),%r12 + + + xorq %r10,%r10 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r8 + adcxq %rbp,%r9 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r12,%rdx + adoxq %rdi,%r9 + adcxq %r10,%rbp + adoxq %rbp,%r10 + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r13 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + + mulxq 40+128(%rcx),%rdi,%rbp + movq %r13,%rdx + adcxq %rdi,%r8 + adoxq %rbp,%r9 + adcxq %r11,%r9 + adoxq %r11,%r10 + adcxq %r11,%r10 + imulq 8(%rsp),%rdx +.byte 102,72,15,126,195 + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r8 + adoxq %rbp,%r9 + + mulxq 40+128(%rcx),%rdi,%rbp + movq %r14,%rdx + adcxq %rdi,%r9 + adoxq %rbp,%r10 + adcq $0,%r10 + movq %r8,%r12 + + movq %r14,0(%rbx) + movq %r15,8(%rbx) + movq %rax,16(%rbx) + movq %r9,%rdi + movq %r8,24(%rbx) + movq %r9,32(%rbx) + movq %r10,40(%rbx) + movq %r10,%rbp + + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulx_mont_383_nonred,.-__mulx_mont_383_nonred +.globl sqrx_mont_382x +.hidden sqrx_mont_382x +.type sqrx_mont_382x,@function +.align 32 +sqrx_mont_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + movq %rsi,16(%rsp) + movq %rdi,%xmm0 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %r8,%r14 + addq 48(%rsi),%r8 + movq %r9,%r15 + adcq 56(%rsi),%r9 + movq %r10,%rax + adcq 64(%rsi),%r10 + movq %r11,%rdx + adcq 72(%rsi),%r11 + movq %r12,%rbx + adcq 80(%rsi),%r12 + movq %r13,%rbp + adcq 88(%rsi),%r13 + + subq 48(%rsi),%r14 + sbbq 56(%rsi),%r15 + sbbq 64(%rsi),%rax + sbbq 72(%rsi),%rdx + sbbq 80(%rsi),%rbx + sbbq 88(%rsi),%rbp + sbbq %rdi,%rdi + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + movq %r14,32+48(%rsp) + movq %r15,32+56(%rsp) + movq %rax,32+64(%rsp) + movq %rdx,32+72(%rsp) + movq %rbx,32+80(%rsp) + movq %rbp,32+88(%rsp) + movq %rdi,32+96(%rsp) + + + + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_383_nonred + addq %rdx,%rdx + 
adcq %r15,%r15 + adcq %rax,%rax + adcq %r12,%r12 + adcq %rdi,%rdi + adcq %rbp,%rbp + + movq %rdx,48(%rbx) + movq %r15,56(%rbx) + movq %rax,64(%rbx) + movq %r12,72(%rbx) + movq %rdi,80(%rbx) + movq %rbp,88(%rbx) + + leaq 32-128(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rdx + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%rax + movq 32+24(%rsp),%r12 + movq 32+32(%rsp),%rdi + movq 32+40(%rsp),%rbp + + + + mulxq %r14,%r8,%r9 + call __mulx_mont_383_nonred + movq 32+96(%rsp),%r14 + leaq 128(%rcx),%rcx + movq 32+0(%rsp),%r8 + andq %r14,%r8 + movq 32+8(%rsp),%r9 + andq %r14,%r9 + movq 32+16(%rsp),%r10 + andq %r14,%r10 + movq 32+24(%rsp),%r11 + andq %r14,%r11 + movq 32+32(%rsp),%r13 + andq %r14,%r13 + andq 32+40(%rsp),%r14 + + subq %r8,%rdx + movq 0(%rcx),%r8 + sbbq %r9,%r15 + movq 8(%rcx),%r9 + sbbq %r10,%rax + movq 16(%rcx),%r10 + sbbq %r11,%r12 + movq 24(%rcx),%r11 + sbbq %r13,%rdi + movq 32(%rcx),%r13 + sbbq %r14,%rbp + sbbq %r14,%r14 + + andq %r14,%r8 + andq %r14,%r9 + andq %r14,%r10 + andq %r14,%r11 + andq %r14,%r13 + andq 40(%rcx),%r14 + + addq %r8,%rdx + adcq %r9,%r15 + adcq %r10,%rax + adcq %r11,%r12 + adcq %r13,%rdi + adcq %r14,%rbp + + movq %rdx,0(%rbx) + movq %r15,8(%rbx) + movq %rax,16(%rbx) + movq %r12,24(%rbx) + movq %rdi,32(%rbx) + movq %rbp,40(%rbx) + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqrx_mont_382x,.-sqrx_mont_382x + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/build/elf/sha256-x86_64.s b/build/elf/sha256-x86_64.s new file mode 100644 index 00000000..bb17084c --- /dev/null +++ b/build/elf/sha256-x86_64.s @@ -0,0 +1,1446 @@ +.text + +.align 64 +.type K256,@object +K256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff +.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 +.globl sha256_block_data_order_shaext +.hidden sha256_block_data_order_shaext +.type sha256_block_data_order_shaext,@function +.align 64 +sha256_block_data_order_shaext: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + leaq K256+128(%rip),%rcx + movdqu (%rdi),%xmm1 + movdqu 16(%rdi),%xmm2 + movdqa 
256-128(%rcx),%xmm7 + + pshufd $0x1b,%xmm1,%xmm0 + pshufd $0xb1,%xmm1,%xmm1 + pshufd $0x1b,%xmm2,%xmm2 + movdqa %xmm7,%xmm8 +.byte 102,15,58,15,202,8 + punpcklqdq %xmm0,%xmm2 + jmp .Loop_shaext + +.align 16 +.Loop_shaext: + movdqu (%rsi),%xmm3 + movdqu 16(%rsi),%xmm4 + movdqu 32(%rsi),%xmm5 +.byte 102,15,56,0,223 + movdqu 48(%rsi),%xmm6 + + movdqa 0-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 102,15,56,0,231 + movdqa %xmm2,%xmm10 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + nop + movdqa %xmm1,%xmm9 +.byte 15,56,203,202 + + movdqa 16-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 102,15,56,0,239 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + leaq 64(%rsi),%rsi +.byte 15,56,204,220 +.byte 15,56,203,202 + + movdqa 32-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 102,15,56,0,247 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + + movdqa 48-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 64-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 80-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 96-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 112-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 128-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 144-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 160-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 176-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 192-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 208-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 +.byte 15,56,203,202 + paddd %xmm7,%xmm6 + + movdqa 224-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,203,209 
+ pshufd $0x0e,%xmm0,%xmm0 +.byte 15,56,205,245 + movdqa %xmm8,%xmm7 +.byte 15,56,203,202 + + movdqa 240-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 + nop +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + decq %rdx + nop +.byte 15,56,203,202 + + paddd %xmm10,%xmm2 + paddd %xmm9,%xmm1 + jnz .Loop_shaext + + pshufd $0xb1,%xmm2,%xmm2 + pshufd $0x1b,%xmm1,%xmm7 + pshufd $0xb1,%xmm1,%xmm1 + punpckhqdq %xmm2,%xmm1 +.byte 102,15,58,15,215,8 + + movdqu %xmm1,(%rdi) + movdqu %xmm2,16(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size sha256_block_data_order_shaext,.-sha256_block_data_order_shaext +.globl sha256_block_data_order +.hidden sha256_block_data_order +.type sha256_block_data_order,@function +.align 64 +sha256_block_data_order: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + shlq $4,%rdx + subq $40,%rsp +.cfi_adjust_cfa_offset 40 + leaq (%rsi,%rdx,4),%rdx + movq %rdi,0(%rsp) + + movq %rdx,16(%rsp) + movq %rsp,%rbp +.cfi_def_cfa_register %rbp + + + leaq -64(%rsp),%rsp + movl 0(%rdi),%eax + andq $-64,%rsp + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + + + jmp .Lloop_ssse3 +.align 16 +.Lloop_ssse3: + movdqa K256+256(%rip),%xmm7 + movq %rsi,8(%rbp) + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 +.byte 102,15,56,0,199 + movdqu 48(%rsi),%xmm3 + leaq K256(%rip),%rsi +.byte 102,15,56,0,207 + movdqa 0(%rsi),%xmm4 + movdqa 16(%rsi),%xmm5 +.byte 102,15,56,0,215 + paddd %xmm0,%xmm4 + movdqa 32(%rsi),%xmm6 +.byte 102,15,56,0,223 + movdqa 48(%rsi),%xmm7 + paddd %xmm1,%xmm5 + paddd %xmm2,%xmm6 + paddd %xmm3,%xmm7 + movdqa %xmm4,0(%rsp) + movl %eax,%r14d + movdqa %xmm5,16(%rsp) + movl %ebx,%edi + movdqa %xmm6,32(%rsp) + xorl %ecx,%edi + movdqa %xmm7,48(%rsp) + movl %r8d,%r13d + jmp .Lssse3_00_47 + +.align 16 +.Lssse3_00_47: + subq $-64,%rsi + rorl $14,%r13d + movdqa %xmm1,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm3,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,224,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,250,4 + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm3,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 4(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm0 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + 
movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm0 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm0,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 0(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm0,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,0(%rsp) + rorl $14,%r13d + movdqa %xmm2,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm0,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,225,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,251,4 + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm0,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 20(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm1 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm1 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm1,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl 
%r9d,%r13d + addl 28(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 16(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm1,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,16(%rsp) + rorl $14,%r13d + movdqa %xmm3,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm1,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,226,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,248,4 + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm1,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 36(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm2 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm2 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm2,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 32(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm2,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,32(%rsp) + rorl $14,%r13d + movdqa %xmm0,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm2,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,227,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,249,4 + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl 
%r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm2,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 52(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm3 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm3 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm3,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 48(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm3,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,48(%rsp) + cmpb $0,67(%rsi) + jne .Lssse3_00_47 + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl 
%r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d 
+ xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + movq 0(%rbp),%rdi + movl %r14d,%eax + movq 8(%rbp),%rsi + + addl 0(%rdi),%eax + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + leaq 64(%rsi),%rsi + cmpq 16(%rbp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb .Lloop_ssse3 + + xorps %xmm0,%xmm0 + leaq 40+48(%rbp),%r11 +.cfi_def_cfa %r11,8 + movaps %xmm0,0(%rsp) + movaps %xmm0,16(%rsp) + movaps %xmm0,32(%rsp) + movaps %xmm0,48(%rsp) + movq 40(%rbp),%r15 +.cfi_restore %r15 + movq -40(%r11),%r14 +.cfi_restore %r14 + movq -32(%r11),%r13 +.cfi_restore %r13 + movq -24(%r11),%r12 +.cfi_restore %r12 + movq -16(%r11),%rbx +.cfi_restore %rbx + movq -8(%r11),%rbp +.cfi_restore %rbp + + leaq (%r11),%rsp + .byte 0xf3,0xc3 +.cfi_endproc +.size sha256_block_data_order,.-sha256_block_data_order +.globl sha256_emit 
+.hidden sha256_emit +.type sha256_emit,@function +.align 16 +sha256_emit: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + bswapq %r8 + movq 24(%rsi),%r11 + bswapq %r9 + movl %r8d,4(%rdi) + bswapq %r10 + movl %r9d,12(%rdi) + bswapq %r11 + movl %r10d,20(%rdi) + shrq $32,%r8 + movl %r11d,28(%rdi) + shrq $32,%r9 + movl %r8d,0(%rdi) + shrq $32,%r10 + movl %r9d,8(%rdi) + shrq $32,%r11 + movl %r10d,16(%rdi) + movl %r11d,24(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size sha256_emit,.-sha256_emit + +.globl sha256_bcopy +.hidden sha256_bcopy +.type sha256_bcopy,@function +.align 16 +sha256_bcopy: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + subq %rsi,%rdi +.Loop_bcopy: + movzbl (%rsi),%eax + leaq 1(%rsi),%rsi + movb %al,-1(%rdi,%rsi,1) + decq %rdx + jnz .Loop_bcopy + .byte 0xf3,0xc3 +.cfi_endproc +.size sha256_bcopy,.-sha256_bcopy + +.globl sha256_hcopy +.hidden sha256_hcopy +.type sha256_hcopy,@function +.align 16 +sha256_hcopy: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size sha256_hcopy,.-sha256_hcopy + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/build/mach-o/add_mod_256-x86_64.s b/build/mach-o/add_mod_256-x86_64.s new file mode 100644 index 00000000..ea5f1507 --- /dev/null +++ b/build/mach-o/add_mod_256-x86_64.s @@ -0,0 +1,396 @@ +.text + +.globl _add_mod_256 +.private_extern _add_mod_256 + +.p2align 5 +_add_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + +L$oaded_a_add_mod_256: + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + movq %r8,%rax + adcq 16(%rdx),%r10 + movq %r9,%rsi + adcq 24(%rdx),%r11 + sbbq %rdx,%rdx + + movq %r10,%rbx + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + sbbq 16(%rcx),%r10 + movq %r11,%rbp + sbbq 24(%rcx),%r11 + sbbq $0,%rdx + + cmovcq %rax,%r8 + cmovcq %rsi,%r9 + movq %r8,0(%rdi) + cmovcq %rbx,%r10 + movq %r9,8(%rdi) + cmovcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _mul_by_3_mod_256 +.private_extern _mul_by_3_mod_256 + +.p2align 5 +_mul_by_3_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + + + movq %rdx,%rcx + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq %rsi,%rdx + movq 24(%rsi),%r11 + + call __lshift_mod_256 + movq 0(%rsp),%r12 +.cfi_restore %r12 + jmp L$oaded_a_add_mod_256 + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__lshift_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + addq %r8,%r8 + adcq %r9,%r9 + movq %r8,%rax + adcq %r10,%r10 + movq %r9,%rsi + adcq %r11,%r11 + sbbq %r12,%r12 + + movq %r10,%rbx + subq 
0(%rcx),%r8 + sbbq 8(%rcx),%r9 + sbbq 16(%rcx),%r10 + movq %r11,%rbp + sbbq 24(%rcx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r8 + cmovcq %rsi,%r9 + cmovcq %rbx,%r10 + cmovcq %rbp,%r11 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _lshift_mod_256 +.private_extern _lshift_mod_256 + +.p2align 5 +_lshift_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + +L$oop_lshift_mod_256: + call __lshift_mod_256 + decl %edx + jnz L$oop_lshift_mod_256 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 0(%rsp),%r12 +.cfi_restore %r12 + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _rshift_mod_256 +.private_extern _rshift_mod_256 + +.p2align 5 +_rshift_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%rbp + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + +L$oop_rshift_mod_256: + movq %rbp,%r8 + andq $1,%rbp + movq 0(%rcx),%rax + negq %rbp + movq 8(%rcx),%rsi + movq 16(%rcx),%rbx + + andq %rbp,%rax + andq %rbp,%rsi + andq %rbp,%rbx + andq 24(%rcx),%rbp + + addq %rax,%r8 + adcq %rsi,%r9 + adcq %rbx,%r10 + adcq %rbp,%r11 + sbbq %rax,%rax + + shrq $1,%r8 + movq %r9,%rbp + shrq $1,%r9 + movq %r10,%rbx + shrq $1,%r10 + movq %r11,%rsi + shrq $1,%r11 + + shlq $63,%rbp + shlq $63,%rbx + orq %r8,%rbp + shlq $63,%rsi + orq %rbx,%r9 + shlq $63,%rax + orq %rsi,%r10 + orq %rax,%r11 + + decl %edx + jnz L$oop_rshift_mod_256 + + movq %rbp,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _cneg_mod_256 +.private_extern _cneg_mod_256 + +.p2align 5 +_cneg_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + + + movq 0(%rsi),%r12 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq %r12,%r8 + movq 24(%rsi),%r11 + orq %r9,%r12 + orq %r10,%r12 + orq %r11,%r12 + movq $-1,%rbp + + movq 0(%rcx),%rax + cmovnzq %rbp,%r12 + movq 8(%rcx),%rsi + movq 16(%rcx),%rbx + andq %r12,%rax + movq 24(%rcx),%rbp + andq %r12,%rsi + andq %r12,%rbx + andq %r12,%rbp + + subq %r8,%rax + sbbq %r9,%rsi + sbbq %r10,%rbx + sbbq %r11,%rbp + + orq %rdx,%rdx + + cmovzq %r8,%rax + cmovzq %r9,%rsi + movq %rax,0(%rdi) + cmovzq %r10,%rbx + movq %rsi,8(%rdi) + cmovzq %r11,%rbp + movq %rbx,16(%rdi) + movq %rbp,24(%rdi) + + movq 0(%rsp),%r12 +.cfi_restore %r12 + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _sub_mod_256 +.private_extern _sub_mod_256 + +.p2align 5 +_sub_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset 
%rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + subq 0(%rdx),%r8 + movq 0(%rcx),%rax + sbbq 8(%rdx),%r9 + movq 8(%rcx),%rsi + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rbx + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbp + sbbq %rdx,%rdx + + andq %rdx,%rax + andq %rdx,%rsi + andq %rdx,%rbx + andq %rdx,%rbp + + addq %rax,%r8 + adcq %rsi,%r9 + movq %r8,0(%rdi) + adcq %rbx,%r10 + movq %r9,8(%rdi) + adcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/build/mach-o/add_mod_384-x86_64.s b/build/mach-o/add_mod_384-x86_64.s new file mode 100644 index 00000000..86889df7 --- /dev/null +++ b/build/mach-o/add_mod_384-x86_64.s @@ -0,0 +1,1477 @@ +.text + +.private_extern _BLS12_381_P + +.globl _add_mod_384 +.private_extern _add_mod_384 + +.p2align 5 +_add_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __add_mod_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__add_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +__add_mod_384_a_is_loaded: + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + movq %r8,%r14 + adcq 24(%rdx),%r11 + movq %r9,%r15 + adcq 32(%rdx),%r12 + movq %r10,%rax + adcq 40(%rdx),%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,0(%rdi) + cmovcq %rbx,%r11 + movq %r9,8(%rdi) + cmovcq %rbp,%r12 + movq %r10,16(%rdi) + cmovcq %rsi,%r13 + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _add_mod_384x +.private_extern _add_mod_384x + +.p2align 5 +_add_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $24,%rsp +.cfi_adjust_cfa_offset 24 + + + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + leaq 48(%rsi),%rsi + leaq 48(%rdx),%rdx + leaq 48(%rdi),%rdi + call __add_mod_384 + + movq 0(%rsp),%rsi + movq 8(%rsp),%rdx + leaq -48(%rdi),%rdi + call __add_mod_384 + + movq 24+0(%rsp),%r15 +.cfi_restore %r15 + movq 24+8(%rsp),%r14 +.cfi_restore %r14 + 
movq 24+16(%rsp),%r13 +.cfi_restore %r13 + movq 24+24(%rsp),%r12 +.cfi_restore %r12 + movq 24+32(%rsp),%rbx +.cfi_restore %rbx + movq 24+40(%rsp),%rbp +.cfi_restore %rbp + leaq 24+48(%rsp),%rsp +.cfi_adjust_cfa_offset -24-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _lshift_mod_384 +.private_extern _lshift_mod_384 + +.p2align 5 +_lshift_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +L$oop_lshift_mod_384: + addq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + movq %r8,%r14 + adcq %r11,%r11 + movq %r9,%r15 + adcq %r12,%r12 + movq %r10,%rax + adcq %r13,%r13 + movq %r11,%rbx + sbbq %rdi,%rdi + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdi + + movq (%rsp),%rdi + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + cmovcq %rbx,%r11 + cmovcq %rbp,%r12 + cmovcq %rsi,%r13 + + decl %edx + jnz L$oop_lshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__lshift_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + addq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + movq %r8,%r14 + adcq %r11,%r11 + movq %r9,%r15 + adcq %r12,%r12 + movq %r10,%rax + adcq %r13,%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + cmovcq %rbx,%r11 + cmovcq %rbp,%r12 + cmovcq %rsi,%r13 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _mul_by_3_mod_384 +.private_extern _mul_by_3_mod_384 + +.p2align 5 +_mul_by_3_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + + movq (%rsp),%rdx + call __add_mod_384_a_is_loaded + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 
56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _mul_by_8_mod_384 +.private_extern _mul_by_8_mod_384 + +.p2align 5 +_mul_by_8_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _mul_by_b_onE1 +.private_extern _mul_by_b_onE1 + +.p2align 5 +_mul_by_b_onE1: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + leaq _BLS12_381_P(%rip),%rcx + + call __lshift_mod_384 + call __lshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _mul_by_4b_onE1 +.private_extern _mul_by_4b_onE1 + +.p2align 5 +_mul_by_4b_onE1: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + leaq _BLS12_381_P(%rip),%rcx + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 
32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _mul_by_3_mod_384x +.private_extern _mul_by_3_mod_384x + +.p2align 5 +_mul_by_3_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + + movq (%rsp),%rdx + call __add_mod_384_a_is_loaded + + movq (%rsp),%rsi + leaq 48(%rdi),%rdi + + movq 48(%rsi),%r8 + movq 56(%rsi),%r9 + movq 64(%rsi),%r10 + movq 72(%rsi),%r11 + movq 80(%rsi),%r12 + movq 88(%rsi),%r13 + + call __lshift_mod_384 + + movq $48,%rdx + addq (%rsp),%rdx + call __add_mod_384_a_is_loaded + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _mul_by_8_mod_384x +.private_extern _mul_by_8_mod_384x + +.p2align 5 +_mul_by_8_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq (%rsp),%rsi + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 48+0(%rsi),%r8 + movq 48+8(%rsi),%r9 + movq 48+16(%rsi),%r10 + movq 48+24(%rsi),%r11 + movq 48+32(%rsi),%r12 + movq 48+40(%rsi),%r13 + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq %r8,48+0(%rdi) + movq %r9,48+8(%rdi) + movq %r10,48+16(%rdi) + movq %r11,48+24(%rdi) + movq %r12,48+32(%rdi) + movq %r13,48+40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _mul_by_b_onE2 +.private_extern _mul_by_b_onE2 + +.p2align 5 +_mul_by_b_onE2: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq 
%r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + leaq _BLS12_381_P(%rip),%rcx + leaq 48(%rsi),%rdx + call __sub_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq 0(%rsp),%rsi + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + leaq 48(%rsi),%rdx + leaq 48(%rdi),%rdi + call __add_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -8*7 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _mul_by_4b_onE2 +.private_extern _mul_by_4b_onE2 + +.p2align 5 +_mul_by_4b_onE2: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + leaq _BLS12_381_P(%rip),%rcx + leaq 48(%rsi),%rdx + call __sub_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq 0(%rsp),%rsi + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + leaq 48(%rsi),%rdx + leaq 48(%rdi),%rdi + call __add_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -8*7 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _cneg_mod_384 +.private_extern _cneg_mod_384 + +.p2align 5 +_cneg_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdx +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%rdx + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq %rdx,%r8 + movq 24(%rsi),%r11 + orq %r9,%rdx + movq 32(%rsi),%r12 + orq %r10,%rdx + movq 40(%rsi),%r13 + orq %r11,%rdx + movq $-1,%rsi + orq %r12,%rdx + orq %r13,%rdx + + movq 0(%rcx),%r14 + cmovnzq %rsi,%rdx + movq 8(%rcx),%r15 + movq 16(%rcx),%rax + andq %rdx,%r14 + movq 24(%rcx),%rbx + andq %rdx,%r15 + movq 32(%rcx),%rbp + andq %rdx,%rax + movq 40(%rcx),%rsi + andq %rdx,%rbx + movq 0(%rsp),%rcx + andq %rdx,%rbp + andq %rdx,%rsi + + subq %r8,%r14 + sbbq %r9,%r15 + sbbq 
%r10,%rax + sbbq %r11,%rbx + sbbq %r12,%rbp + sbbq %r13,%rsi + + orq %rcx,%rcx + + cmovzq %r8,%r14 + cmovzq %r9,%r15 + cmovzq %r10,%rax + movq %r14,0(%rdi) + cmovzq %r11,%rbx + movq %r15,8(%rdi) + cmovzq %r12,%rbp + movq %rax,16(%rdi) + cmovzq %r13,%rsi + movq %rbx,24(%rdi) + movq %rbp,32(%rdi) + movq %rsi,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _sub_mod_384 +.private_extern _sub_mod_384 + +.p2align 5 +_sub_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __sub_mod_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__sub_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + subq 0(%rdx),%r8 + movq 0(%rcx),%r14 + sbbq 8(%rdx),%r9 + movq 8(%rcx),%r15 + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rax + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbx + sbbq 32(%rdx),%r12 + movq 32(%rcx),%rbp + sbbq 40(%rdx),%r13 + movq 40(%rcx),%rsi + sbbq %rdx,%rdx + + andq %rdx,%r14 + andq %rdx,%r15 + andq %rdx,%rax + andq %rdx,%rbx + andq %rdx,%rbp + andq %rdx,%rsi + + addq %r14,%r8 + adcq %r15,%r9 + movq %r8,0(%rdi) + adcq %rax,%r10 + movq %r9,8(%rdi) + adcq %rbx,%r11 + movq %r10,16(%rdi) + adcq %rbp,%r12 + movq %r11,24(%rdi) + adcq %rsi,%r13 + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sub_mod_384x +.private_extern _sub_mod_384x + +.p2align 5 +_sub_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $24,%rsp +.cfi_adjust_cfa_offset 24 + + + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + leaq 48(%rsi),%rsi + leaq 48(%rdx),%rdx + leaq 48(%rdi),%rdi + call __sub_mod_384 + + movq 0(%rsp),%rsi + movq 8(%rsp),%rdx + leaq -48(%rdi),%rdi + call __sub_mod_384 + + movq 24+0(%rsp),%r15 +.cfi_restore %r15 + movq 24+8(%rsp),%r14 +.cfi_restore %r14 + movq 24+16(%rsp),%r13 +.cfi_restore %r13 + movq 24+24(%rsp),%r12 +.cfi_restore %r12 + movq 24+32(%rsp),%rbx +.cfi_restore %rbx + movq 24+40(%rsp),%rbp +.cfi_restore %rbp + leaq 24+48(%rsp),%rsp +.cfi_adjust_cfa_offset -24-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _mul_by_1_plus_i_mod_384x +.private_extern _mul_by_1_plus_i_mod_384x + 
+.p2align 5 +_mul_by_1_plus_i_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $56,%rsp +.cfi_adjust_cfa_offset 56 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %r8,%r14 + addq 48(%rsi),%r8 + movq %r9,%r15 + adcq 56(%rsi),%r9 + movq %r10,%rax + adcq 64(%rsi),%r10 + movq %r11,%rbx + adcq 72(%rsi),%r11 + movq %r12,%rcx + adcq 80(%rsi),%r12 + movq %r13,%rbp + adcq 88(%rsi),%r13 + movq %rdi,48(%rsp) + sbbq %rdi,%rdi + + subq 48(%rsi),%r14 + sbbq 56(%rsi),%r15 + sbbq 64(%rsi),%rax + sbbq 72(%rsi),%rbx + sbbq 80(%rsi),%rcx + sbbq 88(%rsi),%rbp + sbbq %rsi,%rsi + + movq %r8,0(%rsp) + movq 0(%rdx),%r8 + movq %r9,8(%rsp) + movq 8(%rdx),%r9 + movq %r10,16(%rsp) + movq 16(%rdx),%r10 + movq %r11,24(%rsp) + movq 24(%rdx),%r11 + movq %r12,32(%rsp) + andq %rsi,%r8 + movq 32(%rdx),%r12 + movq %r13,40(%rsp) + andq %rsi,%r9 + movq 40(%rdx),%r13 + andq %rsi,%r10 + andq %rsi,%r11 + andq %rsi,%r12 + andq %rsi,%r13 + movq 48(%rsp),%rsi + + addq %r8,%r14 + movq 0(%rsp),%r8 + adcq %r9,%r15 + movq 8(%rsp),%r9 + adcq %r10,%rax + movq 16(%rsp),%r10 + adcq %r11,%rbx + movq 24(%rsp),%r11 + adcq %r12,%rcx + movq 32(%rsp),%r12 + adcq %r13,%rbp + movq 40(%rsp),%r13 + + movq %r14,0(%rsi) + movq %r8,%r14 + movq %r15,8(%rsi) + movq %rax,16(%rsi) + movq %r9,%r15 + movq %rbx,24(%rsi) + movq %rcx,32(%rsi) + movq %r10,%rax + movq %rbp,40(%rsi) + + subq 0(%rdx),%r8 + movq %r11,%rbx + sbbq 8(%rdx),%r9 + sbbq 16(%rdx),%r10 + movq %r12,%rcx + sbbq 24(%rdx),%r11 + sbbq 32(%rdx),%r12 + movq %r13,%rbp + sbbq 40(%rdx),%r13 + sbbq $0,%rdi + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,48(%rsi) + cmovcq %rbx,%r11 + movq %r9,56(%rsi) + cmovcq %rcx,%r12 + movq %r10,64(%rsi) + cmovcq %rbp,%r13 + movq %r11,72(%rsi) + movq %r12,80(%rsi) + movq %r13,88(%rsi) + + movq 56+0(%rsp),%r15 +.cfi_restore %r15 + movq 56+8(%rsp),%r14 +.cfi_restore %r14 + movq 56+16(%rsp),%r13 +.cfi_restore %r13 + movq 56+24(%rsp),%r12 +.cfi_restore %r12 + movq 56+32(%rsp),%rbx +.cfi_restore %rbx + movq 56+40(%rsp),%rbp +.cfi_restore %rbp + leaq 56+48(%rsp),%rsp +.cfi_adjust_cfa_offset -56-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _sgn0_pty_mod_384 +.private_extern _sgn0_pty_mod_384 + +.p2align 5 +_sgn0_pty_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%rcx + movq 40(%rdi),%rdx + + xorq %rax,%rax + movq %r8,%rdi + addq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %rcx,%rcx + adcq %rdx,%rdx + adcq $0,%rax + + subq 0(%rsi),%r8 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%rcx + sbbq 40(%rsi),%rdx + sbbq $0,%rax + + notq %rax + andq $1,%rdi + andq $2,%rax + orq %rdi,%rax + + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sgn0_pty_mod_384x +.private_extern _sgn0_pty_mod_384x + +.p2align 5 +_sgn0_pty_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 
0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%rcx + movq 40(%rdi),%rdx + + movq %r8,%rbx + orq %r9,%r8 + orq %r10,%r8 + orq %r11,%r8 + orq %rcx,%r8 + orq %rdx,%r8 + + xorq %rax,%rax + movq %rbx,%rbp + addq %rbx,%rbx + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %rcx,%rcx + adcq %rdx,%rdx + adcq $0,%rax + + subq 0(%rsi),%rbx + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%rcx + sbbq 40(%rsi),%rdx + sbbq $0,%rax + + movq %r8,0(%rsp) + notq %rax + andq $1,%rbp + andq $2,%rax + orq %rbp,%rax + + movq 48(%rdi),%r8 + movq 56(%rdi),%r9 + movq 64(%rdi),%r10 + movq 72(%rdi),%r11 + movq 80(%rdi),%rcx + movq 88(%rdi),%rdx + + movq %r8,%rbx + orq %r9,%r8 + orq %r10,%r8 + orq %r11,%r8 + orq %rcx,%r8 + orq %rdx,%r8 + + xorq %rdi,%rdi + movq %rbx,%rbp + addq %rbx,%rbx + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %rcx,%rcx + adcq %rdx,%rdx + adcq $0,%rdi + + subq 0(%rsi),%rbx + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%rcx + sbbq 40(%rsi),%rdx + sbbq $0,%rdi + + movq 0(%rsp),%rbx + + notq %rdi + + testq %r8,%r8 + cmovnzq %rdi,%rax + + testq %rbx,%rbx + cmovzq %rdi,%rbp + + andq $1,%rbp + andq $2,%rax + orq %rbp,%rax + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/build/mach-o/add_mod_384x384-x86_64.s b/build/mach-o/add_mod_384x384-x86_64.s new file mode 100644 index 00000000..2dc58f81 --- /dev/null +++ b/build/mach-o/add_mod_384x384-x86_64.s @@ -0,0 +1,244 @@ +.text + + +.p2align 5 +__add_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + addq 0(%rdx),%r8 + movq 56(%rsi),%r15 + adcq 8(%rdx),%r9 + movq 64(%rsi),%rax + adcq 16(%rdx),%r10 + movq 72(%rsi),%rbx + adcq 24(%rdx),%r11 + movq 80(%rsi),%rbp + adcq 32(%rdx),%r12 + movq 88(%rsi),%rsi + adcq 40(%rdx),%r13 + movq %r8,0(%rdi) + adcq 48(%rdx),%r14 + movq %r9,8(%rdi) + adcq 56(%rdx),%r15 + movq %r10,16(%rdi) + adcq 64(%rdx),%rax + movq %r12,32(%rdi) + movq %r14,%r8 + adcq 72(%rdx),%rbx + movq %r11,24(%rdi) + movq %r15,%r9 + adcq 80(%rdx),%rbp + movq %r13,40(%rdi) + movq %rax,%r10 + adcq 88(%rdx),%rsi + movq %rbx,%r11 + sbbq %rdx,%rdx + + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + movq %rbp,%r12 + sbbq 16(%rcx),%rax + sbbq 24(%rcx),%rbx + sbbq 32(%rcx),%rbp + movq %rsi,%r13 + sbbq 40(%rcx),%rsi + sbbq $0,%rdx + + cmovcq %r8,%r14 + cmovcq %r9,%r15 + cmovcq %r10,%rax + movq %r14,48(%rdi) + cmovcq %r11,%rbx + movq %r15,56(%rdi) + cmovcq %r12,%rbp + movq %rax,64(%rdi) + cmovcq %r13,%rsi + movq %rbx,72(%rdi) + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__sub_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + subq 0(%rdx),%r8 + movq 56(%rsi),%r15 + sbbq 8(%rdx),%r9 + movq 64(%rsi),%rax + sbbq 16(%rdx),%r10 + movq 72(%rsi),%rbx + sbbq 24(%rdx),%r11 + movq 80(%rsi),%rbp + sbbq 32(%rdx),%r12 + movq 88(%rsi),%rsi + sbbq 40(%rdx),%r13 + movq %r8,0(%rdi) + sbbq 48(%rdx),%r14 + movq 0(%rcx),%r8 + movq %r9,8(%rdi) + sbbq 56(%rdx),%r15 + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + sbbq 64(%rdx),%rax + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + sbbq 72(%rdx),%rbx + movq 
24(%rcx),%r11 + movq %r12,32(%rdi) + sbbq 80(%rdx),%rbp + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + sbbq 88(%rdx),%rsi + movq 40(%rcx),%r13 + sbbq %rdx,%rdx + + andq %rdx,%r8 + andq %rdx,%r9 + andq %rdx,%r10 + andq %rdx,%r11 + andq %rdx,%r12 + andq %rdx,%r13 + + addq %r8,%r14 + adcq %r9,%r15 + movq %r14,48(%rdi) + adcq %r10,%rax + movq %r15,56(%rdi) + adcq %r11,%rbx + movq %rax,64(%rdi) + adcq %r12,%rbp + movq %rbx,72(%rdi) + adcq %r13,%rsi + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _add_mod_384x384 +.private_extern _add_mod_384x384 + +.p2align 5 +_add_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __add_mod_384x384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sub_mod_384x384 +.private_extern _sub_mod_384x384 + +.p2align 5 +_sub_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __sub_mod_384x384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/build/mach-o/inverse_mod_384-x86_64.s b/build/mach-o/inverse_mod_384-x86_64.s new file mode 100644 index 00000000..e9309d9b --- /dev/null +++ b/build/mach-o/inverse_mod_384-x86_64.s @@ -0,0 +1,370 @@ +.text + +.p2align 5 +L$one: +.quad 1,0,0,0,0,0,0,0 + +.globl _eucl_inverse_mod_384 +.private_extern _eucl_inverse_mod_384 + +.p2align 5 +_eucl_inverse_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $216,%rsp +.cfi_adjust_cfa_offset 216 + + + movq %rdi,0(%rsp) + leaq L$one(%rip),%rbp + cmpq $0,%rcx + cmoveq %rbp,%rcx + + movq 0(%rsi),%rax + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rax,%r8 + orq %r9,%rax + orq %r10,%rax + orq %r11,%rax + orq %r12,%rax + orq %r13,%rax + jz L$abort + + leaq 16(%rsp),%rsi + movq 0(%rcx),%r14 + movq 8(%rcx),%r15 + movq 
16(%rcx),%rax + movq 24(%rcx),%rbx + movq 32(%rcx),%rbp + movq 40(%rcx),%rdi + + movq %r8,0(%rsi) + movq %r9,8(%rsi) + movq %r10,16(%rsi) + movq %r11,24(%rsi) + movq %r12,32(%rsi) + movq %r13,40(%rsi) + + leaq 112(%rsp),%rcx + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + movq 32(%rdx),%r12 + movq 40(%rdx),%r13 + + movq %r14,48(%rsi) + movq %r15,56(%rsi) + movq %rax,64(%rsi) + movq %rbx,72(%rsi) + movq %rbp,80(%rsi) + movq %rdi,88(%rsi) + + movq %r8,0(%rcx) + movq %r9,8(%rcx) + movq %r10,16(%rcx) + movq %r11,24(%rcx) + movq %r12,32(%rcx) + movq %r13,40(%rcx) + + xorl %eax,%eax + movq %rax,48(%rcx) + movq %rax,56(%rcx) + movq %rax,64(%rcx) + movq %rax,72(%rcx) + movq %rax,80(%rcx) + movq %rax,88(%rcx) + jmp L$oop_inv + +.p2align 5 +L$oop_inv: + leaq 112(%rsp),%rsi + call __remove_powers_of_2 + + leaq 16(%rsp),%rsi + call __remove_powers_of_2 + + leaq 112(%rsp),%rcx + subq 112+0(%rsp),%r8 + sbbq 8(%rcx),%r9 + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + sbbq 40(%rcx),%r13 + jae L$u_greater_than_v + + + xchgq %rcx,%rsi + + notq %r8 + notq %r9 + notq %r10 + notq %r11 + notq %r12 + notq %r13 + + addq $1,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + +L$u_greater_than_v: + movq 48(%rsi),%r14 + movq 56(%rsi),%r15 + movq 64(%rsi),%rax + movq 72(%rsi),%rbx + movq 80(%rsi),%rbp + movq 88(%rsi),%rdi + + subq 48(%rcx),%r14 + sbbq 56(%rcx),%r15 + sbbq 64(%rcx),%rax + sbbq 72(%rcx),%rbx + sbbq 80(%rcx),%rbp + sbbq 88(%rcx),%rdi + + movq %r8,0(%rsi) + sbbq %r8,%r8 + movq %r9,8(%rsi) + movq %r8,%r9 + movq %r10,16(%rsi) + movq %r8,%r10 + movq %r11,24(%rsi) + movq %r8,%r11 + movq %r12,32(%rsi) + movq %r8,%r12 + movq %r13,40(%rsi) + movq %r8,%r13 + + andq 0(%rdx),%r8 + andq 8(%rdx),%r9 + andq 16(%rdx),%r10 + andq 24(%rdx),%r11 + andq 32(%rdx),%r12 + andq 40(%rdx),%r13 + + addq %r8,%r14 + adcq %r9,%r15 + adcq %r10,%rax + adcq %r11,%rbx + adcq %r12,%rbp + adcq %r13,%rdi + + movq %r14,48(%rsi) + movq %r15,56(%rsi) + movq %rax,64(%rsi) + movq %rbx,72(%rsi) + movq %rbp,80(%rsi) + movq %rdi,88(%rsi) + + movq 16+0(%rsp),%r8 + movq 16+8(%rsp),%r9 + movq 16+16(%rsp),%r10 + movq 16+24(%rsp),%r11 + orq %r9,%r8 + orq 16+32(%rsp),%r10 + orq 16+40(%rsp),%r11 +.byte 0x67 + orq %r10,%r8 + orq %r11,%r8 + jnz L$oop_inv + + leaq 112(%rsp),%rsi + movq 0(%rsp),%rdi + movl $1,%eax + + movq 48(%rsi),%r8 + movq 56(%rsi),%r9 + movq 64(%rsi),%r10 + movq 72(%rsi),%r11 + movq 80(%rsi),%r12 + movq 88(%rsi),%r13 + +L$abort: + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + leaq 216(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -216-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__remove_powers_of_2: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +L$oop_of_2: + bsfq %r8,%rcx + movl $63,%eax + cmovzl %eax,%ecx + + cmpl $0,%ecx + je L$oop_of_2_done + + shrq %cl,%r8 + movq %r9,%r14 + shrq %cl,%r9 + movq %r10,%r15 + shrq %cl,%r10 + movq %r11,%rax + shrq %cl,%r11 + movq %r12,%rbx + shrq %cl,%r12 + movq %r13,%rbp + shrq %cl,%r13 + negb %cl + shlq %cl,%r14 + shlq %cl,%r15 + orq %r14,%r8 + movq 48(%rsi),%r14 + shlq 
%cl,%rax + orq %r15,%r9 + movq 56(%rsi),%r15 + shlq %cl,%rbx + orq %rax,%r10 + movq 64(%rsi),%rax + shlq %cl,%rbp + orq %rbx,%r11 + movq 72(%rsi),%rbx + orq %rbp,%r12 + movq 80(%rsi),%rbp + negb %cl + movq 88(%rsi),%rdi + + movq %r8,0(%rsi) + movq %r9,8(%rsi) + movq %r10,16(%rsi) + movq %r11,24(%rsi) + movq %r12,32(%rsi) + movq %r13,40(%rsi) + jmp L$oop_div_by_2 + +.p2align 5 +L$oop_div_by_2: + movq $1,%r13 + movq 0(%rdx),%r8 + andq %r14,%r13 + movq 8(%rdx),%r9 + negq %r13 + movq 16(%rdx),%r10 + andq %r13,%r8 + movq 24(%rdx),%r11 + andq %r13,%r9 + movq 32(%rdx),%r12 + andq %r13,%r10 + andq %r13,%r11 + andq %r13,%r12 + andq 40(%rdx),%r13 + + addq %r8,%r14 + adcq %r9,%r15 + adcq %r10,%rax + adcq %r11,%rbx + adcq %r12,%rbp + adcq %r13,%rdi + sbbq %r13,%r13 + + shrq $1,%r14 + movq %r15,%r8 + shrq $1,%r15 + movq %rax,%r9 + shrq $1,%rax + movq %rbx,%r10 + shrq $1,%rbx + movq %rbp,%r11 + shrq $1,%rbp + movq %rdi,%r12 + shrq $1,%rdi + shlq $63,%r8 + shlq $63,%r9 + orq %r8,%r14 + shlq $63,%r10 + orq %r9,%r15 + shlq $63,%r11 + orq %r10,%rax + shlq $63,%r12 + orq %r11,%rbx + shlq $63,%r13 + orq %r12,%rbp + orq %r13,%rdi + + decl %ecx + jnz L$oop_div_by_2 + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %r14,48(%rsi) + movq %r15,56(%rsi) + movq %rax,64(%rsi) + movq %rbx,72(%rsi) + movq %rbp,80(%rsi) + movq %rdi,88(%rsi) + + testq $1,%r8 +.byte 0x2e + jz L$oop_of_2 + +L$oop_of_2_done: + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/build/mach-o/mulq_mont_256-x86_64.s b/build/mach-o/mulq_mont_256-x86_64.s new file mode 100644 index 00000000..d83f5440 --- /dev/null +++ b/build/mach-o/mulq_mont_256-x86_64.s @@ -0,0 +1,706 @@ +.text + +.globl _mul_mont_sparse_256 +.private_extern _mul_mont_sparse_256 + +.p2align 5 +_mul_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rdx),%rax + movq 0(%rsi),%r13 + movq 8(%rsi),%r14 + movq 16(%rsi),%r12 + movq 24(%rsi),%rbp + movq %rdx,%rbx + + movq %rax,%r15 + mulq %r13 + movq %rax,%r9 + movq %r15,%rax + movq %rdx,%r10 + call __mulq_mont_sparse_256 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sqr_mont_sparse_256 +.private_extern _sqr_mont_sparse_256 + +.p2align 5 +_sqr_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%rax + movq %rcx,%r8 + movq 8(%rsi),%r14 + movq %rdx,%rcx + movq 16(%rsi),%r12 + leaq (%rsi),%rbx + movq 24(%rsi),%rbp + + movq 
%rax,%r15 + mulq %rax + movq %rax,%r9 + movq %r15,%rax + movq %rdx,%r10 + call __mulq_mont_sparse_256 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__mulq_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + mulq %r14 + addq %rax,%r10 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %r12 + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq %rbp + addq %rax,%r12 + movq 8(%rbx),%rax + adcq $0,%rdx + xorq %r14,%r14 + movq %rdx,%r13 + + movq %r9,%rdi + imulq %r8,%r9 + + + movq %rax,%r15 + mulq 0(%rsi) + addq %rax,%r10 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rax,%r12 + movq %r15,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rsi) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq %rdx,%r14 + xorq %r15,%r15 + + + mulq 0(%rcx) + addq %rax,%rdi + movq %r9,%rax + adcq %rdx,%rdi + + mulq 8(%rcx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %rdi,%r10 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rax,%r12 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + addq %rdx,%r13 + adcq $0,%r14 + adcq $0,%r15 + movq %r10,%rdi + imulq %r8,%r10 + + + movq %rax,%r9 + mulq 0(%rsi) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rsi) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq %rdx,%r15 + xorq %r9,%r9 + + + mulq 0(%rcx) + addq %rax,%rdi + movq %r10,%rax + adcq %rdx,%rdi + + mulq 8(%rcx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %rdi,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rax,%r13 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + addq %rdx,%r14 + adcq $0,%r15 + adcq $0,%r9 + movq %r11,%rdi + imulq %r8,%r11 + + + movq %rax,%r10 + mulq 0(%rsi) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rsi) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq %rdx,%r9 + xorq %r10,%r10 + + + mulq 0(%rcx) + addq %rax,%rdi + movq %r11,%rax + adcq %rdx,%rdi + + mulq 8(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %rdi,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + addq %rdx,%r15 + adcq $0,%r9 + adcq 
$0,%r10 + imulq %r8,%rax + movq 8(%rsp),%rsi + + + movq %rax,%r11 + mulq 0(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq %rdx,%r12 + + mulq 8(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r12,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + movq %r14,%rbx + addq %rbp,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %rdx,%r9 + adcq $0,%r10 + + + + + movq %r15,%r12 + subq 0(%rcx),%r13 + sbbq 8(%rcx),%r14 + sbbq 16(%rcx),%r15 + movq %r9,%rbp + sbbq 24(%rcx),%r9 + sbbq $0,%r10 + + cmovcq %rax,%r13 + cmovcq %rbx,%r14 + cmovcq %r12,%r15 + movq %r13,0(%rsi) + cmovcq %rbp,%r9 + movq %r14,8(%rsi) + movq %r15,16(%rsi) + movq %r9,24(%rsi) + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _from_mont_256 +.private_extern _from_mont_256 + +.p2align 5 +_from_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulq_by_1_mont_256 + + + + + + movq %r14,%r10 + movq %r15,%r11 + movq %r9,%r12 + + subq 0(%rbx),%r13 + sbbq 8(%rbx),%r14 + sbbq 16(%rbx),%r15 + sbbq 24(%rbx),%r9 + + cmovncq %r13,%rax + cmovncq %r14,%r10 + cmovncq %r15,%r11 + movq %rax,0(%rdi) + cmovncq %r9,%r12 + movq %r10,8(%rdi) + movq %r11,16(%rdi) + movq %r12,24(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _redc_mont_256 +.private_extern _redc_mont_256 + +.p2align 5 +_redc_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulq_by_1_mont_256 + + addq 32(%rsi),%r13 + adcq 40(%rsi),%r14 + movq %r13,%rax + adcq 48(%rsi),%r15 + movq %r14,%r10 + adcq 56(%rsi),%r9 + sbbq %rsi,%rsi + + + + + movq %r15,%r11 + subq 0(%rbx),%r13 + sbbq 8(%rbx),%r14 + sbbq 16(%rbx),%r15 + movq %r9,%r12 + sbbq 24(%rbx),%r9 + sbbq $0,%rsi + + cmovncq %r13,%rax + cmovncq %r14,%r10 + cmovncq %r15,%r11 + movq %rax,0(%rdi) + cmovncq %r9,%r12 + movq %r10,8(%rdi) + movq %r11,16(%rdi) + movq %r12,24(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__mulq_by_1_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r10 + movq 
16(%rsi),%r11 + movq 24(%rsi),%r12 + + movq %rax,%r13 + imulq %rcx,%rax + movq %rax,%r9 + + mulq 0(%rbx) + addq %rax,%r13 + movq %r9,%rax + adcq %rdx,%r13 + + mulq 8(%rbx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %r13,%r10 + adcq $0,%rdx + movq %rdx,%r13 + + mulq 16(%rbx) + movq %r10,%r14 + imulq %rcx,%r10 + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %r13,%r11 + adcq $0,%rdx + movq %rdx,%r13 + + mulq 24(%rbx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r13,%r12 + adcq $0,%rdx + movq %rdx,%r13 + + mulq 0(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq %rdx,%r14 + + mulq 8(%rbx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r11 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 16(%rbx) + movq %r11,%r15 + imulq %rcx,%r11 + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r12 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 24(%rbx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r14,%r13 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq %rdx,%r15 + + mulq 8(%rbx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rbx) + movq %r12,%r9 + imulq %rcx,%r12 + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r15,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rbx) + addq %rax,%r9 + movq %r12,%rax + adcq %rdx,%r9 + + mulq 8(%rbx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r9,%r15 + adcq $0,%rdx + movq %rdx,%r9 + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/build/mach-o/mulq_mont_384-x86_64.s b/build/mach-o/mulq_mont_384-x86_64.s new file mode 100644 index 00000000..7465af48 --- /dev/null +++ b/build/mach-o/mulq_mont_384-x86_64.s @@ -0,0 +1,3611 @@ +.text + + + + + + + + +.p2align 5 +__sub_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + subq 0(%rdx),%r8 + movq 56(%rsi),%r15 + sbbq 8(%rdx),%r9 + movq 64(%rsi),%rax + sbbq 16(%rdx),%r10 + movq 72(%rsi),%rbx + sbbq 24(%rdx),%r11 + movq 80(%rsi),%rbp + sbbq 32(%rdx),%r12 + movq 88(%rsi),%rsi + sbbq 40(%rdx),%r13 + movq %r8,0(%rdi) + sbbq 48(%rdx),%r14 + movq 0(%rcx),%r8 + movq %r9,8(%rdi) + sbbq 56(%rdx),%r15 + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + sbbq 64(%rdx),%rax + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + sbbq 72(%rdx),%rbx + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + sbbq 80(%rdx),%rbp + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + sbbq 88(%rdx),%rsi + movq 40(%rcx),%r13 + sbbq %rdx,%rdx + + andq %rdx,%r8 + andq %rdx,%r9 + andq %rdx,%r10 + andq %rdx,%r11 + andq %rdx,%r12 + andq %rdx,%r13 + + addq %r8,%r14 + adcq %r9,%r15 + movq %r14,48(%rdi) + adcq %r10,%rax + movq %r15,56(%rdi) + adcq %r11,%rbx + movq %rax,64(%rdi) + adcq %r12,%rbp + movq %rbx,72(%rdi) + adcq %r13,%rsi + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__add_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + 
adcq 16(%rdx),%r10 + movq %r8,%r14 + adcq 24(%rdx),%r11 + movq %r9,%r15 + adcq 32(%rdx),%r12 + movq %r10,%rax + adcq 40(%rdx),%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,0(%rdi) + cmovcq %rbx,%r11 + movq %r9,8(%rdi) + cmovcq %rbp,%r12 + movq %r10,16(%rdi) + cmovcq %rsi,%r13 + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__sub_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +__sub_mod_384_a_is_loaded: + subq 0(%rdx),%r8 + movq 0(%rcx),%r14 + sbbq 8(%rdx),%r9 + movq 8(%rcx),%r15 + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rax + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbx + sbbq 32(%rdx),%r12 + movq 32(%rcx),%rbp + sbbq 40(%rdx),%r13 + movq 40(%rcx),%rsi + sbbq %rdx,%rdx + + andq %rdx,%r14 + andq %rdx,%r15 + andq %rdx,%rax + andq %rdx,%rbx + andq %rdx,%rbp + andq %rdx,%rsi + + addq %r14,%r8 + adcq %r15,%r9 + movq %r8,0(%rdi) + adcq %rax,%r10 + movq %r9,8(%rdi) + adcq %rbx,%r11 + movq %r10,16(%rdi) + adcq %rbp,%r12 + movq %r11,24(%rdi) + adcq %rsi,%r13 + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _mul_mont_384x +.private_extern _mul_mont_384x + +.p2align 5 +_mul_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $328,%rsp +.cfi_adjust_cfa_offset 328 + + + movq %rdx,%rbx + movq %rdi,32(%rsp) + movq %rsi,24(%rsp) + movq %rdx,16(%rsp) + movq %rcx,8(%rsp) + movq %r8,0(%rsp) + + + + + leaq 40(%rsp),%rdi + call __mulq_384 + + + leaq 48(%rbx),%rbx + leaq 48(%rsi),%rsi + leaq 40+96(%rsp),%rdi + call __mulq_384 + + + movq 8(%rsp),%rcx + leaq -48(%rsi),%rdx + leaq 40+192+48(%rsp),%rdi + call __add_mod_384 + + movq 16(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq -48(%rdi),%rdi + call __add_mod_384 + + leaq (%rdi),%rbx + leaq 48(%rdi),%rsi + call __mulq_384 + + + leaq (%rdi),%rsi + leaq 40(%rsp),%rdx + movq 8(%rsp),%rcx + call __sub_mod_384x384 + + leaq (%rdi),%rsi + leaq -96(%rdi),%rdx + call __sub_mod_384x384 + + + leaq 40(%rsp),%rsi + leaq 40+96(%rsp),%rdx + leaq 40(%rsp),%rdi + call __sub_mod_384x384 + + movq %rcx,%rbx + + + leaq 40(%rsp),%rsi + movq 0(%rsp),%rcx + movq 32(%rsp),%rdi + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + + leaq 40+192(%rsp),%rsi + movq 0(%rsp),%rcx + leaq 48(%rdi),%rdi + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + leaq 328(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -328-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _sqr_mont_384x +.private_extern _sqr_mont_384x + +.p2align 5 +_sqr_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 
+.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + movq %rsi,16(%rsp) +.byte 102,72,15,110,199 + + + leaq 48(%rsi),%rdx + leaq 32(%rsp),%rdi + call __add_mod_384 + + + movq 16(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq 32+48(%rsp),%rdi + call __sub_mod_384 + + + movq 16(%rsp),%rsi + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rax + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + + call __mulq_mont_384 + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + movq %r14,%r12 + adcq %r9,%r9 + movq %r15,%r13 + adcq %r10,%r10 + movq %r8,%rax + adcq %r11,%r11 + movq %r9,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + movq %r10,%rbp + sbbq 16(%rcx),%r8 + sbbq 24(%rcx),%r9 + sbbq 32(%rcx),%r10 + movq %r11,%rsi + sbbq 40(%rcx),%r11 + sbbq $0,%rdx + + cmovcq %r12,%r14 + cmovcq %r13,%r15 + cmovcq %rax,%r8 + movq %r14,48(%rdi) + cmovcq %rbx,%r9 + movq %r15,56(%rdi) + cmovcq %rbp,%r10 + movq %r8,64(%rdi) + cmovcq %rsi,%r11 + movq %r9,72(%rdi) + movq %r10,80(%rdi) + movq %r11,88(%rdi) + + leaq 32(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rax + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%r12 + movq 32+24(%rsp),%r13 + + call __mulq_mont_384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _mul_382x +.private_extern _mul_382x + +.p2align 5 +_mul_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + leaq 96(%rdi),%rdi + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + movq %rdi,16(%rsp) + movq %rcx,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 48(%rsi),%r8 + adcq 56(%rsi),%r9 + adcq 64(%rsi),%r10 + adcq 72(%rsi),%r11 + adcq 80(%rsi),%r12 + adcq 88(%rsi),%r13 + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + movq 32(%rdx),%r12 + movq 40(%rdx),%r13 + + addq 48(%rdx),%r8 + adcq 56(%rdx),%r9 + adcq 64(%rdx),%r10 + adcq 72(%rdx),%r11 + adcq 80(%rdx),%r12 + adcq 88(%rdx),%r13 + + movq %r8,32+48(%rsp) + movq %r9,32+56(%rsp) + movq %r10,32+64(%rsp) + movq %r11,32+72(%rsp) + movq %r12,32+80(%rsp) + movq %r13,32+88(%rsp) + + + leaq 32+0(%rsp),%rsi + leaq 32+48(%rsp),%rbx + call __mulq_384 + + + movq 0(%rsp),%rsi + movq 8(%rsp),%rbx + leaq -96(%rdi),%rdi + call __mulq_384 + + + leaq 48(%rsi),%rsi + leaq 48(%rbx),%rbx + 
leaq 32(%rsp),%rdi + call __mulq_384 + + + movq 16(%rsp),%rsi + leaq 32(%rsp),%rdx + movq 24(%rsp),%rcx + movq %rsi,%rdi + call __sub_mod_384x384 + + + leaq 0(%rdi),%rsi + leaq -96(%rdi),%rdx + call __sub_mod_384x384 + + + leaq -96(%rdi),%rsi + leaq 32(%rsp),%rdx + leaq -96(%rdi),%rdi + call __sub_mod_384x384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _sqr_382x +.private_extern _sqr_382x + +.p2align 5 +_sqr_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rcx + + + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%rbx + movq 32(%rsi),%rbp + movq 40(%rsi),%rdx + + movq %r14,%r8 + addq 48(%rsi),%r14 + movq %r15,%r9 + adcq 56(%rsi),%r15 + movq %rax,%r10 + adcq 64(%rsi),%rax + movq %rbx,%r11 + adcq 72(%rsi),%rbx + movq %rbp,%r12 + adcq 80(%rsi),%rbp + movq %rdx,%r13 + adcq 88(%rsi),%rdx + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %rax,16(%rdi) + movq %rbx,24(%rdi) + movq %rbp,32(%rdi) + movq %rdx,40(%rdi) + + + leaq 48(%rsi),%rdx + leaq 48(%rdi),%rdi + call __sub_mod_384_a_is_loaded + + + leaq (%rdi),%rsi + leaq -48(%rdi),%rbx + leaq -48(%rdi),%rdi + call __mulq_384 + + + movq (%rsp),%rsi + leaq 48(%rsi),%rbx + leaq 96(%rdi),%rdi + call __mulq_384 + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq 40(%rdi),%r13 + movq 48(%rdi),%r14 + movq 56(%rdi),%r15 + movq 64(%rdi),%rax + movq 72(%rdi),%rbx + movq 80(%rdi),%rbp + addq %r8,%r8 + movq 88(%rdi),%rdx + adcq %r9,%r9 + movq %r8,0(%rdi) + adcq %r10,%r10 + movq %r9,8(%rdi) + adcq %r11,%r11 + movq %r10,16(%rdi) + adcq %r12,%r12 + movq %r11,24(%rdi) + adcq %r13,%r13 + movq %r12,32(%rdi) + adcq %r14,%r14 + movq %r13,40(%rdi) + adcq %r15,%r15 + movq %r14,48(%rdi) + adcq %rax,%rax + movq %r15,56(%rdi) + adcq %rbx,%rbx + movq %rax,64(%rdi) + adcq %rbp,%rbp + movq %rbx,72(%rdi) + adcq %rdx,%rdx + movq %rbp,80(%rdi) + movq %rdx,88(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -8*7 + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _mul_384 +.private_extern _mul_384 + +.p2align 5 +_mul_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + + + movq %rdx,%rbx + call __mulq_384 + + movq 0(%rsp),%r12 +.cfi_restore %r12 + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__mulq_384: +.cfi_startproc + .byte 
0xf3,0x0f,0x1e,0xfa + + movq 0(%rbx),%rax + + movq %rax,%rbp + mulq 0(%rsi) + movq %rax,0(%rdi) + movq %rbp,%rax + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r11 + movq 8(%rbx),%rax + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,8(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,16(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,24(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 32(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,32(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 40(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,40(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 
+ + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq %rax,%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rcx,48(%rdi) + movq %r8,56(%rdi) + movq %r9,64(%rdi) + movq %r10,72(%rdi) + movq %r11,80(%rdi) + movq %r12,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _sqr_384 +.private_extern _sqr_384 + +.p2align 5 +_sqr_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __sqrq_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__sqrq_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r15 + movq 16(%rsi),%rcx + movq 24(%rsi),%rbx + + + movq %rax,%r14 + mulq %r15 + movq %rax,%r9 + movq %r14,%rax + movq 32(%rsi),%rbp + movq %rdx,%r10 + + mulq %rcx + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + movq 40(%rsi),%rsi + movq %rdx,%r11 + + mulq %rbx + addq %rax,%r11 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq %rbp + addq %rax,%r12 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r13 + + mulq %rsi + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + + mulq %rax + xorq %r8,%r8 + movq %rax,0(%rdi) + movq %r15,%rax + addq %r9,%r9 + adcq $0,%r8 + addq %rdx,%r9 + adcq $0,%r8 + movq %r9,8(%rdi) + + mulq %rcx + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq %rbx + addq %rax,%r12 + movq %r15,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq %rbp + addq %rax,%r13 + movq %r15,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq %rsi + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq %rax + xorq %r9,%r9 + addq %rax,%r8 + movq %rcx,%rax + addq %r10,%r10 + adcq %r11,%r11 + adcq $0,%r9 + addq %r8,%r10 + adcq %rdx,%r11 + adcq $0,%r9 + movq %r10,16(%rdi) + + mulq %rbx + addq %rax,%r13 + movq %rcx,%rax + adcq $0,%rdx + movq %r11,24(%rdi) + movq %rdx,%r8 + + mulq %rbp + addq %rax,%r14 + movq %rcx,%rax + adcq $0,%rdx + addq %r8,%r14 + adcq $0,%rdx + movq %rdx,%r8 + + mulq %rsi + addq %rax,%r15 + movq %rcx,%rax + adcq $0,%rdx + addq %r8,%r15 + adcq $0,%rdx + movq %rdx,%rcx + + mulq %rax + xorq %r11,%r11 + addq %rax,%r9 + movq %rbx,%rax + addq %r12,%r12 + adcq %r13,%r13 + adcq $0,%r11 + addq %r9,%r12 + adcq %rdx,%r13 + adcq $0,%r11 + movq %r12,32(%rdi) + + + mulq %rbp + addq %rax,%r15 + movq %rbx,%rax + adcq $0,%rdx + movq %r13,40(%rdi) + movq %rdx,%r8 + + mulq %rsi + addq %rax,%rcx + movq %rbx,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%rbx + + mulq %rax 
+ xorq %r12,%r12 + addq %rax,%r11 + movq %rbp,%rax + addq %r14,%r14 + adcq %r15,%r15 + adcq $0,%r12 + addq %r11,%r14 + adcq %rdx,%r15 + movq %r14,48(%rdi) + adcq $0,%r12 + movq %r15,56(%rdi) + + + mulq %rsi + addq %rax,%rbx + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq %rax + xorq %r13,%r13 + addq %rax,%r12 + movq %rsi,%rax + addq %rcx,%rcx + adcq %rbx,%rbx + adcq $0,%r13 + addq %r12,%rcx + adcq %rdx,%rbx + movq %rcx,64(%rdi) + adcq $0,%r13 + movq %rbx,72(%rdi) + + + mulq %rax + addq %r13,%rax + addq %rbp,%rbp + adcq $0,%rdx + addq %rbp,%rax + adcq $0,%rdx + movq %rax,80(%rdi) + movq %rdx,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sqr_mont_384 +.private_extern _sqr_mont_384 + +.p2align 5 +_sqr_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $120,%rsp +.cfi_adjust_cfa_offset 8*15 + + + movq %rcx,96(%rsp) + movq %rdx,104(%rsp) + movq %rdi,112(%rsp) + + movq %rsp,%rdi + call __sqrq_384 + + leaq 0(%rsp),%rsi + movq 96(%rsp),%rcx + movq 104(%rsp),%rbx + movq 112(%rsp),%rdi + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + leaq 120(%rsp),%r8 + movq 120(%rsp),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -8*21 + + .byte 0xf3,0xc3 +.cfi_endproc + + + + +.globl _redc_mont_384 +.private_extern _redc_mont_384 + +.p2align 5 +_redc_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + + + + +.globl _from_mont_384 +.private_extern _from_mont_384 + +.p2align 5 +_from_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulq_by_1_mont_384 + + + + + + movq %r15,%rcx + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 
40(%rbx),%r11 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__mulq_by_1_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rax,%r14 + imulq %rcx,%rax + movq %rax,%r8 + + mulq 0(%rbx) + addq %rax,%r14 + movq %r8,%rax + adcq %rdx,%r14 + + mulq 8(%rbx) + addq %rax,%r9 + movq %r8,%rax + adcq $0,%rdx + addq %r14,%r9 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 16(%rbx) + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + addq %r14,%r10 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 24(%rbx) + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %r9,%r15 + imulq %rcx,%r9 + addq %r14,%r11 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 32(%rbx) + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %r14,%r12 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 40(%rbx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %r14,%r13 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rbx) + addq %rax,%r15 + movq %r9,%rax + adcq %rdx,%r15 + + mulq 8(%rbx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %r15,%r10 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rbx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %r15,%r11 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rbx) + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + movq %r10,%r8 + imulq %rcx,%r10 + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 32(%rbx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 40(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %r15,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rbx) + addq %rax,%r8 + movq %r10,%rax + adcq %rdx,%r8 + + mulq 8(%rbx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rbx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r8,%r12 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 24(%rbx) + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + movq %r11,%r9 + imulq %rcx,%r11 + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %r8,%r14 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %r8,%r15 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 0(%rbx) + addq %rax,%r9 + movq %r11,%rax + adcq %rdx,%r9 + + mulq 8(%rbx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rbx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rbx) + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + movq %r12,%r10 + imulq %rcx,%r12 + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %r9,%r15 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rbx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 0(%rbx) + addq %rax,%r10 
+ movq %r12,%rax + adcq %rdx,%r10 + + mulq 8(%rbx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 24(%rbx) + addq %rax,%r15 + movq %r12,%rax + adcq $0,%rdx + movq %r13,%r11 + imulq %rcx,%r13 + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rbx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r8 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rbx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 0(%rbx) + addq %rax,%r11 + movq %r13,%rax + adcq %rdx,%r11 + + mulq 8(%rbx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 24(%rbx) + addq %rax,%r8 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rbx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r9 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rbx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__redc_tail_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + addq 48(%rsi),%r14 + movq %r14,%rax + adcq 56(%rsi),%r15 + adcq 64(%rsi),%r8 + adcq 72(%rsi),%r9 + movq %r15,%rcx + adcq 80(%rsi),%r10 + adcq 88(%rsi),%r11 + sbbq %r12,%r12 + + + + + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sgn0_pty_mont_384 +.private_extern _sgn0_pty_mont_384 + +.p2align 5 +_sgn0_pty_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + leaq 0(%rdi),%rsi + movq %rdx,%rcx + call __mulq_by_1_mont_384 + + xorq %rax,%rax + movq %r14,%r13 + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + notq %rax + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sgn0_pty_mont_384x +.private_extern _sgn0_pty_mont_384x + +.p2align 5 +_sgn0_pty_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp 
+.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + leaq 48(%rdi),%rsi + movq %rdx,%rcx + call __mulq_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + leaq 0(%rdi),%rsi + xorq %rdi,%rdi + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rdi + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rdi + + movq %r14,0(%rsp) + notq %rdi + andq $1,%r13 + andq $2,%rdi + orq %r13,%rdi + + call __mulq_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + xorq %rax,%rax + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + movq 0(%rsp),%r12 + + notq %rax + + testq %r14,%r14 + cmovzq %rdi,%r13 + + testq %r12,%r12 + cmovnzq %rdi,%rax + + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _mul_mont_384 +.private_extern _mul_mont_384 + +.p2align 5 +_mul_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %r8 +.cfi_adjust_cfa_offset 8 + + + movq 0(%rdx),%rax + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + movq %rdx,%rbx +.byte 102,72,15,110,199 + + call __mulq_mont_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__mulq_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rax,%rdi + mulq %r14 + movq %rax,%r8 + movq %rdi,%rax + movq %rdx,%r9 + + mulq %r15 + addq %rax,%r9 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %r12 + addq %rax,%r10 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r11 + + movq %r8,%rbp + imulq 8(%rsp),%r8 + + mulq %r13 + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r13 + + mulq 40(%rsi) + addq %rax,%r13 + movq %r8,%rax + adcq $0,%rdx + xorq %r15,%r15 + movq %rdx,%r14 + + mulq 0(%rcx) + addq %rax,%rbp 
+ movq %r8,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r9 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r9 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r10 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r13 + movq 8(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq %rdx,%r14 + adcq $0,%r15 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r9 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 8(%rsi) + addq %rax,%r10 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r10 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + movq %r9,%rbp + imulq 8(%rsp),%r9 + + mulq 24(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r12 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rsi) + addq %r8,%r14 + adcq $0,%rdx + xorq %r8,%r8 + addq %rax,%r14 + movq %r9,%rax + adcq %rdx,%r15 + adcq $0,%r8 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r9,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r10 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r14 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq %rdx,%r15 + adcq $0,%r8 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r10 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 8(%rsi) + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r11 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + movq %r10,%rbp + imulq 8(%rsp),%r10 + + mulq 24(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rsi) + addq %r9,%r15 + adcq $0,%rdx + xorq %r9,%r9 + addq %rax,%r15 + movq %r10,%rax + adcq %rdx,%r8 + adcq $0,%r9 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r10,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r15 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq %rdx,%r8 + adcq $0,%r9 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 8(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r12 + adcq $0,%rdx + movq 
%rdx,%r10 + + mulq 16(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + movq %r11,%rbp + imulq 8(%rsp),%r11 + + mulq 24(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r15 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rsi) + addq %r10,%r8 + adcq $0,%rdx + xorq %r10,%r10 + addq %rax,%r8 + movq %r11,%rax + adcq %rdx,%r9 + adcq $0,%r10 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r11,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r14 + adcq $0,%rdx + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r8 + movq 32(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r8 + adcq %rdx,%r9 + adcq $0,%r10 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 8(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + movq %r12,%rbp + imulq 8(%rsp),%r12 + + mulq 24(%rsi) + addq %rax,%r15 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rsi) + addq %rax,%r8 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %r11,%r9 + adcq $0,%rdx + xorq %r11,%r11 + addq %rax,%r9 + movq %r12,%rax + adcq %rdx,%r10 + adcq $0,%r11 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r12,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r8 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r9 + movq 40(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r9 + adcq %rdx,%r10 + adcq $0,%r11 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 8(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r14 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 16(%rsi) + addq %rax,%r15 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r15 + adcq $0,%rdx + movq %rdx,%r12 + + movq %r13,%rbp + imulq 8(%rsp),%r13 + + mulq 24(%rsi) + addq %rax,%r8 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r8 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rsi) + addq %rax,%r9 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r9 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 40(%rsi) + addq %r12,%r10 + adcq $0,%rdx + xorq %r12,%r12 + addq %rax,%r10 + movq %r13,%rax + adcq %rdx,%r11 + adcq $0,%r12 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r13,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %rbp,%r15 
+ adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %rbp,%r9 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %rbp,%r10 + adcq %rdx,%r11 + adcq $0,%r12 + + + + +.byte 102,72,15,126,199 + subq 0(%rcx),%r14 + movq %r15,%rdx + sbbq 8(%rcx),%r15 + movq %r8,%rbx + sbbq 16(%rcx),%r8 + movq %r9,%rsi + sbbq 24(%rcx),%r9 + movq %r10,%rbp + sbbq 32(%rcx),%r10 + movq %r11,%r13 + sbbq 40(%rcx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r14 + cmovcq %rdx,%r15 + cmovcq %rbx,%r8 + movq %r14,0(%rdi) + cmovcq %rsi,%r9 + movq %r15,8(%rdi) + cmovcq %rbp,%r10 + movq %r8,16(%rdi) + cmovcq %r13,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _sqr_n_mul_mont_384 +.private_extern _sqr_n_mul_mont_384 + +.p2align 5 +_sqr_n_mul_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 8*17 + + + movq %r8,0(%rsp) + movq %rcx,8(%rsp) +.byte 102,72,15,110,199 + leaq 32(%rsp),%rdi + movq %r9,24(%rsp) + movq (%r9),%xmm2 + +L$oop_sqr_384: + movd %edx,%xmm1 + + call __sqrq_384 + + leaq 0(%rdi),%rsi + movq 0(%rsp),%rcx + movq 8(%rsp),%rbx + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + movd %xmm1,%edx + leaq 0(%rdi),%rsi + decl %edx + jnz L$oop_sqr_384 + +.byte 102,72,15,126,208 + movq %rbx,%rcx + movq 24(%rsp),%rbx + + + + + + + movq %r8,%r12 + movq %r9,%r13 + + call __mulq_mont_384 + + leaq 136(%rsp),%r8 + movq 136(%rsp),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -8*23 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sqr_n_mul_mont_383 +.private_extern _sqr_n_mul_mont_383 + +.p2align 5 +_sqr_n_mul_mont_383: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 8*17 + + + movq %r8,0(%rsp) + movq %rcx,8(%rsp) +.byte 102,72,15,110,199 + leaq 32(%rsp),%rdi + movq %r9,24(%rsp) + movq (%r9),%xmm2 + +L$oop_sqr_383: + movd %edx,%xmm1 + + call __sqrq_384 + + leaq 0(%rdi),%rsi + movq 0(%rsp),%rcx + movq 8(%rsp),%rbx + call __mulq_by_1_mont_384 + + movd %xmm1,%edx + addq 48(%rsi),%r14 + adcq 56(%rsi),%r15 + adcq 64(%rsi),%r8 + adcq 72(%rsi),%r9 + adcq 80(%rsi),%r10 + adcq 88(%rsi),%r11 + leaq 0(%rdi),%rsi + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + decl %edx + jnz L$oop_sqr_383 + +.byte 102,72,15,126,208 + movq %rbx,%rcx + movq 24(%rsp),%rbx + + + + + + + 
movq %r8,%r12 + movq %r9,%r13 + + call __mulq_mont_384 + + leaq 136(%rsp),%r8 + movq 136(%rsp),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -8*23 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__mulq_mont_383_nonred: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rax,%rbp + mulq %r14 + movq %rax,%r8 + movq %rbp,%rax + movq %rdx,%r9 + + mulq %r15 + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %r12 + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + movq %r8,%r15 + imulq 8(%rsp),%r8 + + mulq %r13 + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r13 + + mulq 40(%rsi) + addq %rax,%r13 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rcx) + addq %rax,%r15 + movq %r8,%rax + adcq %rdx,%r15 + + mulq 8(%rcx) + addq %rax,%r9 + movq %r8,%rax + adcq $0,%rdx + addq %r15,%r9 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rcx) + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + addq %r15,%r10 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rcx) + addq %r15,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq 32(%rcx) + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 40(%rcx) + addq %rax,%r13 + movq 8(%rbx),%rax + adcq $0,%rdx + addq %r15,%r13 + adcq %rdx,%r14 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq 8(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r10 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r11 + adcq $0,%rdx + movq %rdx,%r15 + + movq %r9,%r8 + imulq 8(%rsp),%r9 + + mulq 24(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 32(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 40(%rsi) + addq %r15,%r14 + adcq $0,%rdx + addq %rax,%r14 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rcx) + addq %rax,%r8 + movq %r9,%rax + adcq %rdx,%r8 + + mulq 8(%rcx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %r8,%r10 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rcx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 24(%rcx) + addq %r8,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rcx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rcx) + addq %rax,%r14 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %r8,%r14 + adcq %rdx,%r15 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 8(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r12 + adcq $0,%rdx + movq %rdx,%r8 + + movq %r10,%r9 + imulq 8(%rsp),%r10 + + mulq 24(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r14 + adcq $0,%rdx + movq 
%rdx,%r8 + + mulq 40(%rsi) + addq %r8,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 0(%rcx) + addq %rax,%r9 + movq %r10,%rax + adcq %rdx,%r9 + + mulq 8(%rcx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r9,%r11 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rcx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rcx) + addq %r9,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rcx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rcx) + addq %rax,%r15 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %r9,%r15 + adcq %rdx,%r8 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 8(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + movq %r11,%r10 + imulq 8(%rsp),%r11 + + mulq 24(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rsi) + addq %rax,%r15 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r15 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rsi) + addq %r9,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 0(%rcx) + addq %rax,%r10 + movq %r11,%rax + adcq %rdx,%r10 + + mulq 8(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r10,%r12 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 24(%rcx) + addq %r10,%r14 + adcq $0,%rdx + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rcx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rcx) + addq %rax,%r8 + movq 32(%rbx),%rax + adcq $0,%rdx + addq %r10,%r8 + adcq %rdx,%r9 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 8(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + movq %r12,%r11 + imulq 8(%rsp),%r12 + + mulq 24(%rsi) + addq %rax,%r15 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r8 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rsi) + addq %r10,%r9 + adcq $0,%rdx + addq %rax,%r9 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 0(%rcx) + addq %rax,%r11 + movq %r12,%rax + adcq %rdx,%r11 + + mulq 8(%rcx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rcx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 24(%rcx) + addq %r11,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rcx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rcx) + addq %rax,%r9 + movq 40(%rbx),%rax + adcq $0,%rdx + addq %r11,%r9 + adcq %rdx,%r10 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 8(%rsi) + 
addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rsi) + addq %rax,%r15 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + movq %r13,%r12 + imulq 8(%rsp),%r13 + + mulq 24(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r9 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %r11,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 0(%rcx) + addq %rax,%r12 + movq %r13,%rax + adcq %rdx,%r12 + + mulq 8(%rcx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %r12,%r14 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 16(%rcx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r12,%r15 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 24(%rcx) + addq %r12,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rcx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %r12,%r9 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 40(%rcx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %r12,%r10 + adcq %rdx,%r11 + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _sqr_mont_382x +.private_extern _sqr_mont_382x + +.p2align 5 +_sqr_mont_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + movq %rsi,16(%rsp) + movq %rdi,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %r8,%r14 + addq 48(%rsi),%r8 + movq %r9,%r15 + adcq 56(%rsi),%r9 + movq %r10,%rax + adcq 64(%rsi),%r10 + movq %r11,%rdx + adcq 72(%rsi),%r11 + movq %r12,%rbx + adcq 80(%rsi),%r12 + movq %r13,%rbp + adcq 88(%rsi),%r13 + + subq 48(%rsi),%r14 + sbbq 56(%rsi),%r15 + sbbq 64(%rsi),%rax + sbbq 72(%rsi),%rdx + sbbq 80(%rsi),%rbx + sbbq 88(%rsi),%rbp + sbbq %rdi,%rdi + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + movq %r14,32+48(%rsp) + movq %r15,32+56(%rsp) + movq %rax,32+64(%rsp) + movq %rdx,32+72(%rsp) + movq %rbx,32+80(%rsp) + movq %rbp,32+88(%rsp) + movq %rdi,32+96(%rsp) + + + + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rax + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + + movq 24(%rsp),%rdi + call __mulq_mont_383_nonred + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + + movq %r14,48(%rdi) + movq %r15,56(%rdi) + movq %r8,64(%rdi) + movq %r9,72(%rdi) + movq %r10,80(%rdi) + movq %r11,88(%rdi) + + leaq 32(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rax + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%r12 + movq 32+24(%rsp),%r13 + + call __mulq_mont_383_nonred + movq 32+96(%rsp),%rsi + movq 32+0(%rsp),%r12 + movq 32+8(%rsp),%r13 + andq %rsi,%r12 + movq 32+16(%rsp),%rax + andq %rsi,%r13 + movq 32+24(%rsp),%rbx + andq %rsi,%rax + movq 32+32(%rsp),%rbp + andq %rsi,%rbx + andq %rsi,%rbp + andq 32+40(%rsp),%rsi 
+ + subq %r12,%r14 + movq 0(%rcx),%r12 + sbbq %r13,%r15 + movq 8(%rcx),%r13 + sbbq %rax,%r8 + movq 16(%rcx),%rax + sbbq %rbx,%r9 + movq 24(%rcx),%rbx + sbbq %rbp,%r10 + movq 32(%rcx),%rbp + sbbq %rsi,%r11 + sbbq %rsi,%rsi + + andq %rsi,%r12 + andq %rsi,%r13 + andq %rsi,%rax + andq %rsi,%rbx + andq %rsi,%rbp + andq 40(%rcx),%rsi + + addq %r12,%r14 + adcq %r13,%r15 + adcq %rax,%r8 + adcq %rbx,%r9 + adcq %rbp,%r10 + adcq %rsi,%r11 + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/build/mach-o/mulx_mont_256-x86_64.s b/build/mach-o/mulx_mont_256-x86_64.s new file mode 100644 index 00000000..178372f4 --- /dev/null +++ b/build/mach-o/mulx_mont_256-x86_64.s @@ -0,0 +1,619 @@ +.text + +.globl _mulx_mont_sparse_256 +.private_extern _mulx_mont_sparse_256 + +.p2align 5 +_mulx_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rbp + movq 24(%rsi),%r9 + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%rax,%r11 + call __mulx_mont_sparse_256 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sqrx_mont_sparse_256 +.private_extern _sqrx_mont_sparse_256 + +.p2align 5 +_sqrx_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + movq %rcx,%r8 + movq %rdx,%rcx + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rbp + movq 24(%rsi),%r9 + leaq -128(%rbx),%rsi + leaq -128(%rcx),%rcx + + mulxq %rdx,%rax,%r11 + call __mulx_mont_sparse_256 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__mulx_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + mulxq %r15,%r15,%r12 + mulxq %rbp,%rbp,%r13 + addq %r15,%r11 + mulxq %r9,%r9,%r14 + movq 8(%rbx),%rdx + adcq 
%rbp,%r12 + adcq %r9,%r13 + adcq $0,%r14 + + movq %rax,%r10 + imulq %r8,%rax + + + xorq %r15,%r15 + mulxq 0+128(%rsi),%rbp,%r9 + adoxq %rbp,%r11 + adcxq %r9,%r12 + + mulxq 8+128(%rsi),%rbp,%r9 + adoxq %rbp,%r12 + adcxq %r9,%r13 + + mulxq 16+128(%rsi),%rbp,%r9 + adoxq %rbp,%r13 + adcxq %r9,%r14 + + mulxq 24+128(%rsi),%rbp,%r9 + movq %rax,%rdx + adoxq %rbp,%r14 + adcxq %r15,%r9 + adoxq %r9,%r15 + + + mulxq 0+128(%rcx),%rbp,%rax + adcxq %rbp,%r10 + adoxq %r11,%rax + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%rax + adoxq %r9,%r12 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r12 + adoxq %r9,%r13 + + mulxq 24+128(%rcx),%rbp,%r9 + movq 16(%rbx),%rdx + adcxq %rbp,%r13 + adoxq %r9,%r14 + adcxq %r10,%r14 + adoxq %r10,%r15 + adcxq %r10,%r15 + adoxq %r10,%r10 + adcq $0,%r10 + movq %rax,%r11 + imulq %r8,%rax + + + xorq %rbp,%rbp + mulxq 0+128(%rsi),%rbp,%r9 + adoxq %rbp,%r12 + adcxq %r9,%r13 + + mulxq 8+128(%rsi),%rbp,%r9 + adoxq %rbp,%r13 + adcxq %r9,%r14 + + mulxq 16+128(%rsi),%rbp,%r9 + adoxq %rbp,%r14 + adcxq %r9,%r15 + + mulxq 24+128(%rsi),%rbp,%r9 + movq %rax,%rdx + adoxq %rbp,%r15 + adcxq %r10,%r9 + adoxq %r9,%r10 + + + mulxq 0+128(%rcx),%rbp,%rax + adcxq %rbp,%r11 + adoxq %r12,%rax + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%rax + adoxq %r9,%r13 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r13 + adoxq %r9,%r14 + + mulxq 24+128(%rcx),%rbp,%r9 + movq 24(%rbx),%rdx + adcxq %rbp,%r14 + adoxq %r9,%r15 + adcxq %r11,%r15 + adoxq %r11,%r10 + adcxq %r11,%r10 + adoxq %r11,%r11 + adcq $0,%r11 + movq %rax,%r12 + imulq %r8,%rax + + + xorq %rbp,%rbp + mulxq 0+128(%rsi),%rbp,%r9 + adoxq %rbp,%r13 + adcxq %r9,%r14 + + mulxq 8+128(%rsi),%rbp,%r9 + adoxq %rbp,%r14 + adcxq %r9,%r15 + + mulxq 16+128(%rsi),%rbp,%r9 + adoxq %rbp,%r15 + adcxq %r9,%r10 + + mulxq 24+128(%rsi),%rbp,%r9 + movq %rax,%rdx + adoxq %rbp,%r10 + adcxq %r11,%r9 + adoxq %r9,%r11 + + + mulxq 0+128(%rcx),%rbp,%rax + adcxq %rbp,%r12 + adoxq %r13,%rax + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%rax + adoxq %r9,%r14 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r14 + adoxq %r9,%r15 + + mulxq 24+128(%rcx),%rbp,%r9 + movq %rax,%rdx + adcxq %rbp,%r15 + adoxq %r9,%r10 + adcxq %r12,%r10 + adoxq %r12,%r11 + adcxq %r12,%r11 + adoxq %r12,%r12 + adcq $0,%r12 + imulq %r8,%rdx + + + xorq %rbp,%rbp + mulxq 0+128(%rcx),%r13,%r9 + adcxq %rax,%r13 + adoxq %r9,%r14 + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%r14 + adoxq %r9,%r15 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r15 + adoxq %r9,%r10 + + mulxq 24+128(%rcx),%rbp,%r9 + movq %r14,%rdx + leaq 128(%rcx),%rcx + adcxq %rbp,%r10 + adoxq %r9,%r11 + movq %r15,%rax + adcxq %r13,%r11 + adoxq %r13,%r12 + adcq $0,%r12 + + + + + movq %r10,%rbp + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + sbbq 16(%rcx),%r10 + movq %r11,%r9 + sbbq 24(%rcx),%r11 + sbbq $0,%r12 + + cmovcq %rdx,%r14 + cmovcq %rax,%r15 + cmovcq %rbp,%r10 + movq %r14,0(%rdi) + cmovcq %r9,%r11 + movq %r15,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _fromx_mont_256 +.private_extern _fromx_mont_256 + +.p2align 5 +_fromx_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call 
__mulx_by_1_mont_256 + + + + + + movq %r15,%rdx + movq %r10,%r12 + movq %r11,%r13 + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r10 + sbbq 24(%rbx),%r11 + + cmovncq %r14,%rax + cmovncq %r15,%rdx + cmovncq %r10,%r12 + movq %rax,0(%rdi) + cmovncq %r11,%r13 + movq %rdx,8(%rdi) + movq %r12,16(%rdi) + movq %r13,24(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _redcx_mont_256 +.private_extern _redcx_mont_256 + +.p2align 5 +_redcx_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulx_by_1_mont_256 + + addq 32(%rsi),%r14 + adcq 40(%rsi),%r15 + movq %r14,%rax + adcq 48(%rsi),%r10 + movq %r15,%rdx + adcq 56(%rsi),%r11 + sbbq %rsi,%rsi + + + + + movq %r10,%r12 + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r10 + movq %r11,%r13 + sbbq 24(%rbx),%r11 + sbbq $0,%rsi + + cmovncq %r14,%rax + cmovncq %r15,%rdx + cmovncq %r10,%r12 + movq %rax,0(%rdi) + cmovncq %r11,%r13 + movq %rdx,8(%rdi) + movq %r12,16(%rdi) + movq %r13,24(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__mulx_by_1_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r11 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + + movq %rax,%r14 + imulq %rcx,%rax + movq %rax,%r10 + + mulq 0(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq %rdx,%r14 + + mulq 8(%rbx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r11 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 16(%rbx) + movq %r11,%r15 + imulq %rcx,%r11 + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r12 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 24(%rbx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r14,%r13 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq %rdx,%r15 + + mulq 8(%rbx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rbx) + movq %r12,%r10 + imulq %rcx,%r12 + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r15,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rbx) + addq %rax,%r10 + movq %r12,%rax + adcq %rdx,%r10 + + mulq 8(%rbx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rbx) + movq %r13,%r11 + imulq %rcx,%r13 + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 24(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r10,%r15 
+ adcq $0,%rdx + movq %rdx,%r10 + + mulq 0(%rbx) + addq %rax,%r11 + movq %r13,%rax + adcq %rdx,%r11 + + mulq 8(%rbx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 24(%rbx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/build/mach-o/mulx_mont_384-x86_64.s b/build/mach-o/mulx_mont_384-x86_64.s new file mode 100644 index 00000000..065d9acd --- /dev/null +++ b/build/mach-o/mulx_mont_384-x86_64.s @@ -0,0 +1,2961 @@ +.text + + + + + + + + +.p2align 5 +__sub_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + subq 0(%rdx),%r8 + movq 56(%rsi),%r15 + sbbq 8(%rdx),%r9 + movq 64(%rsi),%rax + sbbq 16(%rdx),%r10 + movq 72(%rsi),%rbx + sbbq 24(%rdx),%r11 + movq 80(%rsi),%rbp + sbbq 32(%rdx),%r12 + movq 88(%rsi),%rsi + sbbq 40(%rdx),%r13 + movq %r8,0(%rdi) + sbbq 48(%rdx),%r14 + movq 0(%rcx),%r8 + movq %r9,8(%rdi) + sbbq 56(%rdx),%r15 + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + sbbq 64(%rdx),%rax + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + sbbq 72(%rdx),%rbx + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + sbbq 80(%rdx),%rbp + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + sbbq 88(%rdx),%rsi + movq 40(%rcx),%r13 + sbbq %rdx,%rdx + + andq %rdx,%r8 + andq %rdx,%r9 + andq %rdx,%r10 + andq %rdx,%r11 + andq %rdx,%r12 + andq %rdx,%r13 + + addq %r8,%r14 + adcq %r9,%r15 + movq %r14,48(%rdi) + adcq %r10,%rax + movq %r15,56(%rdi) + adcq %r11,%rbx + movq %rax,64(%rdi) + adcq %r12,%rbp + movq %rbx,72(%rdi) + adcq %r13,%rsi + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__add_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + movq %r8,%r14 + adcq 24(%rdx),%r11 + movq %r9,%r15 + adcq 32(%rdx),%r12 + movq %r10,%rax + adcq 40(%rdx),%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,0(%rdi) + cmovcq %rbx,%r11 + movq %r9,8(%rdi) + cmovcq %rbp,%r12 + movq %r10,16(%rdi) + cmovcq %rsi,%r13 + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__sub_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +__sub_mod_384_a_is_loaded: + subq 0(%rdx),%r8 + movq 0(%rcx),%r14 + sbbq 8(%rdx),%r9 + movq 8(%rcx),%r15 + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rax + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbx + sbbq 32(%rdx),%r12 + movq 32(%rcx),%rbp + sbbq 40(%rdx),%r13 + movq 40(%rcx),%rsi + sbbq %rdx,%rdx + + andq %rdx,%r14 + andq %rdx,%r15 + andq %rdx,%rax + andq %rdx,%rbx + andq %rdx,%rbp + andq %rdx,%rsi + + addq %r14,%r8 + adcq %r15,%r9 + movq %r8,0(%rdi) + adcq %rax,%r10 + movq %r9,8(%rdi) + adcq %rbx,%r11 + movq %r10,16(%rdi) + adcq %rbp,%r12 + movq %r11,24(%rdi) + adcq %rsi,%r13 + movq 
%r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _mulx_mont_384x +.private_extern _mulx_mont_384x + +.p2align 5 +_mulx_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $328,%rsp +.cfi_adjust_cfa_offset 328 + + + movq %rdx,%rbx + movq %rdi,32(%rsp) + movq %rsi,24(%rsp) + movq %rdx,16(%rsp) + movq %rcx,8(%rsp) + movq %r8,0(%rsp) + + + + + leaq 40(%rsp),%rdi + call __mulx_384 + + + leaq 48(%rbx),%rbx + leaq 128+48(%rsi),%rsi + leaq 96(%rdi),%rdi + call __mulx_384 + + + movq 8(%rsp),%rcx + leaq (%rbx),%rsi + leaq -48(%rbx),%rdx + leaq 40+192+48(%rsp),%rdi + call __add_mod_384 + + movq 24(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq -48(%rdi),%rdi + call __add_mod_384 + + leaq (%rdi),%rbx + leaq 48(%rdi),%rsi + call __mulx_384 + + + leaq (%rdi),%rsi + leaq 40(%rsp),%rdx + movq 8(%rsp),%rcx + call __sub_mod_384x384 + + leaq (%rdi),%rsi + leaq -96(%rdi),%rdx + call __sub_mod_384x384 + + + leaq 40(%rsp),%rsi + leaq 40+96(%rsp),%rdx + leaq 40(%rsp),%rdi + call __sub_mod_384x384 + + leaq (%rcx),%rbx + + + leaq 40(%rsp),%rsi + movq 0(%rsp),%rcx + movq 32(%rsp),%rdi + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + + leaq 40+192(%rsp),%rsi + movq 0(%rsp),%rcx + leaq 48(%rdi),%rdi + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + leaq 328(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -328-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _sqrx_mont_384x +.private_extern _sqrx_mont_384x + +.p2align 5 +_sqrx_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + + movq %rsi,16(%rsp) +.byte 102,72,15,110,199 + + + leaq 48(%rsi),%rdx + leaq 32(%rsp),%rdi + call __add_mod_384 + + + movq 16(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq 32+48(%rsp),%rdi + call __sub_mod_384 + + + movq 16(%rsp),%rsi + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + addq %rdx,%rdx + adcq %r15,%r15 + adcq %rax,%rax + movq %rdx,%r8 + adcq %r12,%r12 + movq %r15,%r9 + adcq %rdi,%rdi + movq %rax,%r10 + adcq %rbp,%rbp + movq %r12,%r11 + sbbq %rsi,%rsi + + subq 0(%rcx),%rdx + sbbq 8(%rcx),%r15 + movq %rdi,%r13 + sbbq 16(%rcx),%rax + sbbq 24(%rcx),%r12 + sbbq 32(%rcx),%rdi + movq %rbp,%r14 + sbbq 40(%rcx),%rbp + sbbq $0,%rsi + + cmovcq %r8,%rdx + cmovcq %r9,%r15 + cmovcq %r10,%rax + movq %rdx,48(%rbx) + cmovcq %r11,%r12 + movq 
%r15,56(%rbx) + cmovcq %r13,%rdi + movq %rax,64(%rbx) + cmovcq %r14,%rbp + movq %r12,72(%rbx) + movq %rdi,80(%rbx) + movq %rbp,88(%rbx) + + leaq 32(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rdx + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%rax + movq 32+24(%rsp),%r12 + movq 32+32(%rsp),%rdi + movq 32+40(%rsp),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _mulx_382x +.private_extern _mulx_382x + +.p2align 5 +_mulx_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + leaq 96(%rdi),%rdi + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + movq %rdi,16(%rsp) + movq %rcx,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 48(%rsi),%r8 + adcq 56(%rsi),%r9 + adcq 64(%rsi),%r10 + adcq 72(%rsi),%r11 + adcq 80(%rsi),%r12 + adcq 88(%rsi),%r13 + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + movq 32(%rdx),%r12 + movq 40(%rdx),%r13 + + addq 48(%rdx),%r8 + adcq 56(%rdx),%r9 + adcq 64(%rdx),%r10 + adcq 72(%rdx),%r11 + adcq 80(%rdx),%r12 + adcq 88(%rdx),%r13 + + movq %r8,32+48(%rsp) + movq %r9,32+56(%rsp) + movq %r10,32+64(%rsp) + movq %r11,32+72(%rsp) + movq %r12,32+80(%rsp) + movq %r13,32+88(%rsp) + + + leaq 32+0(%rsp),%rsi + leaq 32+48(%rsp),%rbx + call __mulx_384 + + + movq 0(%rsp),%rsi + movq 8(%rsp),%rbx + leaq -96(%rdi),%rdi + call __mulx_384 + + + leaq 48+128(%rsi),%rsi + leaq 48(%rbx),%rbx + leaq 32(%rsp),%rdi + call __mulx_384 + + + movq 16(%rsp),%rsi + leaq 32(%rsp),%rdx + movq 24(%rsp),%rcx + movq %rsi,%rdi + call __sub_mod_384x384 + + + leaq 0(%rdi),%rsi + leaq -96(%rdi),%rdx + call __sub_mod_384x384 + + + leaq -96(%rdi),%rsi + leaq 32(%rsp),%rdx + leaq -96(%rdi),%rdi + call __sub_mod_384x384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _sqrx_382x +.private_extern _sqrx_382x + +.p2align 5 +_sqrx_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 
+.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rcx + + + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%rbx + movq 32(%rsi),%rbp + movq 40(%rsi),%rdx + + movq %r14,%r8 + addq 48(%rsi),%r14 + movq %r15,%r9 + adcq 56(%rsi),%r15 + movq %rax,%r10 + adcq 64(%rsi),%rax + movq %rbx,%r11 + adcq 72(%rsi),%rbx + movq %rbp,%r12 + adcq 80(%rsi),%rbp + movq %rdx,%r13 + adcq 88(%rsi),%rdx + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %rax,16(%rdi) + movq %rbx,24(%rdi) + movq %rbp,32(%rdi) + movq %rdx,40(%rdi) + + + leaq 48(%rsi),%rdx + leaq 48(%rdi),%rdi + call __sub_mod_384_a_is_loaded + + + leaq (%rdi),%rsi + leaq -48(%rdi),%rbx + leaq -48(%rdi),%rdi + call __mulx_384 + + + movq (%rsp),%rsi + leaq 48(%rsi),%rbx + leaq 96(%rdi),%rdi + call __mulx_384 + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq 40(%rdi),%r13 + movq 48(%rdi),%r14 + movq 56(%rdi),%r15 + movq 64(%rdi),%rax + movq 72(%rdi),%rbx + movq 80(%rdi),%rbp + addq %r8,%r8 + movq 88(%rdi),%rdx + adcq %r9,%r9 + movq %r8,0(%rdi) + adcq %r10,%r10 + movq %r9,8(%rdi) + adcq %r11,%r11 + movq %r10,16(%rdi) + adcq %r12,%r12 + movq %r11,24(%rdi) + adcq %r13,%r13 + movq %r12,32(%rdi) + adcq %r14,%r14 + movq %r13,40(%rdi) + adcq %r15,%r15 + movq %r14,48(%rdi) + adcq %rax,%rax + movq %r15,56(%rdi) + adcq %rbx,%rbx + movq %rax,64(%rdi) + adcq %rbp,%rbp + movq %rbx,72(%rdi) + adcq %rdx,%rdx + movq %rbp,80(%rdi) + movq %rdx,88(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -8*7 + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _mulx_384 +.private_extern _mulx_384 + +.p2align 5 +_mulx_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + + + movq %rdx,%rbx + call __mulx_384 + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__mulx_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rbx),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + leaq -128(%rsi),%rsi + + mulxq %r14,%r9,%rcx + xorq %rbp,%rbp + + mulxq %r15,%r8,%rax + adcxq %rcx,%r8 + movq %r9,0(%rdi) + + mulxq %r10,%r9,%rcx + adcxq %rax,%r9 + + mulxq %r11,%r10,%rax + adcxq %rcx,%r10 + + mulxq %r12,%r11,%rcx + adcxq %rax,%r11 + + mulxq %r13,%r12,%r13 + movq 8(%rbx),%rdx + adcxq %rcx,%r12 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,8(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq 
%r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 16(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,16(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 24(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,24(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 32(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,32(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 40(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,40(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq %rax,%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + movq %r8,48(%rdi) + movq %r9,56(%rdi) + movq %r10,64(%rdi) + movq %r11,72(%rdi) + movq %r12,80(%rdi) + movq %r13,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _sqrx_384 +.private_extern _sqrx_384 + +.p2align 5 +_sqrx_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + call __sqrx_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__sqrx_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rdx + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%rcx + movq 32(%rsi),%rbx +.byte 102,72,15,110,199 + + + mulxq %r14,%r8,%rdi + movq 40(%rsi),%rbp + mulxq %r15,%r9,%rax + addq %rdi,%r9 + mulxq %rcx,%r10,%rdi + adcq %rax,%r10 + mulxq %rbx,%r11,%rax + adcq %rdi,%r11 + mulxq %rbp,%r12,%r13 + movq %r14,%rdx + adcq %rax,%r12 + adcq $0,%r13 + + + xorq %r14,%r14 + mulxq %r15,%rdi,%rax + adcxq %rdi,%r10 + adoxq %rax,%r11 + + mulxq %rcx,%rdi,%rax + adcxq %rdi,%r11 + adoxq %rax,%r12 + + mulxq 
%rbx,%rdi,%rax + adcxq %rdi,%r12 + adoxq %rax,%r13 + + mulxq %rbp,%rdi,%rax + movq %r15,%rdx + adcxq %rdi,%r13 + adoxq %r14,%rax + adcxq %rax,%r14 + + + xorq %r15,%r15 + mulxq %rcx,%rdi,%rax + adcxq %rdi,%r12 + adoxq %rax,%r13 + + mulxq %rbx,%rdi,%rax + adcxq %rdi,%r13 + adoxq %rax,%r14 + + mulxq %rbp,%rdi,%rax + movq %rcx,%rdx + adcxq %rdi,%r14 + adoxq %r15,%rax + adcxq %rax,%r15 + + + xorq %rcx,%rcx + mulxq %rbx,%rdi,%rax + adcxq %rdi,%r14 + adoxq %rax,%r15 + + mulxq %rbp,%rdi,%rax + movq %rbx,%rdx + adcxq %rdi,%r15 + adoxq %rcx,%rax + adcxq %rax,%rcx + + + mulxq %rbp,%rdi,%rbx + movq 0(%rsi),%rdx + addq %rdi,%rcx +.byte 102,72,15,126,199 + adcq $0,%rbx + + + xorq %rbp,%rbp + adcxq %r8,%r8 + adcxq %r9,%r9 + adcxq %r10,%r10 + adcxq %r11,%r11 + adcxq %r12,%r12 + + + mulxq %rdx,%rdx,%rax + movq %rdx,0(%rdi) + movq 8(%rsi),%rdx + adoxq %rax,%r8 + movq %r8,8(%rdi) + + mulxq %rdx,%r8,%rax + movq 16(%rsi),%rdx + adoxq %r8,%r9 + adoxq %rax,%r10 + movq %r9,16(%rdi) + movq %r10,24(%rdi) + + mulxq %rdx,%r8,%r9 + movq 24(%rsi),%rdx + adoxq %r8,%r11 + adoxq %r9,%r12 + adcxq %r13,%r13 + adcxq %r14,%r14 + movq %r11,32(%rdi) + movq %r12,40(%rdi) + + mulxq %rdx,%r8,%r9 + movq 32(%rsi),%rdx + adoxq %r8,%r13 + adoxq %r9,%r14 + adcxq %r15,%r15 + adcxq %rcx,%rcx + movq %r13,48(%rdi) + movq %r14,56(%rdi) + + mulxq %rdx,%r8,%r9 + movq 40(%rsi),%rdx + adoxq %r8,%r15 + adoxq %r9,%rcx + adcxq %rbx,%rbx + adcxq %rbp,%rbp + movq %r15,64(%rdi) + movq %rcx,72(%rdi) + + mulxq %rdx,%r8,%r9 + adoxq %r8,%rbx + adoxq %r9,%rbp + + movq %rbx,80(%rdi) + movq %rbp,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + + + +.globl _redcx_mont_384 +.private_extern _redcx_mont_384 + +.p2align 5 +_redcx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + + + + +.globl _fromx_mont_384 +.private_extern _fromx_mont_384 + +.p2align 5 +_fromx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulx_by_1_mont_384 + + + + + movq %r14,%rax + movq %r15,%rcx + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq 
%rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__mulx_by_1_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq %rcx,%rdx + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + imulq %r8,%rdx + + + xorq %r14,%r14 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r8 + adoxq %rbp,%r9 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r9 + adoxq %rbp,%r10 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r10 + adoxq %rbp,%r11 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r13 + adoxq %r14,%rbp + adcxq %rbp,%r14 + imulq %r9,%rdx + + + xorq %r15,%r15 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r9 + adoxq %rbp,%r10 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r10 + adoxq %rbp,%r11 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r14 + adoxq %r15,%rbp + adcxq %rbp,%r15 + imulq %r10,%rdx + + + xorq %r8,%r8 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r10 + adoxq %rbp,%r11 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r15 + adoxq %r8,%rbp + adcxq %rbp,%r8 + imulq %r11,%rdx + + + xorq %r9,%r9 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r15 + adoxq %rbp,%r8 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r8 + adoxq %r9,%rbp + adcxq %rbp,%r9 + imulq %r12,%rdx + + + xorq %r10,%r10 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r15 + adoxq %rbp,%r8 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r8 + adoxq %rbp,%r9 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r9 + adoxq %r10,%rbp + adcxq %rbp,%r10 + imulq %r13,%rdx + + + xorq %r11,%r11 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r15 + adoxq %rbp,%r8 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r8 + adoxq %rbp,%r9 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r9 + adoxq %rbp,%r10 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r10 + adoxq %r11,%rbp + adcxq %rbp,%r11 + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__redc_tail_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + addq 48(%rsi),%r14 + movq %r14,%rax + adcq 56(%rsi),%r15 + adcq 64(%rsi),%r8 
+ adcq 72(%rsi),%r9 + movq %r15,%rcx + adcq 80(%rsi),%r10 + adcq 88(%rsi),%r11 + sbbq %r12,%r12 + + + + + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sgn0x_pty_mont_384 +.private_extern _sgn0x_pty_mont_384 + +.p2align 5 +_sgn0x_pty_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + leaq 0(%rdi),%rsi + movq %rdx,%rcx + call __mulx_by_1_mont_384 + + xorq %rax,%rax + movq %r14,%r13 + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + notq %rax + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sgn0x_pty_mont_384x +.private_extern _sgn0x_pty_mont_384x + +.p2align 5 +_sgn0x_pty_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + leaq 48(%rdi),%rsi + movq %rdx,%rcx + call __mulx_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + leaq 0(%rdi),%rsi + xorq %rdi,%rdi + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rdi + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rdi + + movq %r14,0(%rsp) + notq %rdi + andq $1,%r13 + andq $2,%rdi + orq %r13,%rdi + + call __mulx_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + xorq %rax,%rax + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + movq 0(%rsp),%r12 + + notq %rax + + testq %r14,%r14 + cmovzq %rdi,%r13 + + testq %r12,%r12 + cmovnzq 
%rdi,%rax + + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _mulx_mont_384 +.private_extern _mulx_mont_384 + +.p2align 5 +_mulx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + leaq -24(%rsp),%rsp +.cfi_adjust_cfa_offset 8*3 + + + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 +.byte 102,72,15,110,199 + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + movq %r8,(%rsp) + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + movq 24(%rsp),%r15 +.cfi_restore %r15 + movq 32(%rsp),%r14 +.cfi_restore %r14 + movq 40(%rsp),%r13 +.cfi_restore %r13 + movq 48(%rsp),%r12 +.cfi_restore %r12 + movq 56(%rsp),%rbx +.cfi_restore %rbx + movq 64(%rsp),%rbp +.cfi_restore %rbp + leaq 72(%rsp),%rsp +.cfi_adjust_cfa_offset -8*9 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__mulx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + mulxq %r15,%r14,%r10 + mulxq %rax,%r15,%r11 + addq %r14,%r9 + mulxq %r12,%rax,%r12 + adcq %r15,%r10 + mulxq %rdi,%rdi,%r13 + adcq %rax,%r11 + mulxq %rbp,%rbp,%r14 + movq 8(%rbx),%rdx + adcq %rdi,%r12 + adcq %rbp,%r13 + adcq $0,%r14 + xorq %r15,%r15 + + movq %r8,16(%rsp) + imulq 8(%rsp),%r8 + + + xorq %rax,%rax + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r9 + adcxq %rbp,%r10 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r8,%rdx + adoxq %rdi,%r14 + adcxq %rbp,%r15 + adoxq %rax,%r15 + adoxq %rax,%rax + + + xorq %r8,%r8 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r9 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r10 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 16(%rbx),%rdx + adcxq %rdi,%r13 + adoxq %rbp,%r14 + adcxq %r8,%r14 + adoxq %r8,%r15 + adcxq %r8,%r15 + adoxq %r8,%rax + adcxq %r8,%rax + movq %r9,16(%rsp) + imulq 8(%rsp),%r9 + + + xorq %r8,%r8 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r9,%rdx + adoxq %rdi,%r15 + adcxq %rbp,%rax + adoxq %r8,%rax + adoxq %r8,%r8 + + + xorq %r9,%r9 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq 
%rbp,%r10 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 24(%rbx),%rdx + adcxq %rdi,%r14 + adoxq %rbp,%r15 + adcxq %r9,%r15 + adoxq %r9,%rax + adcxq %r9,%rax + adoxq %r9,%r8 + adcxq %r9,%r8 + movq %r10,16(%rsp) + imulq 8(%rsp),%r10 + + + xorq %r9,%r9 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r10,%rdx + adoxq %rdi,%rax + adcxq %rbp,%r8 + adoxq %r9,%r8 + adoxq %r9,%r9 + + + xorq %r10,%r10 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r11 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 32(%rbx),%rdx + adcxq %rdi,%r15 + adoxq %rbp,%rax + adcxq %r10,%rax + adoxq %r10,%r8 + adcxq %r10,%r8 + adoxq %r10,%r9 + adcxq %r10,%r9 + movq %r11,16(%rsp) + imulq 8(%rsp),%r11 + + + xorq %r10,%r10 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r11,%rdx + adoxq %rdi,%r8 + adcxq %rbp,%r9 + adoxq %r10,%r9 + adoxq %r10,%r10 + + + xorq %r11,%r11 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r12 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 40+128(%rcx),%rdi,%rbp + movq 40(%rbx),%rdx + adcxq %rdi,%rax + adoxq %rbp,%r8 + adcxq %r11,%r8 + adoxq %r11,%r9 + adcxq %r11,%r9 + adoxq %r11,%r10 + adcxq %r11,%r10 + movq %r12,16(%rsp) + imulq 8(%rsp),%r12 + + + xorq %r11,%r11 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r8 + adcxq %rbp,%r9 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r12,%rdx + adoxq %rdi,%r9 + adcxq %rbp,%r10 + adoxq %r11,%r10 + adoxq %r11,%r11 + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r13 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + + mulxq 40+128(%rcx),%rdi,%rbp + movq %r13,%rdx + adcxq %rdi,%r8 + adoxq %rbp,%r9 
+ adcxq %r12,%r9 + adoxq %r12,%r10 + adcxq %r12,%r10 + adoxq %r12,%r11 + adcxq %r12,%r11 + imulq 8(%rsp),%rdx +.byte 102,72,15,126,195 + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + movq %r15,%r13 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r8 + adoxq %rbp,%r9 + movq %rax,%rsi + + mulxq 40+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r10 + movq %r14,%rdx + adcxq %r12,%r10 + adoxq %r12,%r11 + leaq 128(%rcx),%rcx + movq %r8,%r12 + adcq $0,%r11 + + + + + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + movq %r9,%rdi + sbbq 16(%rcx),%rax + sbbq 24(%rcx),%r8 + sbbq 32(%rcx),%r9 + movq %r10,%rbp + sbbq 40(%rcx),%r10 + sbbq $0,%r11 + + cmovncq %r14,%rdx + cmovcq %r13,%r15 + cmovcq %rsi,%rax + cmovncq %r8,%r12 + movq %rdx,0(%rbx) + cmovncq %r9,%rdi + movq %r15,8(%rbx) + cmovncq %r10,%rbp + movq %rax,16(%rbx) + movq %r12,24(%rbx) + movq %rdi,32(%rbx) + movq %rbp,40(%rbx) + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _sqrx_mont_384 +.private_extern _sqrx_mont_384 + +.p2align 5 +_sqrx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + leaq -24(%rsp),%rsp +.cfi_adjust_cfa_offset 8*3 + + + movq %rcx,%r8 + leaq -128(%rdx),%rcx + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 +.byte 102,72,15,110,199 + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + + leaq (%rsi),%rbx + movq %r8,(%rsp) + leaq -128(%rsi),%rsi + + mulxq %rdx,%r8,%r9 + call __mulx_mont_384 + + movq 24(%rsp),%r15 +.cfi_restore %r15 + movq 32(%rsp),%r14 +.cfi_restore %r14 + movq 40(%rsp),%r13 +.cfi_restore %r13 + movq 48(%rsp),%r12 +.cfi_restore %r12 + movq 56(%rsp),%rbx +.cfi_restore %rbx + movq 64(%rsp),%rbp +.cfi_restore %rbp + leaq 72(%rsp),%rsp +.cfi_adjust_cfa_offset -8*9 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sqrx_n_mul_mont_384 +.private_extern _sqrx_n_mul_mont_384 + +.p2align 5 +_sqrx_n_mul_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + leaq -24(%rsp),%rsp +.cfi_adjust_cfa_offset 8*3 + + + movq %rdx,%r10 + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq %rsi,%rbx + movq 24(%rsi),%r12 +.byte 102,72,15,110,199 + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + + movq %r8,(%rsp) + movq %r9,16(%rsp) + movq 0(%r9),%xmm2 + +L$oop_sqrx_384: + movd %r10d,%xmm1 + leaq -128(%rbx),%rsi + leaq -128(%rcx),%rcx + + mulxq %rdx,%r8,%r9 + call __mulx_mont_384 + + movd %xmm1,%r10d + decl %r10d + jnz L$oop_sqrx_384 + + movq %rdx,%r14 +.byte 102,72,15,126,210 + leaq -128(%rbx),%rsi + movq 16(%rsp),%rbx + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + movq 24(%rsp),%r15 +.cfi_restore %r15 + movq 
32(%rsp),%r14 +.cfi_restore %r14 + movq 40(%rsp),%r13 +.cfi_restore %r13 + movq 48(%rsp),%r12 +.cfi_restore %r12 + movq 56(%rsp),%rbx +.cfi_restore %rbx + movq 64(%rsp),%rbp +.cfi_restore %rbp + leaq 72(%rsp),%rsp +.cfi_adjust_cfa_offset -8*9 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sqrx_n_mul_mont_383 +.private_extern _sqrx_n_mul_mont_383 + +.p2align 5 +_sqrx_n_mul_mont_383: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + leaq -24(%rsp),%rsp +.cfi_adjust_cfa_offset 8*3 + + + movq %rdx,%r10 + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq %rsi,%rbx + movq 24(%rsi),%r12 +.byte 102,72,15,110,199 + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + + movq %r8,(%rsp) + movq %r9,16(%rsp) + movq 0(%r9),%xmm2 + leaq -128(%rcx),%rcx + +L$oop_sqrx_383: + movd %r10d,%xmm1 + leaq -128(%rbx),%rsi + + mulxq %rdx,%r8,%r9 + call __mulx_mont_383_nonred + + movd %xmm1,%r10d + decl %r10d + jnz L$oop_sqrx_383 + + movq %rdx,%r14 +.byte 102,72,15,126,210 + leaq -128(%rbx),%rsi + movq 16(%rsp),%rbx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + movq 24(%rsp),%r15 +.cfi_restore %r15 + movq 32(%rsp),%r14 +.cfi_restore %r14 + movq 40(%rsp),%r13 +.cfi_restore %r13 + movq 48(%rsp),%r12 +.cfi_restore %r12 + movq 56(%rsp),%rbx +.cfi_restore %rbx + movq 64(%rsp),%rbp +.cfi_restore %rbp + leaq 72(%rsp),%rsp +.cfi_adjust_cfa_offset -8*9 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__mulx_mont_383_nonred: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + mulxq %r15,%r14,%r10 + mulxq %rax,%r15,%r11 + addq %r14,%r9 + mulxq %r12,%rax,%r12 + adcq %r15,%r10 + mulxq %rdi,%rdi,%r13 + adcq %rax,%r11 + mulxq %rbp,%rbp,%r14 + movq 8(%rbx),%rdx + adcq %rdi,%r12 + adcq %rbp,%r13 + adcq $0,%r14 + movq %r8,%rax + imulq 8(%rsp),%r8 + + + xorq %r15,%r15 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r9 + adcxq %rbp,%r10 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r8,%rdx + adoxq %rdi,%r14 + adcxq %r15,%rbp + adoxq %rbp,%r15 + + + xorq %r8,%r8 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r9 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r10 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 16(%rbx),%rdx + adcxq %rdi,%r13 + adoxq %rbp,%r14 + adcxq %rax,%r14 + adoxq %rax,%r15 + adcxq %rax,%r15 + movq %r9,%r8 + imulq 8(%rsp),%r9 + + + xorq %rax,%rax + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r9,%rdx + adoxq %rdi,%r15 + adcxq %rax,%rbp 
+ adoxq %rbp,%rax + + + xorq %r9,%r9 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r8 + adoxq %rbp,%r10 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 24(%rbx),%rdx + adcxq %rdi,%r14 + adoxq %rbp,%r15 + adcxq %r8,%r15 + adoxq %r8,%rax + adcxq %r8,%rax + movq %r10,%r9 + imulq 8(%rsp),%r10 + + + xorq %r8,%r8 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r10,%rdx + adoxq %rdi,%rax + adcxq %r8,%rbp + adoxq %rbp,%r8 + + + xorq %r10,%r10 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r11 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 32(%rbx),%rdx + adcxq %rdi,%r15 + adoxq %rbp,%rax + adcxq %r9,%rax + adoxq %r9,%r8 + adcxq %r9,%r8 + movq %r11,%r10 + imulq 8(%rsp),%r11 + + + xorq %r9,%r9 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r11,%rdx + adoxq %rdi,%r8 + adcxq %r9,%rbp + adoxq %rbp,%r9 + + + xorq %r11,%r11 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r12 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 40+128(%rcx),%rdi,%rbp + movq 40(%rbx),%rdx + adcxq %rdi,%rax + adoxq %rbp,%r8 + adcxq %r10,%r8 + adoxq %r10,%r9 + adcxq %r10,%r9 + movq %r12,%r11 + imulq 8(%rsp),%r12 + + + xorq %r10,%r10 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r8 + adcxq %rbp,%r9 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r12,%rdx + adoxq %rdi,%r9 + adcxq %r10,%rbp + adoxq %rbp,%r10 + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r13 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + + mulxq 40+128(%rcx),%rdi,%rbp + movq %r13,%rdx + adcxq %rdi,%r8 + adoxq %rbp,%r9 + adcxq %r11,%r9 + adoxq %r11,%r10 + adcxq %r11,%r10 + imulq 8(%rsp),%rdx +.byte 
102,72,15,126,195 + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r8 + adoxq %rbp,%r9 + + mulxq 40+128(%rcx),%rdi,%rbp + movq %r14,%rdx + adcxq %rdi,%r9 + adoxq %rbp,%r10 + adcq $0,%r10 + movq %r8,%r12 + + movq %r14,0(%rbx) + movq %r15,8(%rbx) + movq %rax,16(%rbx) + movq %r9,%rdi + movq %r8,24(%rbx) + movq %r9,32(%rbx) + movq %r10,40(%rbx) + movq %r10,%rbp + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _sqrx_mont_382x +.private_extern _sqrx_mont_382x + +.p2align 5 +_sqrx_mont_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + movq %rsi,16(%rsp) + movq %rdi,%xmm0 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %r8,%r14 + addq 48(%rsi),%r8 + movq %r9,%r15 + adcq 56(%rsi),%r9 + movq %r10,%rax + adcq 64(%rsi),%r10 + movq %r11,%rdx + adcq 72(%rsi),%r11 + movq %r12,%rbx + adcq 80(%rsi),%r12 + movq %r13,%rbp + adcq 88(%rsi),%r13 + + subq 48(%rsi),%r14 + sbbq 56(%rsi),%r15 + sbbq 64(%rsi),%rax + sbbq 72(%rsi),%rdx + sbbq 80(%rsi),%rbx + sbbq 88(%rsi),%rbp + sbbq %rdi,%rdi + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + movq %r14,32+48(%rsp) + movq %r15,32+56(%rsp) + movq %rax,32+64(%rsp) + movq %rdx,32+72(%rsp) + movq %rbx,32+80(%rsp) + movq %rbp,32+88(%rsp) + movq %rdi,32+96(%rsp) + + + + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_383_nonred + addq %rdx,%rdx + adcq %r15,%r15 + adcq %rax,%rax + adcq %r12,%r12 + adcq %rdi,%rdi + adcq %rbp,%rbp + + movq %rdx,48(%rbx) + movq %r15,56(%rbx) + movq %rax,64(%rbx) + movq %r12,72(%rbx) + movq %rdi,80(%rbx) + movq %rbp,88(%rbx) + + leaq 32-128(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rdx + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%rax + movq 32+24(%rsp),%r12 + movq 32+32(%rsp),%rdi + movq 32+40(%rsp),%rbp + + + + mulxq %r14,%r8,%r9 + call __mulx_mont_383_nonred + movq 32+96(%rsp),%r14 + leaq 128(%rcx),%rcx + movq 32+0(%rsp),%r8 + andq %r14,%r8 + movq 32+8(%rsp),%r9 + andq %r14,%r9 + movq 32+16(%rsp),%r10 + andq %r14,%r10 + movq 32+24(%rsp),%r11 + andq %r14,%r11 + movq 32+32(%rsp),%r13 + andq %r14,%r13 + andq 32+40(%rsp),%r14 + + subq %r8,%rdx + movq 0(%rcx),%r8 + sbbq %r9,%r15 + movq 8(%rcx),%r9 + sbbq %r10,%rax + movq 16(%rcx),%r10 + sbbq %r11,%r12 + movq 24(%rcx),%r11 + sbbq %r13,%rdi + movq 32(%rcx),%r13 + sbbq %r14,%rbp + sbbq %r14,%r14 + + andq %r14,%r8 + andq %r14,%r9 + andq %r14,%r10 + andq %r14,%r11 + andq %r14,%r13 + andq 40(%rcx),%r14 + + addq %r8,%rdx + adcq %r9,%r15 + adcq %r10,%rax + adcq 
%r11,%r12 + adcq %r13,%rdi + adcq %r14,%rbp + + movq %rdx,0(%rbx) + movq %r15,8(%rbx) + movq %rax,16(%rbx) + movq %r12,24(%rbx) + movq %rdi,32(%rbx) + movq %rbp,40(%rbx) + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/build/mach-o/sha256-x86_64.s b/build/mach-o/sha256-x86_64.s new file mode 100644 index 00000000..bbfa54fb --- /dev/null +++ b/build/mach-o/sha256-x86_64.s @@ -0,0 +1,1438 @@ +.text + +.p2align 6 + +K256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff +.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 +.globl _sha256_block_data_order_shaext +.private_extern _sha256_block_data_order_shaext + +.p2align 6 +_sha256_block_data_order_shaext: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + leaq K256+128(%rip),%rcx + movdqu (%rdi),%xmm1 + movdqu 16(%rdi),%xmm2 + movdqa 256-128(%rcx),%xmm7 + + pshufd $0x1b,%xmm1,%xmm0 + pshufd $0xb1,%xmm1,%xmm1 + pshufd $0x1b,%xmm2,%xmm2 + movdqa %xmm7,%xmm8 +.byte 102,15,58,15,202,8 + punpcklqdq %xmm0,%xmm2 + jmp L$oop_shaext + +.p2align 4 +L$oop_shaext: + movdqu (%rsi),%xmm3 + movdqu 16(%rsi),%xmm4 + movdqu 32(%rsi),%xmm5 +.byte 102,15,56,0,223 + movdqu 48(%rsi),%xmm6 + + movdqa 0-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 102,15,56,0,231 + movdqa %xmm2,%xmm10 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + nop + movdqa %xmm1,%xmm9 +.byte 15,56,203,202 + + movdqa 16-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 102,15,56,0,239 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + leaq 64(%rsi),%rsi +.byte 15,56,204,220 +.byte 15,56,203,202 + + movdqa 32-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 102,15,56,0,247 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + + movdqa 48-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 64-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 
15,56,204,243 +.byte 15,56,203,202 + movdqa 80-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 96-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 112-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 128-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 144-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 160-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 176-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 192-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 208-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 +.byte 15,56,203,202 + paddd %xmm7,%xmm6 + + movdqa 224-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 +.byte 15,56,205,245 + movdqa %xmm8,%xmm7 +.byte 15,56,203,202 + + movdqa 240-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 + nop +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + decq %rdx + nop +.byte 15,56,203,202 + + paddd %xmm10,%xmm2 + paddd %xmm9,%xmm1 + jnz L$oop_shaext + + pshufd $0xb1,%xmm2,%xmm2 + pshufd $0x1b,%xmm1,%xmm7 + pshufd $0xb1,%xmm1,%xmm1 + punpckhqdq %xmm2,%xmm1 +.byte 102,15,58,15,215,8 + + movdqu %xmm1,(%rdi) + movdqu %xmm2,16(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _sha256_block_data_order +.private_extern _sha256_block_data_order + +.p2align 6 +_sha256_block_data_order: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + shlq $4,%rdx + subq $40,%rsp +.cfi_adjust_cfa_offset 40 + leaq (%rsi,%rdx,4),%rdx + movq %rdi,0(%rsp) + + movq %rdx,16(%rsp) + movq %rsp,%rbp +.cfi_def_cfa_register %rbp + + + leaq -64(%rsp),%rsp + movl 0(%rdi),%eax + andq $-64,%rsp + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d 
+ movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + + + jmp L$loop_ssse3 +.p2align 4 +L$loop_ssse3: + movdqa K256+256(%rip),%xmm7 + movq %rsi,8(%rbp) + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 +.byte 102,15,56,0,199 + movdqu 48(%rsi),%xmm3 + leaq K256(%rip),%rsi +.byte 102,15,56,0,207 + movdqa 0(%rsi),%xmm4 + movdqa 16(%rsi),%xmm5 +.byte 102,15,56,0,215 + paddd %xmm0,%xmm4 + movdqa 32(%rsi),%xmm6 +.byte 102,15,56,0,223 + movdqa 48(%rsi),%xmm7 + paddd %xmm1,%xmm5 + paddd %xmm2,%xmm6 + paddd %xmm3,%xmm7 + movdqa %xmm4,0(%rsp) + movl %eax,%r14d + movdqa %xmm5,16(%rsp) + movl %ebx,%edi + movdqa %xmm6,32(%rsp) + xorl %ecx,%edi + movdqa %xmm7,48(%rsp) + movl %r8d,%r13d + jmp L$ssse3_00_47 + +.p2align 4 +L$ssse3_00_47: + subq $-64,%rsi + rorl $14,%r13d + movdqa %xmm1,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm3,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,224,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,250,4 + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm3,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 4(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm0 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm0 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm0,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 0(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm0,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,0(%rsp) + rorl $14,%r13d + movdqa %xmm2,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm0,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + 
xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,225,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,251,4 + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm0,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 20(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm1 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm1 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm1,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 16(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm1,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,16(%rsp) + rorl $14,%r13d + movdqa %xmm3,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm1,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,226,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,248,4 + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm1,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 36(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + 
rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm2 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm2 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm2,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 32(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm2,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,32(%rsp) + rorl $14,%r13d + movdqa %xmm0,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm2,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,227,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,249,4 + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm2,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 52(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm3 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm3 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm3,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa 
%xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 48(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm3,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,48(%rsp) + cmpb $0,67(%rsi) + jne L$ssse3_00_47 + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl 
%ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d 
+ andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + movq 0(%rbp),%rdi + movl %r14d,%eax + movq 8(%rbp),%rsi + + addl 0(%rdi),%eax + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + leaq 64(%rsi),%rsi + cmpq 16(%rbp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb L$loop_ssse3 + + xorps %xmm0,%xmm0 + leaq 40+48(%rbp),%r11 +.cfi_def_cfa %r11,8 + movaps %xmm0,0(%rsp) + movaps %xmm0,16(%rsp) + movaps %xmm0,32(%rsp) + movaps %xmm0,48(%rsp) + movq 40(%rbp),%r15 +.cfi_restore %r15 + movq -40(%r11),%r14 +.cfi_restore %r14 + movq -32(%r11),%r13 +.cfi_restore %r13 + movq -24(%r11),%r12 +.cfi_restore %r12 + movq -16(%r11),%rbx +.cfi_restore %rbx + movq -8(%r11),%rbp +.cfi_restore %rbp + + leaq (%r11),%rsp + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _sha256_emit +.private_extern _sha256_emit + +.p2align 4 +_sha256_emit: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + bswapq %r8 + movq 24(%rsi),%r11 + bswapq %r9 + movl %r8d,4(%rdi) + bswapq %r10 + movl %r9d,12(%rdi) + bswapq %r11 + movl %r10d,20(%rdi) + shrq $32,%r8 + movl %r11d,28(%rdi) + shrq $32,%r9 + movl %r8d,0(%rdi) + shrq $32,%r10 + movl %r9d,8(%rdi) + shrq $32,%r11 + movl %r10d,16(%rdi) + movl %r11d,24(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sha256_bcopy +.private_extern _sha256_bcopy + +.p2align 4 +_sha256_bcopy: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + subq %rsi,%rdi +L$oop_bcopy: + movzbl (%rsi),%eax + leaq 1(%rsi),%rsi + movb %al,-1(%rdi,%rsi,1) + decq %rdx + jnz L$oop_bcopy + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sha256_hcopy +.private_extern _sha256_hcopy + +.p2align 4 +_sha256_hcopy: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/build/refresh.sh b/build/refresh.sh new file mode 100755 index 00000000..6c05fd23 --- /dev/null +++ b/build/refresh.sh @@ -0,0 +1,13 @@ +#!/bin/sh + +HERE=`dirname $0` +cd "${HERE}" +for pl in ../src/asm/*-x86_64.pl; do + s=`basename $pl .pl`.asm + (set -x; ${PERL:-perl} $pl masm > win64/$s) + s=`basename $pl .pl`.s + (set -x; ${PERL:-perl} $pl elf > elf/$s) + (set -x; ${PERL:-perl} $pl 
mingw64 > coff/$s) + (set -x; ${PERL:-perl} $pl macosx > mach-o/$s) +done + diff --git a/build/win64/add_mod_256-x86_64.asm b/build/win64/add_mod_256-x86_64.asm new file mode 100644 index 00000000..35c2ed2f --- /dev/null +++ b/build/win64/add_mod_256-x86_64.asm @@ -0,0 +1,660 @@ +OPTION DOTNAME +.text$ SEGMENT ALIGN(256) 'CODE' + +PUBLIC add_mod_256 + + +ALIGN 32 +add_mod_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_add_mod_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + sub rsp,8 + +$L$SEH_body_add_mod_256:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + +$L$oaded_a_add_mod_256:: + add r8,QWORD PTR[rdx] + adc r9,QWORD PTR[8+rdx] + mov rax,r8 + adc r10,QWORD PTR[16+rdx] + mov rsi,r9 + adc r11,QWORD PTR[24+rdx] + sbb rdx,rdx + + mov rbx,r10 + sub r8,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rcx] + sbb r10,QWORD PTR[16+rcx] + mov rbp,r11 + sbb r11,QWORD PTR[24+rcx] + sbb rdx,0 + + cmovc r8,rax + cmovc r9,rsi + mov QWORD PTR[rdi],r8 + cmovc r10,rbx + mov QWORD PTR[8+rdi],r9 + cmovc r11,rbp + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + + mov rbx,QWORD PTR[8+rsp] + + mov rbp,QWORD PTR[16+rsp] + + lea rsp,QWORD PTR[24+rsp] + +$L$SEH_epilogue_add_mod_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_add_mod_256:: +add_mod_256 ENDP + + +PUBLIC mul_by_3_mod_256 + + +ALIGN 32 +mul_by_3_mod_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_by_3_mod_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + +$L$SEH_body_mul_by_3_mod_256:: + + + mov rcx,rdx + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov rdx,rsi + mov r11,QWORD PTR[24+rsi] + + call __lshift_mod_256 + mov r12,QWORD PTR[rsp] + + jmp $L$oaded_a_add_mod_256 + + mov rbx,QWORD PTR[8+rsp] + + mov rbp,QWORD PTR[16+rsp] + + lea rsp,QWORD PTR[24+rsp] + +$L$SEH_epilogue_mul_by_3_mod_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_by_3_mod_256:: +mul_by_3_mod_256 ENDP + + +ALIGN 32 +__lshift_mod_256 PROC PRIVATE + DB 243,15,30,250 + add r8,r8 + adc r9,r9 + mov rax,r8 + adc r10,r10 + mov rsi,r9 + adc r11,r11 + sbb r12,r12 + + mov rbx,r10 + sub r8,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rcx] + sbb r10,QWORD PTR[16+rcx] + mov rbp,r11 + sbb r11,QWORD PTR[24+rcx] + sbb r12,0 + + cmovc r8,rax + cmovc r9,rsi + cmovc r10,rbx + cmovc r11,rbp + + DB 0F3h,0C3h ;repret +__lshift_mod_256 ENDP + + +PUBLIC lshift_mod_256 + + +ALIGN 32 +lshift_mod_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_lshift_mod_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + +$L$SEH_body_lshift_mod_256:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + +$L$oop_lshift_mod_256:: + call __lshift_mod_256 + dec edx + jnz $L$oop_lshift_mod_256 + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + + mov r12,QWORD PTR[rsp] + + mov rbx,QWORD PTR[8+rsp] + + mov rbp,QWORD PTR[16+rsp] + + lea rsp,QWORD PTR[24+rsp] + +$L$SEH_epilogue_lshift_mod_256:: + 
mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_lshift_mod_256:: +lshift_mod_256 ENDP + + +PUBLIC rshift_mod_256 + + +ALIGN 32 +rshift_mod_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_rshift_mod_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + sub rsp,8 + +$L$SEH_body_rshift_mod_256:: + + + mov rbp,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + +$L$oop_rshift_mod_256:: + mov r8,rbp + and rbp,1 + mov rax,QWORD PTR[rcx] + neg rbp + mov rsi,QWORD PTR[8+rcx] + mov rbx,QWORD PTR[16+rcx] + + and rax,rbp + and rsi,rbp + and rbx,rbp + and rbp,QWORD PTR[24+rcx] + + add r8,rax + adc r9,rsi + adc r10,rbx + adc r11,rbp + sbb rax,rax + + shr r8,1 + mov rbp,r9 + shr r9,1 + mov rbx,r10 + shr r10,1 + mov rsi,r11 + shr r11,1 + + shl rbp,63 + shl rbx,63 + or rbp,r8 + shl rsi,63 + or r9,rbx + shl rax,63 + or r10,rsi + or r11,rax + + dec edx + jnz $L$oop_rshift_mod_256 + + mov QWORD PTR[rdi],rbp + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + + mov rbx,QWORD PTR[8+rsp] + + mov rbp,QWORD PTR[16+rsp] + + lea rsp,QWORD PTR[24+rsp] + +$L$SEH_epilogue_rshift_mod_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_rshift_mod_256:: +rshift_mod_256 ENDP + + +PUBLIC cneg_mod_256 + + +ALIGN 32 +cneg_mod_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_cneg_mod_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + +$L$SEH_body_cneg_mod_256:: + + + mov r12,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r8,r12 + mov r11,QWORD PTR[24+rsi] + or r12,r9 + or r12,r10 + or r12,r11 + mov rbp,-1 + + mov rax,QWORD PTR[rcx] + cmovnz r12,rbp + mov rsi,QWORD PTR[8+rcx] + mov rbx,QWORD PTR[16+rcx] + and rax,r12 + mov rbp,QWORD PTR[24+rcx] + and rsi,r12 + and rbx,r12 + and rbp,r12 + + sub rax,r8 + sbb rsi,r9 + sbb rbx,r10 + sbb rbp,r11 + + or rdx,rdx + + cmovz rax,r8 + cmovz rsi,r9 + mov QWORD PTR[rdi],rax + cmovz rbx,r10 + mov QWORD PTR[8+rdi],rsi + cmovz rbp,r11 + mov QWORD PTR[16+rdi],rbx + mov QWORD PTR[24+rdi],rbp + + mov r12,QWORD PTR[rsp] + + mov rbx,QWORD PTR[8+rsp] + + mov rbp,QWORD PTR[16+rsp] + + lea rsp,QWORD PTR[24+rsp] + +$L$SEH_epilogue_cneg_mod_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_cneg_mod_256:: +cneg_mod_256 ENDP + + +PUBLIC sub_mod_256 + + +ALIGN 32 +sub_mod_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sub_mod_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + sub rsp,8 + +$L$SEH_body_sub_mod_256:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + + sub r8,QWORD PTR[rdx] + mov rax,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rdx] + mov rsi,QWORD PTR[8+rcx] + sbb r10,QWORD PTR[16+rdx] + mov rbx,QWORD PTR[16+rcx] + sbb r11,QWORD PTR[24+rdx] + mov rbp,QWORD PTR[24+rcx] + sbb rdx,rdx + + and rax,rdx + and rsi,rdx + and rbx,rdx + and rbp,rdx + + add r8,rax + adc r9,rsi + mov QWORD PTR[rdi],r8 + adc r10,rbx + mov QWORD PTR[8+rdi],r9 + adc r11,rbp + mov QWORD 
PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + + mov rbx,QWORD PTR[8+rsp] + + mov rbp,QWORD PTR[16+rsp] + + lea rsp,QWORD PTR[24+rsp] + +$L$SEH_epilogue_sub_mod_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sub_mod_256:: +sub_mod_256 ENDP +.text$ ENDS +.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_add_mod_256 + DD imagerel $L$SEH_body_add_mod_256 + DD imagerel $L$SEH_info_add_mod_256_prologue + + DD imagerel $L$SEH_body_add_mod_256 + DD imagerel $L$SEH_epilogue_add_mod_256 + DD imagerel $L$SEH_info_add_mod_256_body + + DD imagerel $L$SEH_epilogue_add_mod_256 + DD imagerel $L$SEH_end_add_mod_256 + DD imagerel $L$SEH_info_add_mod_256_epilogue + + DD imagerel $L$SEH_begin_mul_by_3_mod_256 + DD imagerel $L$SEH_body_mul_by_3_mod_256 + DD imagerel $L$SEH_info_mul_by_3_mod_256_prologue + + DD imagerel $L$SEH_body_mul_by_3_mod_256 + DD imagerel $L$SEH_epilogue_mul_by_3_mod_256 + DD imagerel $L$SEH_info_mul_by_3_mod_256_body + + DD imagerel $L$SEH_epilogue_mul_by_3_mod_256 + DD imagerel $L$SEH_end_mul_by_3_mod_256 + DD imagerel $L$SEH_info_mul_by_3_mod_256_epilogue + + DD imagerel $L$SEH_begin_lshift_mod_256 + DD imagerel $L$SEH_body_lshift_mod_256 + DD imagerel $L$SEH_info_lshift_mod_256_prologue + + DD imagerel $L$SEH_body_lshift_mod_256 + DD imagerel $L$SEH_epilogue_lshift_mod_256 + DD imagerel $L$SEH_info_lshift_mod_256_body + + DD imagerel $L$SEH_epilogue_lshift_mod_256 + DD imagerel $L$SEH_end_lshift_mod_256 + DD imagerel $L$SEH_info_lshift_mod_256_epilogue + + DD imagerel $L$SEH_begin_rshift_mod_256 + DD imagerel $L$SEH_body_rshift_mod_256 + DD imagerel $L$SEH_info_rshift_mod_256_prologue + + DD imagerel $L$SEH_body_rshift_mod_256 + DD imagerel $L$SEH_epilogue_rshift_mod_256 + DD imagerel $L$SEH_info_rshift_mod_256_body + + DD imagerel $L$SEH_epilogue_rshift_mod_256 + DD imagerel $L$SEH_end_rshift_mod_256 + DD imagerel $L$SEH_info_rshift_mod_256_epilogue + + DD imagerel $L$SEH_begin_cneg_mod_256 + DD imagerel $L$SEH_body_cneg_mod_256 + DD imagerel $L$SEH_info_cneg_mod_256_prologue + + DD imagerel $L$SEH_body_cneg_mod_256 + DD imagerel $L$SEH_epilogue_cneg_mod_256 + DD imagerel $L$SEH_info_cneg_mod_256_body + + DD imagerel $L$SEH_epilogue_cneg_mod_256 + DD imagerel $L$SEH_end_cneg_mod_256 + DD imagerel $L$SEH_info_cneg_mod_256_epilogue + + DD imagerel $L$SEH_begin_sub_mod_256 + DD imagerel $L$SEH_body_sub_mod_256 + DD imagerel $L$SEH_info_sub_mod_256_prologue + + DD imagerel $L$SEH_body_sub_mod_256 + DD imagerel $L$SEH_epilogue_sub_mod_256 + DD imagerel $L$SEH_info_sub_mod_256_body + + DD imagerel $L$SEH_epilogue_sub_mod_256 + DD imagerel $L$SEH_end_sub_mod_256 + DD imagerel $L$SEH_info_sub_mod_256_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_add_mod_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_add_mod_256_body:: +DB 1,0,9,0 +DB 000h,034h,001h,000h +DB 000h,054h,002h,000h +DB 000h,074h,004h,000h +DB 000h,064h,005h,000h +DB 000h,022h +DB 000h,000h +$L$SEH_info_add_mod_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mul_by_3_mod_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mul_by_3_mod_256_body:: +DB 1,0,11,0 +DB 000h,0c4h,000h,000h +DB 000h,034h,001h,000h +DB 000h,054h,002h,000h +DB 000h,074h,004h,000h +DB 000h,064h,005h,000h +DB 000h,022h +DB 000h,000h,000h,000h,000h,000h 
+$L$SEH_info_mul_by_3_mod_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_lshift_mod_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_lshift_mod_256_body:: +DB 1,0,11,0 +DB 000h,0c4h,000h,000h +DB 000h,034h,001h,000h +DB 000h,054h,002h,000h +DB 000h,074h,004h,000h +DB 000h,064h,005h,000h +DB 000h,022h +DB 000h,000h,000h,000h,000h,000h +$L$SEH_info_lshift_mod_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_rshift_mod_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_rshift_mod_256_body:: +DB 1,0,9,0 +DB 000h,034h,001h,000h +DB 000h,054h,002h,000h +DB 000h,074h,004h,000h +DB 000h,064h,005h,000h +DB 000h,022h +DB 000h,000h +$L$SEH_info_rshift_mod_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_cneg_mod_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_cneg_mod_256_body:: +DB 1,0,11,0 +DB 000h,0c4h,000h,000h +DB 000h,034h,001h,000h +DB 000h,054h,002h,000h +DB 000h,074h,004h,000h +DB 000h,064h,005h,000h +DB 000h,022h +DB 000h,000h,000h,000h,000h,000h +$L$SEH_info_cneg_mod_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sub_mod_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sub_mod_256_body:: +DB 1,0,9,0 +DB 000h,034h,001h,000h +DB 000h,054h,002h,000h +DB 000h,074h,004h,000h +DB 000h,064h,005h,000h +DB 000h,022h +DB 000h,000h +$L$SEH_info_sub_mod_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + + +.xdata ENDS +END diff --git a/build/win64/add_mod_384-x86_64.asm b/build/win64/add_mod_384-x86_64.asm new file mode 100644 index 00000000..389a15ce --- /dev/null +++ b/build/win64/add_mod_384-x86_64.asm @@ -0,0 +1,2191 @@ +OPTION DOTNAME +.text$ SEGMENT ALIGN(256) 'CODE' +EXTERN BLS12_381_P:NEAR + + +PUBLIC add_mod_384 + + +ALIGN 32 +add_mod_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_add_mod_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_add_mod_384:: + + + call __add_mod_384 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_add_mod_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_add_mod_384:: +add_mod_384 ENDP + + +ALIGN 32 +__add_mod_384 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + +__add_mod_384_a_is_loaded:: + add r8,QWORD PTR[rdx] + adc r9,QWORD PTR[8+rdx] + adc r10,QWORD PTR[16+rdx] + mov r14,r8 + adc r11,QWORD PTR[24+rdx] + mov r15,r9 + adc r12,QWORD PTR[32+rdx] + mov rax,r10 + adc r13,QWORD PTR[40+rdx] + mov rbx,r11 + sbb rdx,rdx + + sub r8,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rcx] + mov rbp,r12 + sbb r10,QWORD PTR[16+rcx] + sbb r11,QWORD PTR[24+rcx] + sbb r12,QWORD PTR[32+rcx] + mov rsi,r13 + sbb r13,QWORD 
PTR[40+rcx] + sbb rdx,0 + + cmovc r8,r14 + cmovc r9,r15 + cmovc r10,rax + mov QWORD PTR[rdi],r8 + cmovc r11,rbx + mov QWORD PTR[8+rdi],r9 + cmovc r12,rbp + mov QWORD PTR[16+rdi],r10 + cmovc r13,rsi + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + DB 0F3h,0C3h ;repret +__add_mod_384 ENDP + +PUBLIC add_mod_384x + + +ALIGN 32 +add_mod_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_add_mod_384x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,24 + +$L$SEH_body_add_mod_384x:: + + + mov QWORD PTR[rsp],rsi + mov QWORD PTR[8+rsp],rdx + lea rsi,QWORD PTR[48+rsi] + lea rdx,QWORD PTR[48+rdx] + lea rdi,QWORD PTR[48+rdi] + call __add_mod_384 + + mov rsi,QWORD PTR[rsp] + mov rdx,QWORD PTR[8+rsp] + lea rdi,QWORD PTR[((-48))+rdi] + call __add_mod_384 + + mov r15,QWORD PTR[((24+0))+rsp] + + mov r14,QWORD PTR[((24+8))+rsp] + + mov r13,QWORD PTR[((24+16))+rsp] + + mov r12,QWORD PTR[((24+24))+rsp] + + mov rbx,QWORD PTR[((24+32))+rsp] + + mov rbp,QWORD PTR[((24+40))+rsp] + + lea rsp,QWORD PTR[((24+48))+rsp] + +$L$SEH_epilogue_add_mod_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_add_mod_384x:: +add_mod_384x ENDP + + +PUBLIC lshift_mod_384 + + +ALIGN 32 +lshift_mod_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_lshift_mod_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rdi + +$L$SEH_body_lshift_mod_384:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + +$L$oop_lshift_mod_384:: + add r8,r8 + adc r9,r9 + adc r10,r10 + mov r14,r8 + adc r11,r11 + mov r15,r9 + adc r12,r12 + mov rax,r10 + adc r13,r13 + mov rbx,r11 + sbb rdi,rdi + + sub r8,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rcx] + mov rbp,r12 + sbb r10,QWORD PTR[16+rcx] + sbb r11,QWORD PTR[24+rcx] + sbb r12,QWORD PTR[32+rcx] + mov rsi,r13 + sbb r13,QWORD PTR[40+rcx] + sbb rdi,0 + + mov rdi,QWORD PTR[rsp] + cmovc r8,r14 + cmovc r9,r15 + cmovc r10,rax + cmovc r11,rbx + cmovc r12,rbp + cmovc r13,rsi + + dec edx + jnz $L$oop_lshift_mod_384 + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_lshift_mod_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_lshift_mod_384:: +lshift_mod_384 ENDP + + +ALIGN 32 +__lshift_mod_384 PROC PRIVATE + DB 243,15,30,250 + add r8,r8 + adc r9,r9 + adc r10,r10 + mov r14,r8 + adc r11,r11 + mov r15,r9 + adc r12,r12 + mov rax,r10 + adc r13,r13 + mov rbx,r11 + sbb rdx,rdx + + sub r8,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rcx] + mov rbp,r12 + sbb r10,QWORD PTR[16+rcx] + sbb r11,QWORD PTR[24+rcx] + sbb r12,QWORD PTR[32+rcx] + mov rsi,r13 + sbb r13,QWORD PTR[40+rcx] + sbb rdx,0 + + cmovc r8,r14 + cmovc r9,r15 + cmovc r10,rax + cmovc r11,rbx + cmovc r12,rbp + 
cmovc r13,rsi + + DB 0F3h,0C3h ;repret +__lshift_mod_384 ENDP + + +PUBLIC mul_by_3_mod_384 + + +ALIGN 32 +mul_by_3_mod_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_by_3_mod_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rsi + +$L$SEH_body_mul_by_3_mod_384:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + mov rcx,rdx + + call __lshift_mod_384 + + mov rdx,QWORD PTR[rsp] + call __add_mod_384_a_is_loaded + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_mul_by_3_mod_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_by_3_mod_384:: +mul_by_3_mod_384 ENDP + +PUBLIC mul_by_8_mod_384 + + +ALIGN 32 +mul_by_8_mod_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_by_8_mod_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_mul_by_8_mod_384:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + mov rcx,rdx + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_mul_by_8_mod_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_by_8_mod_384:: +mul_by_8_mod_384 ENDP + +PUBLIC mul_by_b_onE1 + + +ALIGN 32 +mul_by_b_onE1 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_by_b_onE1:: + mov rdi,rcx + mov rsi,rdx + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_mul_by_b_onE1:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + lea rcx,QWORD PTR[BLS12_381_P] + + call __lshift_mod_384 + call __lshift_mod_384 + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_mul_by_b_onE1:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_by_b_onE1:: +mul_by_b_onE1 ENDP + +PUBLIC mul_by_4b_onE1 + + +ALIGN 32 +mul_by_4b_onE1 PROC PUBLIC + DB 243,15,30,250 + mov 
QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_by_4b_onE1:: + mov rdi,rcx + mov rsi,rdx + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_mul_by_4b_onE1:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + lea rcx,QWORD PTR[BLS12_381_P] + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_mul_by_4b_onE1:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_by_4b_onE1:: +mul_by_4b_onE1 ENDP + + +PUBLIC mul_by_3_mod_384x + + +ALIGN 32 +mul_by_3_mod_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_by_3_mod_384x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rsi + +$L$SEH_body_mul_by_3_mod_384x:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + mov rcx,rdx + + call __lshift_mod_384 + + mov rdx,QWORD PTR[rsp] + call __add_mod_384_a_is_loaded + + mov rsi,QWORD PTR[rsp] + lea rdi,QWORD PTR[48+rdi] + + mov r8,QWORD PTR[48+rsi] + mov r9,QWORD PTR[56+rsi] + mov r10,QWORD PTR[64+rsi] + mov r11,QWORD PTR[72+rsi] + mov r12,QWORD PTR[80+rsi] + mov r13,QWORD PTR[88+rsi] + + call __lshift_mod_384 + + mov rdx,8*6 + add rdx,QWORD PTR[rsp] + call __add_mod_384_a_is_loaded + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_mul_by_3_mod_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_by_3_mod_384x:: +mul_by_3_mod_384x ENDP + +PUBLIC mul_by_8_mod_384x + + +ALIGN 32 +mul_by_8_mod_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_by_8_mod_384x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rsi + +$L$SEH_body_mul_by_8_mod_384x:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + mov rcx,rdx + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + mov rsi,QWORD PTR[rsp] + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + mov r8,QWORD PTR[((48+0))+rsi] + mov r9,QWORD PTR[((48+8))+rsi] + mov r10,QWORD PTR[((48+16))+rsi] + mov r11,QWORD PTR[((48+24))+rsi] + mov r12,QWORD PTR[((48+32))+rsi] + mov r13,QWORD PTR[((48+40))+rsi] + + call __lshift_mod_384 
+ call __lshift_mod_384 + call __lshift_mod_384 + + mov QWORD PTR[((48+0))+rdi],r8 + mov QWORD PTR[((48+8))+rdi],r9 + mov QWORD PTR[((48+16))+rdi],r10 + mov QWORD PTR[((48+24))+rdi],r11 + mov QWORD PTR[((48+32))+rdi],r12 + mov QWORD PTR[((48+40))+rdi],r13 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_mul_by_8_mod_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_by_8_mod_384x:: +mul_by_8_mod_384x ENDP + +PUBLIC mul_by_b_onE2 + + +ALIGN 32 +mul_by_b_onE2 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_by_b_onE2:: + mov rdi,rcx + mov rsi,rdx + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rsi + +$L$SEH_body_mul_by_b_onE2:: + + + lea rcx,QWORD PTR[BLS12_381_P] + lea rdx,QWORD PTR[48+rsi] + call __sub_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + mov rsi,QWORD PTR[rsp] + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + lea rdx,QWORD PTR[48+rsi] + lea rdi,QWORD PTR[48+rdi] + call __add_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_mul_by_b_onE2:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_by_b_onE2:: +mul_by_b_onE2 ENDP + +PUBLIC mul_by_4b_onE2 + + +ALIGN 32 +mul_by_4b_onE2 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_by_4b_onE2:: + mov rdi,rcx + mov rsi,rdx + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rsi + +$L$SEH_body_mul_by_4b_onE2:: + + + lea rcx,QWORD PTR[BLS12_381_P] + lea rdx,QWORD PTR[48+rsi] + call __sub_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + mov rsi,QWORD PTR[rsp] + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + lea rdx,QWORD PTR[48+rsi] + lea rdi,QWORD PTR[48+rdi] + call __add_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_mul_by_4b_onE2:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_by_4b_onE2:: +mul_by_4b_onE2 ENDP + + +PUBLIC cneg_mod_384 + + +ALIGN 32 +cneg_mod_384 PROC PUBLIC + DB 
243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_cneg_mod_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rdx + +$L$SEH_body_cneg_mod_384:: + + + mov rdx,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r8,rdx + mov r11,QWORD PTR[24+rsi] + or rdx,r9 + mov r12,QWORD PTR[32+rsi] + or rdx,r10 + mov r13,QWORD PTR[40+rsi] + or rdx,r11 + mov rsi,-1 + or rdx,r12 + or rdx,r13 + + mov r14,QWORD PTR[rcx] + cmovnz rdx,rsi + mov r15,QWORD PTR[8+rcx] + mov rax,QWORD PTR[16+rcx] + and r14,rdx + mov rbx,QWORD PTR[24+rcx] + and r15,rdx + mov rbp,QWORD PTR[32+rcx] + and rax,rdx + mov rsi,QWORD PTR[40+rcx] + and rbx,rdx + mov rcx,QWORD PTR[rsp] + and rbp,rdx + and rsi,rdx + + sub r14,r8 + sbb r15,r9 + sbb rax,r10 + sbb rbx,r11 + sbb rbp,r12 + sbb rsi,r13 + + or rcx,rcx + + cmovz r14,r8 + cmovz r15,r9 + cmovz rax,r10 + mov QWORD PTR[rdi],r14 + cmovz rbx,r11 + mov QWORD PTR[8+rdi],r15 + cmovz rbp,r12 + mov QWORD PTR[16+rdi],rax + cmovz rsi,r13 + mov QWORD PTR[24+rdi],rbx + mov QWORD PTR[32+rdi],rbp + mov QWORD PTR[40+rdi],rsi + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_cneg_mod_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_cneg_mod_384:: +cneg_mod_384 ENDP + + +PUBLIC sub_mod_384 + + +ALIGN 32 +sub_mod_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sub_mod_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_sub_mod_384:: + + + call __sub_mod_384 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sub_mod_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sub_mod_384:: +sub_mod_384 ENDP + + +ALIGN 32 +__sub_mod_384 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + sub r8,QWORD PTR[rdx] + mov r14,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rdx] + mov r15,QWORD PTR[8+rcx] + sbb r10,QWORD PTR[16+rdx] + mov rax,QWORD PTR[16+rcx] + sbb r11,QWORD PTR[24+rdx] + mov rbx,QWORD PTR[24+rcx] + sbb r12,QWORD PTR[32+rdx] + mov rbp,QWORD PTR[32+rcx] + sbb r13,QWORD PTR[40+rdx] + mov rsi,QWORD PTR[40+rcx] + sbb rdx,rdx + + and r14,rdx + and r15,rdx + and rax,rdx + and rbx,rdx + and rbp,rdx + and rsi,rdx + + add r8,r14 + adc r9,r15 + mov QWORD PTR[rdi],r8 + adc r10,rax + mov QWORD PTR[8+rdi],r9 + adc r11,rbx + mov QWORD PTR[16+rdi],r10 + adc r12,rbp + mov QWORD PTR[24+rdi],r11 + adc r13,rsi + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + DB 0F3h,0C3h ;repret +__sub_mod_384 ENDP + +PUBLIC sub_mod_384x + + +ALIGN 32 +sub_mod_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sub_mod_384x:: + mov rdi,rcx + mov rsi,rdx + 
mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,24 + +$L$SEH_body_sub_mod_384x:: + + + mov QWORD PTR[rsp],rsi + mov QWORD PTR[8+rsp],rdx + lea rsi,QWORD PTR[48+rsi] + lea rdx,QWORD PTR[48+rdx] + lea rdi,QWORD PTR[48+rdi] + call __sub_mod_384 + + mov rsi,QWORD PTR[rsp] + mov rdx,QWORD PTR[8+rsp] + lea rdi,QWORD PTR[((-48))+rdi] + call __sub_mod_384 + + mov r15,QWORD PTR[((24+0))+rsp] + + mov r14,QWORD PTR[((24+8))+rsp] + + mov r13,QWORD PTR[((24+16))+rsp] + + mov r12,QWORD PTR[((24+24))+rsp] + + mov rbx,QWORD PTR[((24+32))+rsp] + + mov rbp,QWORD PTR[((24+40))+rsp] + + lea rsp,QWORD PTR[((24+48))+rsp] + +$L$SEH_epilogue_sub_mod_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sub_mod_384x:: +sub_mod_384x ENDP +PUBLIC mul_by_1_plus_i_mod_384x + + +ALIGN 32 +mul_by_1_plus_i_mod_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_by_1_plus_i_mod_384x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,56 + +$L$SEH_body_mul_by_1_plus_i_mod_384x:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov r14,r8 + add r8,QWORD PTR[48+rsi] + mov r15,r9 + adc r9,QWORD PTR[56+rsi] + mov rax,r10 + adc r10,QWORD PTR[64+rsi] + mov rbx,r11 + adc r11,QWORD PTR[72+rsi] + mov rcx,r12 + adc r12,QWORD PTR[80+rsi] + mov rbp,r13 + adc r13,QWORD PTR[88+rsi] + mov QWORD PTR[48+rsp],rdi + sbb rdi,rdi + + sub r14,QWORD PTR[48+rsi] + sbb r15,QWORD PTR[56+rsi] + sbb rax,QWORD PTR[64+rsi] + sbb rbx,QWORD PTR[72+rsi] + sbb rcx,QWORD PTR[80+rsi] + sbb rbp,QWORD PTR[88+rsi] + sbb rsi,rsi + + mov QWORD PTR[rsp],r8 + mov r8,QWORD PTR[rdx] + mov QWORD PTR[8+rsp],r9 + mov r9,QWORD PTR[8+rdx] + mov QWORD PTR[16+rsp],r10 + mov r10,QWORD PTR[16+rdx] + mov QWORD PTR[24+rsp],r11 + mov r11,QWORD PTR[24+rdx] + mov QWORD PTR[32+rsp],r12 + and r8,rsi + mov r12,QWORD PTR[32+rdx] + mov QWORD PTR[40+rsp],r13 + and r9,rsi + mov r13,QWORD PTR[40+rdx] + and r10,rsi + and r11,rsi + and r12,rsi + and r13,rsi + mov rsi,QWORD PTR[48+rsp] + + add r14,r8 + mov r8,QWORD PTR[rsp] + adc r15,r9 + mov r9,QWORD PTR[8+rsp] + adc rax,r10 + mov r10,QWORD PTR[16+rsp] + adc rbx,r11 + mov r11,QWORD PTR[24+rsp] + adc rcx,r12 + mov r12,QWORD PTR[32+rsp] + adc rbp,r13 + mov r13,QWORD PTR[40+rsp] + + mov QWORD PTR[rsi],r14 + mov r14,r8 + mov QWORD PTR[8+rsi],r15 + mov QWORD PTR[16+rsi],rax + mov r15,r9 + mov QWORD PTR[24+rsi],rbx + mov QWORD PTR[32+rsi],rcx + mov rax,r10 + mov QWORD PTR[40+rsi],rbp + + sub r8,QWORD PTR[rdx] + mov rbx,r11 + sbb r9,QWORD PTR[8+rdx] + sbb r10,QWORD PTR[16+rdx] + mov rcx,r12 + sbb r11,QWORD PTR[24+rdx] + sbb r12,QWORD PTR[32+rdx] + mov rbp,r13 + sbb r13,QWORD PTR[40+rdx] + sbb rdi,0 + + cmovc r8,r14 + cmovc r9,r15 + cmovc r10,rax + mov QWORD PTR[48+rsi],r8 + cmovc r11,rbx + mov QWORD PTR[56+rsi],r9 + cmovc r12,rcx + mov QWORD PTR[64+rsi],r10 + cmovc r13,rbp + mov QWORD PTR[72+rsi],r11 + mov QWORD PTR[80+rsi],r12 + mov QWORD PTR[88+rsi],r13 + + mov r15,QWORD PTR[((56+0))+rsp] + + mov r14,QWORD PTR[((56+8))+rsp] + + mov r13,QWORD PTR[((56+16))+rsp] + + mov r12,QWORD PTR[((56+24))+rsp] + + mov rbx,QWORD PTR[((56+32))+rsp] + + mov rbp,QWORD PTR[((56+40))+rsp] + + lea rsp,QWORD PTR[((56+48))+rsp] + 
+$L$SEH_epilogue_mul_by_1_plus_i_mod_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_by_1_plus_i_mod_384x:: +mul_by_1_plus_i_mod_384x ENDP +PUBLIC sgn0_pty_mod_384 + + +ALIGN 32 +sgn0_pty_mod_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sgn0_pty_mod_384:: + mov rdi,rcx + mov rsi,rdx + + + +$L$SEH_body_sgn0_pty_mod_384:: + + mov r8,QWORD PTR[rdi] + mov r9,QWORD PTR[8+rdi] + mov r10,QWORD PTR[16+rdi] + mov r11,QWORD PTR[24+rdi] + mov rcx,QWORD PTR[32+rdi] + mov rdx,QWORD PTR[40+rdi] + + xor rax,rax + mov rdi,r8 + add r8,r8 + adc r9,r9 + adc r10,r10 + adc r11,r11 + adc rcx,rcx + adc rdx,rdx + adc rax,0 + + sub r8,QWORD PTR[rsi] + sbb r9,QWORD PTR[8+rsi] + sbb r10,QWORD PTR[16+rsi] + sbb r11,QWORD PTR[24+rsi] + sbb rcx,QWORD PTR[32+rsi] + sbb rdx,QWORD PTR[40+rsi] + sbb rax,0 + + not rax + and rdi,1 + and rax,2 + or rax,rdi + +$L$SEH_epilogue_sgn0_pty_mod_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sgn0_pty_mod_384:: +sgn0_pty_mod_384 ENDP + +PUBLIC sgn0_pty_mod_384x + + +ALIGN 32 +sgn0_pty_mod_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sgn0_pty_mod_384x:: + mov rdi,rcx + mov rsi,rdx + + + + push rbp + + push rbx + + sub rsp,8 + +$L$SEH_body_sgn0_pty_mod_384x:: + + + mov r8,QWORD PTR[rdi] + mov r9,QWORD PTR[8+rdi] + mov r10,QWORD PTR[16+rdi] + mov r11,QWORD PTR[24+rdi] + mov rcx,QWORD PTR[32+rdi] + mov rdx,QWORD PTR[40+rdi] + + mov rbx,r8 + or r8,r9 + or r8,r10 + or r8,r11 + or r8,rcx + or r8,rdx + + xor rax,rax + mov rbp,rbx + add rbx,rbx + adc r9,r9 + adc r10,r10 + adc r11,r11 + adc rcx,rcx + adc rdx,rdx + adc rax,0 + + sub rbx,QWORD PTR[rsi] + sbb r9,QWORD PTR[8+rsi] + sbb r10,QWORD PTR[16+rsi] + sbb r11,QWORD PTR[24+rsi] + sbb rcx,QWORD PTR[32+rsi] + sbb rdx,QWORD PTR[40+rsi] + sbb rax,0 + + mov QWORD PTR[rsp],r8 + not rax + and rbp,1 + and rax,2 + or rax,rbp + + mov r8,QWORD PTR[48+rdi] + mov r9,QWORD PTR[56+rdi] + mov r10,QWORD PTR[64+rdi] + mov r11,QWORD PTR[72+rdi] + mov rcx,QWORD PTR[80+rdi] + mov rdx,QWORD PTR[88+rdi] + + mov rbx,r8 + or r8,r9 + or r8,r10 + or r8,r11 + or r8,rcx + or r8,rdx + + xor rdi,rdi + mov rbp,rbx + add rbx,rbx + adc r9,r9 + adc r10,r10 + adc r11,r11 + adc rcx,rcx + adc rdx,rdx + adc rdi,0 + + sub rbx,QWORD PTR[rsi] + sbb r9,QWORD PTR[8+rsi] + sbb r10,QWORD PTR[16+rsi] + sbb r11,QWORD PTR[24+rsi] + sbb rcx,QWORD PTR[32+rsi] + sbb rdx,QWORD PTR[40+rsi] + sbb rdi,0 + + mov rbx,QWORD PTR[rsp] + + not rdi + + test r8,r8 + cmovnz rax,rdi + + test rbx,rbx + cmovz rbp,rdi + + and rbp,1 + and rax,2 + or rax,rbp + + mov rbx,QWORD PTR[8+rsp] + + mov rbp,QWORD PTR[16+rsp] + + lea rsp,QWORD PTR[24+rsp] + +$L$SEH_epilogue_sgn0_pty_mod_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sgn0_pty_mod_384x:: +sgn0_pty_mod_384x ENDP +.text$ ENDS +.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_add_mod_384 + DD imagerel $L$SEH_body_add_mod_384 + DD imagerel $L$SEH_info_add_mod_384_prologue + + DD imagerel $L$SEH_body_add_mod_384 + DD imagerel $L$SEH_epilogue_add_mod_384 + DD imagerel $L$SEH_info_add_mod_384_body + + DD imagerel $L$SEH_epilogue_add_mod_384 + DD imagerel $L$SEH_end_add_mod_384 + DD imagerel $L$SEH_info_add_mod_384_epilogue + + DD imagerel 
$L$SEH_begin_add_mod_384x + DD imagerel $L$SEH_body_add_mod_384x + DD imagerel $L$SEH_info_add_mod_384x_prologue + + DD imagerel $L$SEH_body_add_mod_384x + DD imagerel $L$SEH_epilogue_add_mod_384x + DD imagerel $L$SEH_info_add_mod_384x_body + + DD imagerel $L$SEH_epilogue_add_mod_384x + DD imagerel $L$SEH_end_add_mod_384x + DD imagerel $L$SEH_info_add_mod_384x_epilogue + + DD imagerel $L$SEH_begin_lshift_mod_384 + DD imagerel $L$SEH_body_lshift_mod_384 + DD imagerel $L$SEH_info_lshift_mod_384_prologue + + DD imagerel $L$SEH_body_lshift_mod_384 + DD imagerel $L$SEH_epilogue_lshift_mod_384 + DD imagerel $L$SEH_info_lshift_mod_384_body + + DD imagerel $L$SEH_epilogue_lshift_mod_384 + DD imagerel $L$SEH_end_lshift_mod_384 + DD imagerel $L$SEH_info_lshift_mod_384_epilogue + + DD imagerel $L$SEH_begin_mul_by_3_mod_384 + DD imagerel $L$SEH_body_mul_by_3_mod_384 + DD imagerel $L$SEH_info_mul_by_3_mod_384_prologue + + DD imagerel $L$SEH_body_mul_by_3_mod_384 + DD imagerel $L$SEH_epilogue_mul_by_3_mod_384 + DD imagerel $L$SEH_info_mul_by_3_mod_384_body + + DD imagerel $L$SEH_epilogue_mul_by_3_mod_384 + DD imagerel $L$SEH_end_mul_by_3_mod_384 + DD imagerel $L$SEH_info_mul_by_3_mod_384_epilogue + + DD imagerel $L$SEH_begin_mul_by_8_mod_384 + DD imagerel $L$SEH_body_mul_by_8_mod_384 + DD imagerel $L$SEH_info_mul_by_8_mod_384_prologue + + DD imagerel $L$SEH_body_mul_by_8_mod_384 + DD imagerel $L$SEH_epilogue_mul_by_8_mod_384 + DD imagerel $L$SEH_info_mul_by_8_mod_384_body + + DD imagerel $L$SEH_epilogue_mul_by_8_mod_384 + DD imagerel $L$SEH_end_mul_by_8_mod_384 + DD imagerel $L$SEH_info_mul_by_8_mod_384_epilogue + + DD imagerel $L$SEH_begin_mul_by_b_onE1 + DD imagerel $L$SEH_body_mul_by_b_onE1 + DD imagerel $L$SEH_info_mul_by_b_onE1_prologue + + DD imagerel $L$SEH_body_mul_by_b_onE1 + DD imagerel $L$SEH_epilogue_mul_by_b_onE1 + DD imagerel $L$SEH_info_mul_by_b_onE1_body + + DD imagerel $L$SEH_epilogue_mul_by_b_onE1 + DD imagerel $L$SEH_end_mul_by_b_onE1 + DD imagerel $L$SEH_info_mul_by_b_onE1_epilogue + + DD imagerel $L$SEH_begin_mul_by_4b_onE1 + DD imagerel $L$SEH_body_mul_by_4b_onE1 + DD imagerel $L$SEH_info_mul_by_4b_onE1_prologue + + DD imagerel $L$SEH_body_mul_by_4b_onE1 + DD imagerel $L$SEH_epilogue_mul_by_4b_onE1 + DD imagerel $L$SEH_info_mul_by_4b_onE1_body + + DD imagerel $L$SEH_epilogue_mul_by_4b_onE1 + DD imagerel $L$SEH_end_mul_by_4b_onE1 + DD imagerel $L$SEH_info_mul_by_4b_onE1_epilogue + + DD imagerel $L$SEH_begin_mul_by_3_mod_384x + DD imagerel $L$SEH_body_mul_by_3_mod_384x + DD imagerel $L$SEH_info_mul_by_3_mod_384x_prologue + + DD imagerel $L$SEH_body_mul_by_3_mod_384x + DD imagerel $L$SEH_epilogue_mul_by_3_mod_384x + DD imagerel $L$SEH_info_mul_by_3_mod_384x_body + + DD imagerel $L$SEH_epilogue_mul_by_3_mod_384x + DD imagerel $L$SEH_end_mul_by_3_mod_384x + DD imagerel $L$SEH_info_mul_by_3_mod_384x_epilogue + + DD imagerel $L$SEH_begin_mul_by_8_mod_384x + DD imagerel $L$SEH_body_mul_by_8_mod_384x + DD imagerel $L$SEH_info_mul_by_8_mod_384x_prologue + + DD imagerel $L$SEH_body_mul_by_8_mod_384x + DD imagerel $L$SEH_epilogue_mul_by_8_mod_384x + DD imagerel $L$SEH_info_mul_by_8_mod_384x_body + + DD imagerel $L$SEH_epilogue_mul_by_8_mod_384x + DD imagerel $L$SEH_end_mul_by_8_mod_384x + DD imagerel $L$SEH_info_mul_by_8_mod_384x_epilogue + + DD imagerel $L$SEH_begin_mul_by_b_onE2 + DD imagerel $L$SEH_body_mul_by_b_onE2 + DD imagerel $L$SEH_info_mul_by_b_onE2_prologue + + DD imagerel $L$SEH_body_mul_by_b_onE2 + DD imagerel $L$SEH_epilogue_mul_by_b_onE2 + DD imagerel 
$L$SEH_info_mul_by_b_onE2_body + + DD imagerel $L$SEH_epilogue_mul_by_b_onE2 + DD imagerel $L$SEH_end_mul_by_b_onE2 + DD imagerel $L$SEH_info_mul_by_b_onE2_epilogue + + DD imagerel $L$SEH_begin_mul_by_4b_onE2 + DD imagerel $L$SEH_body_mul_by_4b_onE2 + DD imagerel $L$SEH_info_mul_by_4b_onE2_prologue + + DD imagerel $L$SEH_body_mul_by_4b_onE2 + DD imagerel $L$SEH_epilogue_mul_by_4b_onE2 + DD imagerel $L$SEH_info_mul_by_4b_onE2_body + + DD imagerel $L$SEH_epilogue_mul_by_4b_onE2 + DD imagerel $L$SEH_end_mul_by_4b_onE2 + DD imagerel $L$SEH_info_mul_by_4b_onE2_epilogue + + DD imagerel $L$SEH_begin_cneg_mod_384 + DD imagerel $L$SEH_body_cneg_mod_384 + DD imagerel $L$SEH_info_cneg_mod_384_prologue + + DD imagerel $L$SEH_body_cneg_mod_384 + DD imagerel $L$SEH_epilogue_cneg_mod_384 + DD imagerel $L$SEH_info_cneg_mod_384_body + + DD imagerel $L$SEH_epilogue_cneg_mod_384 + DD imagerel $L$SEH_end_cneg_mod_384 + DD imagerel $L$SEH_info_cneg_mod_384_epilogue + + DD imagerel $L$SEH_begin_sub_mod_384 + DD imagerel $L$SEH_body_sub_mod_384 + DD imagerel $L$SEH_info_sub_mod_384_prologue + + DD imagerel $L$SEH_body_sub_mod_384 + DD imagerel $L$SEH_epilogue_sub_mod_384 + DD imagerel $L$SEH_info_sub_mod_384_body + + DD imagerel $L$SEH_epilogue_sub_mod_384 + DD imagerel $L$SEH_end_sub_mod_384 + DD imagerel $L$SEH_info_sub_mod_384_epilogue + + DD imagerel $L$SEH_begin_sub_mod_384x + DD imagerel $L$SEH_body_sub_mod_384x + DD imagerel $L$SEH_info_sub_mod_384x_prologue + + DD imagerel $L$SEH_body_sub_mod_384x + DD imagerel $L$SEH_epilogue_sub_mod_384x + DD imagerel $L$SEH_info_sub_mod_384x_body + + DD imagerel $L$SEH_epilogue_sub_mod_384x + DD imagerel $L$SEH_end_sub_mod_384x + DD imagerel $L$SEH_info_sub_mod_384x_epilogue + + DD imagerel $L$SEH_begin_mul_by_1_plus_i_mod_384x + DD imagerel $L$SEH_body_mul_by_1_plus_i_mod_384x + DD imagerel $L$SEH_info_mul_by_1_plus_i_mod_384x_prologue + + DD imagerel $L$SEH_body_mul_by_1_plus_i_mod_384x + DD imagerel $L$SEH_epilogue_mul_by_1_plus_i_mod_384x + DD imagerel $L$SEH_info_mul_by_1_plus_i_mod_384x_body + + DD imagerel $L$SEH_epilogue_mul_by_1_plus_i_mod_384x + DD imagerel $L$SEH_end_mul_by_1_plus_i_mod_384x + DD imagerel $L$SEH_info_mul_by_1_plus_i_mod_384x_epilogue + + DD imagerel $L$SEH_begin_sgn0_pty_mod_384 + DD imagerel $L$SEH_body_sgn0_pty_mod_384 + DD imagerel $L$SEH_info_sgn0_pty_mod_384_prologue + + DD imagerel $L$SEH_body_sgn0_pty_mod_384 + DD imagerel $L$SEH_epilogue_sgn0_pty_mod_384 + DD imagerel $L$SEH_info_sgn0_pty_mod_384_body + + DD imagerel $L$SEH_epilogue_sgn0_pty_mod_384 + DD imagerel $L$SEH_end_sgn0_pty_mod_384 + DD imagerel $L$SEH_info_sgn0_pty_mod_384_epilogue + + DD imagerel $L$SEH_begin_sgn0_pty_mod_384x + DD imagerel $L$SEH_body_sgn0_pty_mod_384x + DD imagerel $L$SEH_info_sgn0_pty_mod_384x_prologue + + DD imagerel $L$SEH_body_sgn0_pty_mod_384x + DD imagerel $L$SEH_epilogue_sgn0_pty_mod_384x + DD imagerel $L$SEH_info_sgn0_pty_mod_384x_body + + DD imagerel $L$SEH_epilogue_sgn0_pty_mod_384x + DD imagerel $L$SEH_end_sgn0_pty_mod_384x + DD imagerel $L$SEH_info_sgn0_pty_mod_384x_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_add_mod_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_add_mod_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_add_mod_384_epilogue:: +DB 1,0,4,0 +DB 
000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_add_mod_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_add_mod_384x_body:: +DB 1,0,17,0 +DB 000h,0f4h,003h,000h +DB 000h,0e4h,004h,000h +DB 000h,0d4h,005h,000h +DB 000h,0c4h,006h,000h +DB 000h,034h,007h,000h +DB 000h,054h,008h,000h +DB 000h,074h,00ah,000h +DB 000h,064h,00bh,000h +DB 000h,082h +DB 000h,000h +$L$SEH_info_add_mod_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_lshift_mod_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_lshift_mod_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_lshift_mod_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mul_by_3_mod_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mul_by_3_mod_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_mul_by_3_mod_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mul_by_8_mod_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mul_by_8_mod_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_mul_by_8_mod_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mul_by_b_onE1_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mul_by_b_onE1_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_mul_by_b_onE1_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mul_by_4b_onE1_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mul_by_4b_onE1_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_mul_by_4b_onE1_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mul_by_3_mod_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mul_by_3_mod_384x_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_mul_by_3_mod_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + 
+$L$SEH_info_mul_by_8_mod_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mul_by_8_mod_384x_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_mul_by_8_mod_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mul_by_b_onE2_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mul_by_b_onE2_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_mul_by_b_onE2_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mul_by_4b_onE2_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mul_by_4b_onE2_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_mul_by_4b_onE2_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_cneg_mod_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_cneg_mod_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_cneg_mod_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sub_mod_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sub_mod_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_sub_mod_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sub_mod_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sub_mod_384x_body:: +DB 1,0,17,0 +DB 000h,0f4h,003h,000h +DB 000h,0e4h,004h,000h +DB 000h,0d4h,005h,000h +DB 000h,0c4h,006h,000h +DB 000h,034h,007h,000h +DB 000h,054h,008h,000h +DB 000h,074h,00ah,000h +DB 000h,064h,00bh,000h +DB 000h,082h +DB 000h,000h +$L$SEH_info_sub_mod_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mul_by_1_plus_i_mod_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mul_by_1_plus_i_mod_384x_body:: +DB 1,0,17,0 +DB 000h,0f4h,007h,000h +DB 000h,0e4h,008h,000h +DB 000h,0d4h,009h,000h +DB 000h,0c4h,00ah,000h +DB 000h,034h,00bh,000h +DB 000h,054h,00ch,000h +DB 000h,074h,00eh,000h +DB 000h,064h,00fh,000h +DB 000h,0c2h +DB 000h,000h +$L$SEH_info_mul_by_1_plus_i_mod_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sgn0_pty_mod_384_prologue:: +DB 1,0,5,00bh +DB 
0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sgn0_pty_mod_384_body:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h +$L$SEH_info_sgn0_pty_mod_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sgn0_pty_mod_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sgn0_pty_mod_384x_body:: +DB 1,0,9,0 +DB 000h,034h,001h,000h +DB 000h,054h,002h,000h +DB 000h,074h,004h,000h +DB 000h,064h,005h,000h +DB 000h,022h +DB 000h,000h +$L$SEH_info_sgn0_pty_mod_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + + +.xdata ENDS +END diff --git a/build/win64/add_mod_384x384-x86_64.asm b/build/win64/add_mod_384x384-x86_64.asm new file mode 100644 index 00000000..57d1752f --- /dev/null +++ b/build/win64/add_mod_384x384-x86_64.asm @@ -0,0 +1,334 @@ +OPTION DOTNAME +.text$ SEGMENT ALIGN(256) 'CODE' + + +ALIGN 32 +__add_mod_384x384 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + mov r14,QWORD PTR[48+rsi] + + add r8,QWORD PTR[rdx] + mov r15,QWORD PTR[56+rsi] + adc r9,QWORD PTR[8+rdx] + mov rax,QWORD PTR[64+rsi] + adc r10,QWORD PTR[16+rdx] + mov rbx,QWORD PTR[72+rsi] + adc r11,QWORD PTR[24+rdx] + mov rbp,QWORD PTR[80+rsi] + adc r12,QWORD PTR[32+rdx] + mov rsi,QWORD PTR[88+rsi] + adc r13,QWORD PTR[40+rdx] + mov QWORD PTR[rdi],r8 + adc r14,QWORD PTR[48+rdx] + mov QWORD PTR[8+rdi],r9 + adc r15,QWORD PTR[56+rdx] + mov QWORD PTR[16+rdi],r10 + adc rax,QWORD PTR[64+rdx] + mov QWORD PTR[32+rdi],r12 + mov r8,r14 + adc rbx,QWORD PTR[72+rdx] + mov QWORD PTR[24+rdi],r11 + mov r9,r15 + adc rbp,QWORD PTR[80+rdx] + mov QWORD PTR[40+rdi],r13 + mov r10,rax + adc rsi,QWORD PTR[88+rdx] + mov r11,rbx + sbb rdx,rdx + + sub r14,QWORD PTR[rcx] + sbb r15,QWORD PTR[8+rcx] + mov r12,rbp + sbb rax,QWORD PTR[16+rcx] + sbb rbx,QWORD PTR[24+rcx] + sbb rbp,QWORD PTR[32+rcx] + mov r13,rsi + sbb rsi,QWORD PTR[40+rcx] + sbb rdx,0 + + cmovc r14,r8 + cmovc r15,r9 + cmovc rax,r10 + mov QWORD PTR[48+rdi],r14 + cmovc rbx,r11 + mov QWORD PTR[56+rdi],r15 + cmovc rbp,r12 + mov QWORD PTR[64+rdi],rax + cmovc rsi,r13 + mov QWORD PTR[72+rdi],rbx + mov QWORD PTR[80+rdi],rbp + mov QWORD PTR[88+rdi],rsi + + DB 0F3h,0C3h ;repret +__add_mod_384x384 ENDP + + +ALIGN 32 +__sub_mod_384x384 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + mov r14,QWORD PTR[48+rsi] + + sub r8,QWORD PTR[rdx] + mov r15,QWORD PTR[56+rsi] + sbb r9,QWORD PTR[8+rdx] + mov rax,QWORD PTR[64+rsi] + sbb r10,QWORD PTR[16+rdx] + mov rbx,QWORD PTR[72+rsi] + sbb r11,QWORD PTR[24+rdx] + mov rbp,QWORD PTR[80+rsi] + sbb r12,QWORD PTR[32+rdx] + mov rsi,QWORD PTR[88+rsi] + sbb r13,QWORD PTR[40+rdx] + mov QWORD PTR[rdi],r8 + sbb r14,QWORD PTR[48+rdx] + mov r8,QWORD PTR[rcx] + mov QWORD PTR[8+rdi],r9 + sbb r15,QWORD PTR[56+rdx] + mov r9,QWORD PTR[8+rcx] + mov QWORD PTR[16+rdi],r10 + sbb rax,QWORD PTR[64+rdx] + mov r10,QWORD PTR[16+rcx] + mov QWORD PTR[24+rdi],r11 + sbb rbx,QWORD PTR[72+rdx] + mov r11,QWORD PTR[24+rcx] + mov QWORD PTR[32+rdi],r12 + sbb rbp,QWORD PTR[80+rdx] + mov r12,QWORD PTR[32+rcx] + mov QWORD PTR[40+rdi],r13 + sbb rsi,QWORD PTR[88+rdx] + mov r13,QWORD PTR[40+rcx] + sbb rdx,rdx + + and r8,rdx + and 
r9,rdx + and r10,rdx + and r11,rdx + and r12,rdx + and r13,rdx + + add r14,r8 + adc r15,r9 + mov QWORD PTR[48+rdi],r14 + adc rax,r10 + mov QWORD PTR[56+rdi],r15 + adc rbx,r11 + mov QWORD PTR[64+rdi],rax + adc rbp,r12 + mov QWORD PTR[72+rdi],rbx + adc rsi,r13 + mov QWORD PTR[80+rdi],rbp + mov QWORD PTR[88+rdi],rsi + + DB 0F3h,0C3h ;repret +__sub_mod_384x384 ENDP + +PUBLIC add_mod_384x384 + + +ALIGN 32 +add_mod_384x384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_add_mod_384x384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_add_mod_384x384:: + + + call __add_mod_384x384 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_add_mod_384x384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_add_mod_384x384:: +add_mod_384x384 ENDP + +PUBLIC sub_mod_384x384 + + +ALIGN 32 +sub_mod_384x384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sub_mod_384x384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_sub_mod_384x384:: + + + call __sub_mod_384x384 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sub_mod_384x384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sub_mod_384x384:: +sub_mod_384x384 ENDP +.text$ ENDS +.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_add_mod_384x384 + DD imagerel $L$SEH_body_add_mod_384x384 + DD imagerel $L$SEH_info_add_mod_384x384_prologue + + DD imagerel $L$SEH_body_add_mod_384x384 + DD imagerel $L$SEH_epilogue_add_mod_384x384 + DD imagerel $L$SEH_info_add_mod_384x384_body + + DD imagerel $L$SEH_epilogue_add_mod_384x384 + DD imagerel $L$SEH_end_add_mod_384x384 + DD imagerel $L$SEH_info_add_mod_384x384_epilogue + + DD imagerel $L$SEH_begin_sub_mod_384x384 + DD imagerel $L$SEH_body_sub_mod_384x384 + DD imagerel $L$SEH_info_sub_mod_384x384_prologue + + DD imagerel $L$SEH_body_sub_mod_384x384 + DD imagerel $L$SEH_epilogue_sub_mod_384x384 + DD imagerel $L$SEH_info_sub_mod_384x384_body + + DD imagerel $L$SEH_epilogue_sub_mod_384x384 + DD imagerel $L$SEH_end_sub_mod_384x384 + DD imagerel $L$SEH_info_sub_mod_384x384_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_add_mod_384x384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_add_mod_384x384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_add_mod_384x384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sub_mod_384x384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 
+$L$SEH_info_sub_mod_384x384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_sub_mod_384x384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + + +.xdata ENDS +END diff --git a/build/win64/inverse_mod_384-x86_64.asm b/build/win64/inverse_mod_384-x86_64.asm new file mode 100644 index 00000000..08407802 --- /dev/null +++ b/build/win64/inverse_mod_384-x86_64.asm @@ -0,0 +1,419 @@ +OPTION DOTNAME +.text$ SEGMENT ALIGN(256) 'CODE' + +ALIGN 32 +$L$one:: + DQ 1,0,0,0,0,0,0,0 + +PUBLIC eucl_inverse_mod_384 + + +ALIGN 32 +eucl_inverse_mod_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_eucl_inverse_mod_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,216 + +$L$SEH_body_eucl_inverse_mod_384:: + + + mov QWORD PTR[rsp],rdi + lea rbp,QWORD PTR[$L$one] + cmp rcx,0 + cmove rcx,rbp + + mov rax,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov r8,rax + or rax,r9 + or rax,r10 + or rax,r11 + or rax,r12 + or rax,r13 + jz $L$abort + + lea rsi,QWORD PTR[16+rsp] + mov r14,QWORD PTR[rcx] + mov r15,QWORD PTR[8+rcx] + mov rax,QWORD PTR[16+rcx] + mov rbx,QWORD PTR[24+rcx] + mov rbp,QWORD PTR[32+rcx] + mov rdi,QWORD PTR[40+rcx] + + mov QWORD PTR[rsi],r8 + mov QWORD PTR[8+rsi],r9 + mov QWORD PTR[16+rsi],r10 + mov QWORD PTR[24+rsi],r11 + mov QWORD PTR[32+rsi],r12 + mov QWORD PTR[40+rsi],r13 + + lea rcx,QWORD PTR[112+rsp] + mov r8,QWORD PTR[rdx] + mov r9,QWORD PTR[8+rdx] + mov r10,QWORD PTR[16+rdx] + mov r11,QWORD PTR[24+rdx] + mov r12,QWORD PTR[32+rdx] + mov r13,QWORD PTR[40+rdx] + + mov QWORD PTR[48+rsi],r14 + mov QWORD PTR[56+rsi],r15 + mov QWORD PTR[64+rsi],rax + mov QWORD PTR[72+rsi],rbx + mov QWORD PTR[80+rsi],rbp + mov QWORD PTR[88+rsi],rdi + + mov QWORD PTR[rcx],r8 + mov QWORD PTR[8+rcx],r9 + mov QWORD PTR[16+rcx],r10 + mov QWORD PTR[24+rcx],r11 + mov QWORD PTR[32+rcx],r12 + mov QWORD PTR[40+rcx],r13 + + xor eax,eax + mov QWORD PTR[48+rcx],rax + mov QWORD PTR[56+rcx],rax + mov QWORD PTR[64+rcx],rax + mov QWORD PTR[72+rcx],rax + mov QWORD PTR[80+rcx],rax + mov QWORD PTR[88+rcx],rax + jmp $L$oop_inv + +ALIGN 32 +$L$oop_inv:: + lea rsi,QWORD PTR[112+rsp] + call __remove_powers_of_2 + + lea rsi,QWORD PTR[16+rsp] + call __remove_powers_of_2 + + lea rcx,QWORD PTR[112+rsp] + sub r8,QWORD PTR[((112+0))+rsp] + sbb r9,QWORD PTR[8+rcx] + sbb r10,QWORD PTR[16+rcx] + sbb r11,QWORD PTR[24+rcx] + sbb r12,QWORD PTR[32+rcx] + sbb r13,QWORD PTR[40+rcx] + jae $L$u_greater_than_v + + + xchg rsi,rcx + + not r8 + not r9 + not r10 + not r11 + not r12 + not r13 + + add r8,1 + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + +$L$u_greater_than_v:: + mov r14,QWORD PTR[48+rsi] + mov r15,QWORD PTR[56+rsi] + mov rax,QWORD PTR[64+rsi] + mov rbx,QWORD PTR[72+rsi] + mov rbp,QWORD PTR[80+rsi] + mov rdi,QWORD PTR[88+rsi] + + sub r14,QWORD PTR[48+rcx] + sbb r15,QWORD PTR[56+rcx] + sbb rax,QWORD PTR[64+rcx] + sbb rbx,QWORD PTR[72+rcx] + sbb rbp,QWORD PTR[80+rcx] + sbb rdi,QWORD PTR[88+rcx] + + mov QWORD PTR[rsi],r8 + sbb r8,r8 + mov QWORD PTR[8+rsi],r9 + mov r9,r8 + mov QWORD PTR[16+rsi],r10 + 
mov r10,r8 + mov QWORD PTR[24+rsi],r11 + mov r11,r8 + mov QWORD PTR[32+rsi],r12 + mov r12,r8 + mov QWORD PTR[40+rsi],r13 + mov r13,r8 + + and r8,QWORD PTR[rdx] + and r9,QWORD PTR[8+rdx] + and r10,QWORD PTR[16+rdx] + and r11,QWORD PTR[24+rdx] + and r12,QWORD PTR[32+rdx] + and r13,QWORD PTR[40+rdx] + + add r14,r8 + adc r15,r9 + adc rax,r10 + adc rbx,r11 + adc rbp,r12 + adc rdi,r13 + + mov QWORD PTR[48+rsi],r14 + mov QWORD PTR[56+rsi],r15 + mov QWORD PTR[64+rsi],rax + mov QWORD PTR[72+rsi],rbx + mov QWORD PTR[80+rsi],rbp + mov QWORD PTR[88+rsi],rdi + + mov r8,QWORD PTR[((16+0))+rsp] + mov r9,QWORD PTR[((16+8))+rsp] + mov r10,QWORD PTR[((16+16))+rsp] + mov r11,QWORD PTR[((16+24))+rsp] + or r8,r9 + or r10,QWORD PTR[((16+32))+rsp] + or r11,QWORD PTR[((16+40))+rsp] +DB 067h + or r8,r10 + or r8,r11 + jnz $L$oop_inv + + lea rsi,QWORD PTR[112+rsp] + mov rdi,QWORD PTR[rsp] + mov eax,1 + + mov r8,QWORD PTR[48+rsi] + mov r9,QWORD PTR[56+rsi] + mov r10,QWORD PTR[64+rsi] + mov r11,QWORD PTR[72+rsi] + mov r12,QWORD PTR[80+rsi] + mov r13,QWORD PTR[88+rsi] + +$L$abort:: + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + lea r8,QWORD PTR[216+rsp] + mov r15,QWORD PTR[r8] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_eucl_inverse_mod_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_eucl_inverse_mod_384:: +eucl_inverse_mod_384 ENDP + + +ALIGN 32 +__remove_powers_of_2 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + +$L$oop_of_2:: + bsf rcx,r8 + mov eax,63 + cmovz ecx,eax + + cmp ecx,0 + je $L$oop_of_2_done + + shr r8,cl + mov r14,r9 + shr r9,cl + mov r15,r10 + shr r10,cl + mov rax,r11 + shr r11,cl + mov rbx,r12 + shr r12,cl + mov rbp,r13 + shr r13,cl + neg cl + shl r14,cl + shl r15,cl + or r8,r14 + mov r14,QWORD PTR[48+rsi] + shl rax,cl + or r9,r15 + mov r15,QWORD PTR[56+rsi] + shl rbx,cl + or r10,rax + mov rax,QWORD PTR[64+rsi] + shl rbp,cl + or r11,rbx + mov rbx,QWORD PTR[72+rsi] + or r12,rbp + mov rbp,QWORD PTR[80+rsi] + neg cl + mov rdi,QWORD PTR[88+rsi] + + mov QWORD PTR[rsi],r8 + mov QWORD PTR[8+rsi],r9 + mov QWORD PTR[16+rsi],r10 + mov QWORD PTR[24+rsi],r11 + mov QWORD PTR[32+rsi],r12 + mov QWORD PTR[40+rsi],r13 + jmp $L$oop_div_by_2 + +ALIGN 32 +$L$oop_div_by_2:: + mov r13,1 + mov r8,QWORD PTR[rdx] + and r13,r14 + mov r9,QWORD PTR[8+rdx] + neg r13 + mov r10,QWORD PTR[16+rdx] + and r8,r13 + mov r11,QWORD PTR[24+rdx] + and r9,r13 + mov r12,QWORD PTR[32+rdx] + and r10,r13 + and r11,r13 + and r12,r13 + and r13,QWORD PTR[40+rdx] + + add r14,r8 + adc r15,r9 + adc rax,r10 + adc rbx,r11 + adc rbp,r12 + adc rdi,r13 + sbb r13,r13 + + shr r14,1 + mov r8,r15 + shr r15,1 + mov r9,rax + shr rax,1 + mov r10,rbx + shr rbx,1 + mov r11,rbp + shr rbp,1 + mov r12,rdi + shr rdi,1 + shl r8,63 + shl r9,63 + or r14,r8 + shl r10,63 + or r15,r9 + shl r11,63 + or rax,r10 + shl r12,63 + or rbx,r11 + shl r13,63 + or rbp,r12 + or rdi,r13 + + dec ecx + jnz $L$oop_div_by_2 + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov QWORD PTR[48+rsi],r14 + mov 
QWORD PTR[56+rsi],r15 + mov QWORD PTR[64+rsi],rax + mov QWORD PTR[72+rsi],rbx + mov QWORD PTR[80+rsi],rbp + mov QWORD PTR[88+rsi],rdi + + test r8,1 +DB 02eh + jz $L$oop_of_2 + +$L$oop_of_2_done:: + DB 0F3h,0C3h ;repret +__remove_powers_of_2 ENDP +.text$ ENDS +.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_eucl_inverse_mod_384 + DD imagerel $L$SEH_body_eucl_inverse_mod_384 + DD imagerel $L$SEH_info_eucl_inverse_mod_384_prologue + + DD imagerel $L$SEH_body_eucl_inverse_mod_384 + DD imagerel $L$SEH_epilogue_eucl_inverse_mod_384 + DD imagerel $L$SEH_info_eucl_inverse_mod_384_body + + DD imagerel $L$SEH_epilogue_eucl_inverse_mod_384 + DD imagerel $L$SEH_end_eucl_inverse_mod_384 + DD imagerel $L$SEH_info_eucl_inverse_mod_384_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_eucl_inverse_mod_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_eucl_inverse_mod_384_body:: +DB 1,0,18,0 +DB 000h,0f4h,01bh,000h +DB 000h,0e4h,01ch,000h +DB 000h,0d4h,01dh,000h +DB 000h,0c4h,01eh,000h +DB 000h,034h,01fh,000h +DB 000h,054h,020h,000h +DB 000h,074h,022h,000h +DB 000h,064h,023h,000h +DB 000h,001h,021h,000h +$L$SEH_info_eucl_inverse_mod_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + + +.xdata ENDS +END diff --git a/build/win64/mulq_mont_256-x86_64.asm b/build/win64/mulq_mont_256-x86_64.asm new file mode 100644 index 00000000..c3bf8634 --- /dev/null +++ b/build/win64/mulq_mont_256-x86_64.asm @@ -0,0 +1,884 @@ +OPTION DOTNAME +.text$ SEGMENT ALIGN(256) 'CODE' + +PUBLIC mul_mont_sparse_256 + + +ALIGN 32 +mul_mont_sparse_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_mont_sparse_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD PTR[40+rsp] + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rdi + +$L$SEH_body_mul_mont_sparse_256:: + + + mov rax,QWORD PTR[rdx] + mov r13,QWORD PTR[rsi] + mov r14,QWORD PTR[8+rsi] + mov r12,QWORD PTR[16+rsi] + mov rbp,QWORD PTR[24+rsi] + mov rbx,rdx + + mov r15,rax + mul r13 + mov r9,rax + mov rax,r15 + mov r10,rdx + call __mulq_mont_sparse_256 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_mul_mont_sparse_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_mont_sparse_256:: +mul_mont_sparse_256 ENDP + +PUBLIC sqr_mont_sparse_256 + + +ALIGN 32 +sqr_mont_sparse_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqr_mont_sparse_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rdi + +$L$SEH_body_sqr_mont_sparse_256:: + + + mov rax,QWORD PTR[rsi] + mov r8,rcx + mov r14,QWORD PTR[8+rsi] + mov rcx,rdx + mov r12,QWORD PTR[16+rsi] + lea rbx,QWORD PTR[rsi] + mov rbp,QWORD PTR[24+rsi] + + mov r15,rax + mul rax + mov r9,rax + mov rax,r15 + mov r10,rdx + call __mulq_mont_sparse_256 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD 
PTR[56+rsp] + +$L$SEH_epilogue_sqr_mont_sparse_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqr_mont_sparse_256:: +sqr_mont_sparse_256 ENDP + +ALIGN 32 +__mulq_mont_sparse_256 PROC PRIVATE + DB 243,15,30,250 + mul r14 + add r10,rax + mov rax,r15 + adc rdx,0 + mov r11,rdx + + mul r12 + add r11,rax + mov rax,r15 + adc rdx,0 + mov r12,rdx + + mul rbp + add r12,rax + mov rax,QWORD PTR[8+rbx] + adc rdx,0 + xor r14,r14 + mov r13,rdx + + mov rdi,r9 + imul r9,r8 + + + mov r15,rax + mul QWORD PTR[rsi] + add r10,rax + mov rax,r15 + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[8+rsi] + add r11,rax + mov rax,r15 + adc rdx,0 + add r11,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rsi] + add r12,rax + mov rax,r15 + adc rdx,0 + add r12,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rsi] + add r13,rax + mov rax,r9 + adc rdx,0 + add r13,rbp + adc r14,rdx + xor r15,r15 + + + mul QWORD PTR[rcx] + add rdi,rax + mov rax,r9 + adc rdi,rdx + + mul QWORD PTR[8+rcx] + add r10,rax + mov rax,r9 + adc rdx,0 + add r10,rdi + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rcx] + add r11,rax + mov rax,r9 + adc rdx,0 + add r11,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rcx] + add r12,rax + mov rax,QWORD PTR[16+rbx] + adc rdx,0 + add r12,rbp + adc rdx,0 + add r13,rdx + adc r14,0 + adc r15,0 + mov rdi,r10 + imul r10,r8 + + + mov r9,rax + mul QWORD PTR[rsi] + add r11,rax + mov rax,r9 + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[8+rsi] + add r12,rax + mov rax,r9 + adc rdx,0 + add r12,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rsi] + add r13,rax + mov rax,r9 + adc rdx,0 + add r13,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rsi] + add r14,rax + mov rax,r10 + adc rdx,0 + add r14,rbp + adc r15,rdx + xor r9,r9 + + + mul QWORD PTR[rcx] + add rdi,rax + mov rax,r10 + adc rdi,rdx + + mul QWORD PTR[8+rcx] + add r11,rax + mov rax,r10 + adc rdx,0 + add r11,rdi + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rcx] + add r12,rax + mov rax,r10 + adc rdx,0 + add r12,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rcx] + add r13,rax + mov rax,QWORD PTR[24+rbx] + adc rdx,0 + add r13,rbp + adc rdx,0 + add r14,rdx + adc r15,0 + adc r9,0 + mov rdi,r11 + imul r11,r8 + + + mov r10,rax + mul QWORD PTR[rsi] + add r12,rax + mov rax,r10 + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[8+rsi] + add r13,rax + mov rax,r10 + adc rdx,0 + add r13,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rsi] + add r14,rax + mov rax,r10 + adc rdx,0 + add r14,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rsi] + add r15,rax + mov rax,r11 + adc rdx,0 + add r15,rbp + adc r9,rdx + xor r10,r10 + + + mul QWORD PTR[rcx] + add rdi,rax + mov rax,r11 + adc rdi,rdx + + mul QWORD PTR[8+rcx] + add r12,rax + mov rax,r11 + adc rdx,0 + add r12,rdi + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rcx] + add r13,rax + mov rax,r11 + adc rdx,0 + add r13,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rcx] + add r14,rax + mov rax,r12 + adc rdx,0 + add r14,rbp + adc rdx,0 + add r15,rdx + adc r9,0 + adc r10,0 + imul rax,r8 + mov rsi,QWORD PTR[8+rsp] + + + mov r11,rax + mul QWORD PTR[rcx] + add r12,rax + mov rax,r11 + adc r12,rdx + + mul QWORD PTR[8+rcx] + add r13,rax + mov rax,r11 + adc rdx,0 + add r13,r12 + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rcx] + add r14,rax + mov rax,r11 + adc rdx,0 + add r14,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rcx] + mov rbx,r14 + add r15,rbp + adc rdx,0 + add r15,rax + mov rax,r13 + adc rdx,0 + add r9,rdx + adc r10,0 + + + + + mov r12,r15 + sub 
r13,QWORD PTR[rcx] + sbb r14,QWORD PTR[8+rcx] + sbb r15,QWORD PTR[16+rcx] + mov rbp,r9 + sbb r9,QWORD PTR[24+rcx] + sbb r10,0 + + cmovc r13,rax + cmovc r14,rbx + cmovc r15,r12 + mov QWORD PTR[rsi],r13 + cmovc r9,rbp + mov QWORD PTR[8+rsi],r14 + mov QWORD PTR[16+rsi],r15 + mov QWORD PTR[24+rsi],r9 + + DB 0F3h,0C3h ;repret + +__mulq_mont_sparse_256 ENDP +PUBLIC from_mont_256 + + +ALIGN 32 +from_mont_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_from_mont_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_from_mont_256:: + + + mov rbx,rdx + call __mulq_by_1_mont_256 + + + + + + mov r10,r14 + mov r11,r15 + mov r12,r9 + + sub r13,QWORD PTR[rbx] + sbb r14,QWORD PTR[8+rbx] + sbb r15,QWORD PTR[16+rbx] + sbb r9,QWORD PTR[24+rbx] + + cmovnc rax,r13 + cmovnc r10,r14 + cmovnc r11,r15 + mov QWORD PTR[rdi],rax + cmovnc r12,r9 + mov QWORD PTR[8+rdi],r10 + mov QWORD PTR[16+rdi],r11 + mov QWORD PTR[24+rdi],r12 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_from_mont_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_from_mont_256:: +from_mont_256 ENDP + +PUBLIC redc_mont_256 + + +ALIGN 32 +redc_mont_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_redc_mont_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_redc_mont_256:: + + + mov rbx,rdx + call __mulq_by_1_mont_256 + + add r13,QWORD PTR[32+rsi] + adc r14,QWORD PTR[40+rsi] + mov rax,r13 + adc r15,QWORD PTR[48+rsi] + mov r10,r14 + adc r9,QWORD PTR[56+rsi] + sbb rsi,rsi + + + + + mov r11,r15 + sub r13,QWORD PTR[rbx] + sbb r14,QWORD PTR[8+rbx] + sbb r15,QWORD PTR[16+rbx] + mov r12,r9 + sbb r9,QWORD PTR[24+rbx] + sbb rsi,0 + + cmovnc rax,r13 + cmovnc r10,r14 + cmovnc r11,r15 + mov QWORD PTR[rdi],rax + cmovnc r12,r9 + mov QWORD PTR[8+rdi],r10 + mov QWORD PTR[16+rdi],r11 + mov QWORD PTR[24+rdi],r12 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_redc_mont_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_redc_mont_256:: +redc_mont_256 ENDP + +ALIGN 32 +__mulq_by_1_mont_256 PROC PRIVATE + DB 243,15,30,250 + mov rax,QWORD PTR[rsi] + mov r10,QWORD PTR[8+rsi] + mov r11,QWORD PTR[16+rsi] + mov r12,QWORD PTR[24+rsi] + + mov r13,rax + imul rax,rcx + mov r9,rax + + mul QWORD PTR[rbx] + add r13,rax + mov rax,r9 + adc r13,rdx + + mul QWORD PTR[8+rbx] + add r10,rax + mov rax,r9 + adc rdx,0 + add r10,r13 + adc rdx,0 + mov r13,rdx + + mul QWORD PTR[16+rbx] + mov r14,r10 + imul r10,rcx + add r11,rax + mov rax,r9 + adc rdx,0 + add r11,r13 + adc rdx,0 + mov r13,rdx + + mul QWORD PTR[24+rbx] + add r12,rax + mov rax,r10 + adc rdx,0 + add r12,r13 + adc rdx,0 + mov r13,rdx + + mul QWORD PTR[rbx] + add r14,rax + mov rax,r10 + adc r14,rdx + + mul QWORD PTR[8+rbx] + add r11,rax + mov rax,r10 + 
adc rdx,0 + add r11,r14 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[16+rbx] + mov r15,r11 + imul r11,rcx + add r12,rax + mov rax,r10 + adc rdx,0 + add r12,r14 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[24+rbx] + add r13,rax + mov rax,r11 + adc rdx,0 + add r13,r14 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[rbx] + add r15,rax + mov rax,r11 + adc r15,rdx + + mul QWORD PTR[8+rbx] + add r12,rax + mov rax,r11 + adc rdx,0 + add r12,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[16+rbx] + mov r9,r12 + imul r12,rcx + add r13,rax + mov rax,r11 + adc rdx,0 + add r13,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[24+rbx] + add r14,rax + mov rax,r12 + adc rdx,0 + add r14,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[rbx] + add r9,rax + mov rax,r12 + adc r9,rdx + + mul QWORD PTR[8+rbx] + add r13,rax + mov rax,r12 + adc rdx,0 + add r13,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[16+rbx] + add r14,rax + mov rax,r12 + adc rdx,0 + add r14,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[24+rbx] + add r15,rax + mov rax,r13 + adc rdx,0 + add r15,r9 + adc rdx,0 + mov r9,rdx + DB 0F3h,0C3h ;repret +__mulq_by_1_mont_256 ENDP +.text$ ENDS +.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_mul_mont_sparse_256 + DD imagerel $L$SEH_body_mul_mont_sparse_256 + DD imagerel $L$SEH_info_mul_mont_sparse_256_prologue + + DD imagerel $L$SEH_body_mul_mont_sparse_256 + DD imagerel $L$SEH_epilogue_mul_mont_sparse_256 + DD imagerel $L$SEH_info_mul_mont_sparse_256_body + + DD imagerel $L$SEH_epilogue_mul_mont_sparse_256 + DD imagerel $L$SEH_end_mul_mont_sparse_256 + DD imagerel $L$SEH_info_mul_mont_sparse_256_epilogue + + DD imagerel $L$SEH_begin_sqr_mont_sparse_256 + DD imagerel $L$SEH_body_sqr_mont_sparse_256 + DD imagerel $L$SEH_info_sqr_mont_sparse_256_prologue + + DD imagerel $L$SEH_body_sqr_mont_sparse_256 + DD imagerel $L$SEH_epilogue_sqr_mont_sparse_256 + DD imagerel $L$SEH_info_sqr_mont_sparse_256_body + + DD imagerel $L$SEH_epilogue_sqr_mont_sparse_256 + DD imagerel $L$SEH_end_sqr_mont_sparse_256 + DD imagerel $L$SEH_info_sqr_mont_sparse_256_epilogue + + DD imagerel $L$SEH_begin_from_mont_256 + DD imagerel $L$SEH_body_from_mont_256 + DD imagerel $L$SEH_info_from_mont_256_prologue + + DD imagerel $L$SEH_body_from_mont_256 + DD imagerel $L$SEH_epilogue_from_mont_256 + DD imagerel $L$SEH_info_from_mont_256_body + + DD imagerel $L$SEH_epilogue_from_mont_256 + DD imagerel $L$SEH_end_from_mont_256 + DD imagerel $L$SEH_info_from_mont_256_epilogue + + DD imagerel $L$SEH_begin_redc_mont_256 + DD imagerel $L$SEH_body_redc_mont_256 + DD imagerel $L$SEH_info_redc_mont_256_prologue + + DD imagerel $L$SEH_body_redc_mont_256 + DD imagerel $L$SEH_epilogue_redc_mont_256 + DD imagerel $L$SEH_info_redc_mont_256_body + + DD imagerel $L$SEH_epilogue_redc_mont_256 + DD imagerel $L$SEH_end_redc_mont_256 + DD imagerel $L$SEH_info_redc_mont_256_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_mul_mont_sparse_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mul_mont_sparse_256_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_mul_mont_sparse_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqr_mont_sparse_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 
+$L$SEH_info_sqr_mont_sparse_256_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_sqr_mont_sparse_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_from_mont_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_from_mont_256_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_from_mont_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_redc_mont_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_redc_mont_256_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_redc_mont_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + + +.xdata ENDS +END diff --git a/build/win64/mulq_mont_384-x86_64.asm b/build/win64/mulq_mont_384-x86_64.asm new file mode 100644 index 00000000..c6d6d290 --- /dev/null +++ b/build/win64/mulq_mont_384-x86_64.asm @@ -0,0 +1,4232 @@ +OPTION DOTNAME +.text$ SEGMENT ALIGN(256) 'CODE' + + + + + + + + +ALIGN 32 +__sub_mod_384x384 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + mov r14,QWORD PTR[48+rsi] + + sub r8,QWORD PTR[rdx] + mov r15,QWORD PTR[56+rsi] + sbb r9,QWORD PTR[8+rdx] + mov rax,QWORD PTR[64+rsi] + sbb r10,QWORD PTR[16+rdx] + mov rbx,QWORD PTR[72+rsi] + sbb r11,QWORD PTR[24+rdx] + mov rbp,QWORD PTR[80+rsi] + sbb r12,QWORD PTR[32+rdx] + mov rsi,QWORD PTR[88+rsi] + sbb r13,QWORD PTR[40+rdx] + mov QWORD PTR[rdi],r8 + sbb r14,QWORD PTR[48+rdx] + mov r8,QWORD PTR[rcx] + mov QWORD PTR[8+rdi],r9 + sbb r15,QWORD PTR[56+rdx] + mov r9,QWORD PTR[8+rcx] + mov QWORD PTR[16+rdi],r10 + sbb rax,QWORD PTR[64+rdx] + mov r10,QWORD PTR[16+rcx] + mov QWORD PTR[24+rdi],r11 + sbb rbx,QWORD PTR[72+rdx] + mov r11,QWORD PTR[24+rcx] + mov QWORD PTR[32+rdi],r12 + sbb rbp,QWORD PTR[80+rdx] + mov r12,QWORD PTR[32+rcx] + mov QWORD PTR[40+rdi],r13 + sbb rsi,QWORD PTR[88+rdx] + mov r13,QWORD PTR[40+rcx] + sbb rdx,rdx + + and r8,rdx + and r9,rdx + and r10,rdx + and r11,rdx + and r12,rdx + and r13,rdx + + add r14,r8 + adc r15,r9 + mov QWORD PTR[48+rdi],r14 + adc rax,r10 + mov QWORD PTR[56+rdi],r15 + adc rbx,r11 + mov QWORD PTR[64+rdi],rax + adc rbp,r12 + mov QWORD PTR[72+rdi],rbx + adc rsi,r13 + mov QWORD PTR[80+rdi],rbp + mov QWORD PTR[88+rdi],rsi + + DB 0F3h,0C3h ;repret +__sub_mod_384x384 ENDP + + +ALIGN 32 +__add_mod_384 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + add r8,QWORD PTR[rdx] + adc r9,QWORD PTR[8+rdx] + adc r10,QWORD PTR[16+rdx] + mov r14,r8 + adc r11,QWORD PTR[24+rdx] + mov r15,r9 + adc r12,QWORD PTR[32+rdx] + mov rax,r10 + adc r13,QWORD 
PTR[40+rdx] + mov rbx,r11 + sbb rdx,rdx + + sub r8,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rcx] + mov rbp,r12 + sbb r10,QWORD PTR[16+rcx] + sbb r11,QWORD PTR[24+rcx] + sbb r12,QWORD PTR[32+rcx] + mov rsi,r13 + sbb r13,QWORD PTR[40+rcx] + sbb rdx,0 + + cmovc r8,r14 + cmovc r9,r15 + cmovc r10,rax + mov QWORD PTR[rdi],r8 + cmovc r11,rbx + mov QWORD PTR[8+rdi],r9 + cmovc r12,rbp + mov QWORD PTR[16+rdi],r10 + cmovc r13,rsi + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + DB 0F3h,0C3h ;repret +__add_mod_384 ENDP + + +ALIGN 32 +__sub_mod_384 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + +__sub_mod_384_a_is_loaded:: + sub r8,QWORD PTR[rdx] + mov r14,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rdx] + mov r15,QWORD PTR[8+rcx] + sbb r10,QWORD PTR[16+rdx] + mov rax,QWORD PTR[16+rcx] + sbb r11,QWORD PTR[24+rdx] + mov rbx,QWORD PTR[24+rcx] + sbb r12,QWORD PTR[32+rdx] + mov rbp,QWORD PTR[32+rcx] + sbb r13,QWORD PTR[40+rdx] + mov rsi,QWORD PTR[40+rcx] + sbb rdx,rdx + + and r14,rdx + and r15,rdx + and rax,rdx + and rbx,rdx + and rbp,rdx + and rsi,rdx + + add r8,r14 + adc r9,r15 + mov QWORD PTR[rdi],r8 + adc r10,rax + mov QWORD PTR[8+rdi],r9 + adc r11,rbx + mov QWORD PTR[16+rdi],r10 + adc r12,rbp + mov QWORD PTR[24+rdi],r11 + adc r13,rsi + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + DB 0F3h,0C3h ;repret +__sub_mod_384 ENDP +PUBLIC mul_mont_384x + + +ALIGN 32 +mul_mont_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_mont_384x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD PTR[40+rsp] + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,328 + +$L$SEH_body_mul_mont_384x:: + + + mov rbx,rdx + mov QWORD PTR[32+rsp],rdi + mov QWORD PTR[24+rsp],rsi + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[8+rsp],rcx + mov QWORD PTR[rsp],r8 + + + + + lea rdi,QWORD PTR[40+rsp] + call __mulq_384 + + + lea rbx,QWORD PTR[48+rbx] + lea rsi,QWORD PTR[48+rsi] + lea rdi,QWORD PTR[((40+96))+rsp] + call __mulq_384 + + + mov rcx,QWORD PTR[8+rsp] + lea rdx,QWORD PTR[((-48))+rsi] + lea rdi,QWORD PTR[((40+192+48))+rsp] + call __add_mod_384 + + mov rsi,QWORD PTR[16+rsp] + lea rdx,QWORD PTR[48+rsi] + lea rdi,QWORD PTR[((-48))+rdi] + call __add_mod_384 + + lea rbx,QWORD PTR[rdi] + lea rsi,QWORD PTR[48+rdi] + call __mulq_384 + + + lea rsi,QWORD PTR[rdi] + lea rdx,QWORD PTR[40+rsp] + mov rcx,QWORD PTR[8+rsp] + call __sub_mod_384x384 + + lea rsi,QWORD PTR[rdi] + lea rdx,QWORD PTR[((-96))+rdi] + call __sub_mod_384x384 + + + lea rsi,QWORD PTR[40+rsp] + lea rdx,QWORD PTR[((40+96))+rsp] + lea rdi,QWORD PTR[40+rsp] + call __sub_mod_384x384 + + mov rbx,rcx + + + lea rsi,QWORD PTR[40+rsp] + mov rcx,QWORD PTR[rsp] + mov rdi,QWORD PTR[32+rsp] + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + + lea rsi,QWORD PTR[((40+192))+rsp] + mov rcx,QWORD PTR[rsp] + lea rdi,QWORD PTR[48+rdi] + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + lea r8,QWORD PTR[328+rsp] + mov r15,QWORD PTR[r8] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_mul_mont_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + 
+$L$SEH_end_mul_mont_384x:: +mul_mont_384x ENDP +PUBLIC sqr_mont_384x + + +ALIGN 32 +sqr_mont_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqr_mont_384x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,136 + +$L$SEH_body_sqr_mont_384x:: + + + mov QWORD PTR[rsp],rcx + mov rcx,rdx + mov QWORD PTR[16+rsp],rsi +DB 102,72,15,110,199 + + + lea rdx,QWORD PTR[48+rsi] + lea rdi,QWORD PTR[32+rsp] + call __add_mod_384 + + + mov rsi,QWORD PTR[16+rsp] + lea rdx,QWORD PTR[48+rsi] + lea rdi,QWORD PTR[((32+48))+rsp] + call __sub_mod_384 + + + mov rsi,QWORD PTR[16+rsp] + lea rbx,QWORD PTR[48+rsi] + + mov rax,QWORD PTR[48+rsi] + mov r14,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov r12,QWORD PTR[16+rsi] + mov r13,QWORD PTR[24+rsi] + + call __mulq_mont_384 + add r14,r14 + adc r15,r15 + adc r8,r8 + mov r12,r14 + adc r9,r9 + mov r13,r15 + adc r10,r10 + mov rax,r8 + adc r11,r11 + mov rbx,r9 + sbb rdx,rdx + + sub r14,QWORD PTR[rcx] + sbb r15,QWORD PTR[8+rcx] + mov rbp,r10 + sbb r8,QWORD PTR[16+rcx] + sbb r9,QWORD PTR[24+rcx] + sbb r10,QWORD PTR[32+rcx] + mov rsi,r11 + sbb r11,QWORD PTR[40+rcx] + sbb rdx,0 + + cmovc r14,r12 + cmovc r15,r13 + cmovc r8,rax + mov QWORD PTR[48+rdi],r14 + cmovc r9,rbx + mov QWORD PTR[56+rdi],r15 + cmovc r10,rbp + mov QWORD PTR[64+rdi],r8 + cmovc r11,rsi + mov QWORD PTR[72+rdi],r9 + mov QWORD PTR[80+rdi],r10 + mov QWORD PTR[88+rdi],r11 + + lea rsi,QWORD PTR[32+rsp] + lea rbx,QWORD PTR[((32+48))+rsp] + + mov rax,QWORD PTR[((32+48))+rsp] + mov r14,QWORD PTR[((32+0))+rsp] + mov r15,QWORD PTR[((32+8))+rsp] + mov r12,QWORD PTR[((32+16))+rsp] + mov r13,QWORD PTR[((32+24))+rsp] + + call __mulq_mont_384 + + lea r8,QWORD PTR[136+rsp] + mov r15,QWORD PTR[r8] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_sqr_mont_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqr_mont_384x:: +sqr_mont_384x ENDP + +PUBLIC mul_382x + + +ALIGN 32 +mul_382x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_382x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,136 + +$L$SEH_body_mul_382x:: + + + lea rdi,QWORD PTR[96+rdi] + mov QWORD PTR[rsp],rsi + mov QWORD PTR[8+rsp],rdx + mov QWORD PTR[16+rsp],rdi + mov QWORD PTR[24+rsp],rcx + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + add r8,QWORD PTR[48+rsi] + adc r9,QWORD PTR[56+rsi] + adc r10,QWORD PTR[64+rsi] + adc r11,QWORD PTR[72+rsi] + adc r12,QWORD PTR[80+rsi] + adc r13,QWORD PTR[88+rsi] + + mov QWORD PTR[((32+0))+rsp],r8 + mov QWORD PTR[((32+8))+rsp],r9 + mov QWORD PTR[((32+16))+rsp],r10 + mov QWORD PTR[((32+24))+rsp],r11 + mov QWORD PTR[((32+32))+rsp],r12 + mov QWORD PTR[((32+40))+rsp],r13 + + + mov r8,QWORD PTR[rdx] + mov r9,QWORD PTR[8+rdx] + mov r10,QWORD PTR[16+rdx] + mov r11,QWORD PTR[24+rdx] + mov r12,QWORD PTR[32+rdx] + mov r13,QWORD PTR[40+rdx] + + add r8,QWORD PTR[48+rdx] + adc r9,QWORD PTR[56+rdx] + adc r10,QWORD PTR[64+rdx] + adc r11,QWORD PTR[72+rdx] + 
adc r12,QWORD PTR[80+rdx] + adc r13,QWORD PTR[88+rdx] + + mov QWORD PTR[((32+48))+rsp],r8 + mov QWORD PTR[((32+56))+rsp],r9 + mov QWORD PTR[((32+64))+rsp],r10 + mov QWORD PTR[((32+72))+rsp],r11 + mov QWORD PTR[((32+80))+rsp],r12 + mov QWORD PTR[((32+88))+rsp],r13 + + + lea rsi,QWORD PTR[((32+0))+rsp] + lea rbx,QWORD PTR[((32+48))+rsp] + call __mulq_384 + + + mov rsi,QWORD PTR[rsp] + mov rbx,QWORD PTR[8+rsp] + lea rdi,QWORD PTR[((-96))+rdi] + call __mulq_384 + + + lea rsi,QWORD PTR[48+rsi] + lea rbx,QWORD PTR[48+rbx] + lea rdi,QWORD PTR[32+rsp] + call __mulq_384 + + + mov rsi,QWORD PTR[16+rsp] + lea rdx,QWORD PTR[32+rsp] + mov rcx,QWORD PTR[24+rsp] + mov rdi,rsi + call __sub_mod_384x384 + + + lea rsi,QWORD PTR[rdi] + lea rdx,QWORD PTR[((-96))+rdi] + call __sub_mod_384x384 + + + lea rsi,QWORD PTR[((-96))+rdi] + lea rdx,QWORD PTR[32+rsp] + lea rdi,QWORD PTR[((-96))+rdi] + call __sub_mod_384x384 + + lea r8,QWORD PTR[136+rsp] + mov r15,QWORD PTR[r8] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_mul_382x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_382x:: +mul_382x ENDP +PUBLIC sqr_382x + + +ALIGN 32 +sqr_382x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqr_382x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rsi + +$L$SEH_body_sqr_382x:: + + + mov rcx,rdx + + + mov r14,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rax,QWORD PTR[16+rsi] + mov rbx,QWORD PTR[24+rsi] + mov rbp,QWORD PTR[32+rsi] + mov rdx,QWORD PTR[40+rsi] + + mov r8,r14 + add r14,QWORD PTR[48+rsi] + mov r9,r15 + adc r15,QWORD PTR[56+rsi] + mov r10,rax + adc rax,QWORD PTR[64+rsi] + mov r11,rbx + adc rbx,QWORD PTR[72+rsi] + mov r12,rbp + adc rbp,QWORD PTR[80+rsi] + mov r13,rdx + adc rdx,QWORD PTR[88+rsi] + + mov QWORD PTR[rdi],r14 + mov QWORD PTR[8+rdi],r15 + mov QWORD PTR[16+rdi],rax + mov QWORD PTR[24+rdi],rbx + mov QWORD PTR[32+rdi],rbp + mov QWORD PTR[40+rdi],rdx + + + lea rdx,QWORD PTR[48+rsi] + lea rdi,QWORD PTR[48+rdi] + call __sub_mod_384_a_is_loaded + + + lea rsi,QWORD PTR[rdi] + lea rbx,QWORD PTR[((-48))+rdi] + lea rdi,QWORD PTR[((-48))+rdi] + call __mulq_384 + + + mov rsi,QWORD PTR[rsp] + lea rbx,QWORD PTR[48+rsi] + lea rdi,QWORD PTR[96+rdi] + call __mulq_384 + + mov r8,QWORD PTR[rdi] + mov r9,QWORD PTR[8+rdi] + mov r10,QWORD PTR[16+rdi] + mov r11,QWORD PTR[24+rdi] + mov r12,QWORD PTR[32+rdi] + mov r13,QWORD PTR[40+rdi] + mov r14,QWORD PTR[48+rdi] + mov r15,QWORD PTR[56+rdi] + mov rax,QWORD PTR[64+rdi] + mov rbx,QWORD PTR[72+rdi] + mov rbp,QWORD PTR[80+rdi] + add r8,r8 + mov rdx,QWORD PTR[88+rdi] + adc r9,r9 + mov QWORD PTR[rdi],r8 + adc r10,r10 + mov QWORD PTR[8+rdi],r9 + adc r11,r11 + mov QWORD PTR[16+rdi],r10 + adc r12,r12 + mov QWORD PTR[24+rdi],r11 + adc r13,r13 + mov QWORD PTR[32+rdi],r12 + adc r14,r14 + mov QWORD PTR[40+rdi],r13 + adc r15,r15 + mov QWORD PTR[48+rdi],r14 + adc rax,rax + mov QWORD PTR[56+rdi],r15 + adc rbx,rbx + mov QWORD PTR[64+rdi],rax + adc rbp,rbp + mov QWORD PTR[72+rdi],rbx + adc rdx,rdx + mov QWORD PTR[80+rdi],rbp + mov QWORD PTR[88+rdi],rdx + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD 
PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sqr_382x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqr_382x:: +sqr_382x ENDP +PUBLIC mul_384 + + +ALIGN 32 +mul_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + +$L$SEH_body_mul_384:: + + + mov rbx,rdx + call __mulq_384 + + mov r12,QWORD PTR[rsp] + + mov rbx,QWORD PTR[8+rsp] + + mov rbp,QWORD PTR[16+rsp] + + lea rsp,QWORD PTR[24+rsp] + +$L$SEH_epilogue_mul_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_384:: +mul_384 ENDP + + +ALIGN 32 +__mulq_384 PROC PRIVATE + DB 243,15,30,250 + mov rax,QWORD PTR[rbx] + + mov rbp,rax + mul QWORD PTR[rsi] + mov QWORD PTR[rdi],rax + mov rax,rbp + mov rcx,rdx + + mul QWORD PTR[8+rsi] + add rcx,rax + mov rax,rbp + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[16+rsi] + add r8,rax + mov rax,rbp + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[24+rsi] + add r9,rax + mov rax,rbp + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[32+rsi] + add r10,rax + mov rax,rbp + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[40+rsi] + add r11,rax + mov rax,QWORD PTR[8+rbx] + adc rdx,0 + mov r12,rdx + mov rbp,rax + mul QWORD PTR[rsi] + add rcx,rax + mov rax,rbp + adc rdx,0 + mov QWORD PTR[8+rdi],rcx + mov rcx,rdx + + mul QWORD PTR[8+rsi] + add r8,rax + mov rax,rbp + adc rdx,0 + add rcx,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[16+rsi] + add r9,rax + mov rax,rbp + adc rdx,0 + add r8,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[24+rsi] + add r10,rax + mov rax,rbp + adc rdx,0 + add r9,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[32+rsi] + add r11,rax + mov rax,rbp + adc rdx,0 + add r10,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[40+rsi] + add r12,rax + mov rax,QWORD PTR[16+rbx] + adc rdx,0 + add r11,r12 + adc rdx,0 + mov r12,rdx + mov rbp,rax + mul QWORD PTR[rsi] + add rcx,rax + mov rax,rbp + adc rdx,0 + mov QWORD PTR[16+rdi],rcx + mov rcx,rdx + + mul QWORD PTR[8+rsi] + add r8,rax + mov rax,rbp + adc rdx,0 + add rcx,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[16+rsi] + add r9,rax + mov rax,rbp + adc rdx,0 + add r8,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[24+rsi] + add r10,rax + mov rax,rbp + adc rdx,0 + add r9,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[32+rsi] + add r11,rax + mov rax,rbp + adc rdx,0 + add r10,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[40+rsi] + add r12,rax + mov rax,QWORD PTR[24+rbx] + adc rdx,0 + add r11,r12 + adc rdx,0 + mov r12,rdx + mov rbp,rax + mul QWORD PTR[rsi] + add rcx,rax + mov rax,rbp + adc rdx,0 + mov QWORD PTR[24+rdi],rcx + mov rcx,rdx + + mul QWORD PTR[8+rsi] + add r8,rax + mov rax,rbp + adc rdx,0 + add rcx,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[16+rsi] + add r9,rax + mov rax,rbp + adc rdx,0 + add r8,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[24+rsi] + add r10,rax + mov rax,rbp + adc rdx,0 + add r9,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[32+rsi] + add r11,rax + mov rax,rbp + adc rdx,0 + add r10,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[40+rsi] + add r12,rax + mov rax,QWORD PTR[32+rbx] + adc rdx,0 + add r11,r12 + adc rdx,0 + mov r12,rdx + mov rbp,rax + mul QWORD PTR[rsi] + add rcx,rax + mov rax,rbp + adc rdx,0 + mov QWORD PTR[32+rdi],rcx + mov rcx,rdx + + mul QWORD PTR[8+rsi] + add r8,rax + mov rax,rbp + adc rdx,0 + add rcx,r8 + adc rdx,0 + 
mov r8,rdx + + mul QWORD PTR[16+rsi] + add r9,rax + mov rax,rbp + adc rdx,0 + add r8,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[24+rsi] + add r10,rax + mov rax,rbp + adc rdx,0 + add r9,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[32+rsi] + add r11,rax + mov rax,rbp + adc rdx,0 + add r10,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[40+rsi] + add r12,rax + mov rax,QWORD PTR[40+rbx] + adc rdx,0 + add r11,r12 + adc rdx,0 + mov r12,rdx + mov rbp,rax + mul QWORD PTR[rsi] + add rcx,rax + mov rax,rbp + adc rdx,0 + mov QWORD PTR[40+rdi],rcx + mov rcx,rdx + + mul QWORD PTR[8+rsi] + add r8,rax + mov rax,rbp + adc rdx,0 + add rcx,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[16+rsi] + add r9,rax + mov rax,rbp + adc rdx,0 + add r8,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[24+rsi] + add r10,rax + mov rax,rbp + adc rdx,0 + add r9,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[32+rsi] + add r11,rax + mov rax,rbp + adc rdx,0 + add r10,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[40+rsi] + add r12,rax + mov rax,rax + adc rdx,0 + add r11,r12 + adc rdx,0 + mov r12,rdx + mov QWORD PTR[48+rdi],rcx + mov QWORD PTR[56+rdi],r8 + mov QWORD PTR[64+rdi],r9 + mov QWORD PTR[72+rdi],r10 + mov QWORD PTR[80+rdi],r11 + mov QWORD PTR[88+rdi],r12 + + DB 0F3h,0C3h ;repret +__mulq_384 ENDP +PUBLIC sqr_384 + + +ALIGN 32 +sqr_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqr_384:: + mov rdi,rcx + mov rsi,rdx + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_sqr_384:: + + + call __sqrq_384 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sqr_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqr_384:: +sqr_384 ENDP + + +ALIGN 32 +__sqrq_384 PROC PRIVATE + DB 243,15,30,250 + mov rax,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rcx,QWORD PTR[16+rsi] + mov rbx,QWORD PTR[24+rsi] + + + mov r14,rax + mul r15 + mov r9,rax + mov rax,r14 + mov rbp,QWORD PTR[32+rsi] + mov r10,rdx + + mul rcx + add r10,rax + mov rax,r14 + adc rdx,0 + mov rsi,QWORD PTR[40+rsi] + mov r11,rdx + + mul rbx + add r11,rax + mov rax,r14 + adc rdx,0 + mov r12,rdx + + mul rbp + add r12,rax + mov rax,r14 + adc rdx,0 + mov r13,rdx + + mul rsi + add r13,rax + mov rax,r14 + adc rdx,0 + mov r14,rdx + + mul rax + xor r8,r8 + mov QWORD PTR[rdi],rax + mov rax,r15 + add r9,r9 + adc r8,0 + add r9,rdx + adc r8,0 + mov QWORD PTR[8+rdi],r9 + + mul rcx + add r11,rax + mov rax,r15 + adc rdx,0 + mov r9,rdx + + mul rbx + add r12,rax + mov rax,r15 + adc rdx,0 + add r12,r9 + adc rdx,0 + mov r9,rdx + + mul rbp + add r13,rax + mov rax,r15 + adc rdx,0 + add r13,r9 + adc rdx,0 + mov r9,rdx + + mul rsi + add r14,rax + mov rax,r15 + adc rdx,0 + add r14,r9 + adc rdx,0 + mov r15,rdx + + mul rax + xor r9,r9 + add r8,rax + mov rax,rcx + add r10,r10 + adc r11,r11 + adc r9,0 + add r10,r8 + adc r11,rdx + adc r9,0 + mov QWORD PTR[16+rdi],r10 + + mul rbx + add r13,rax + mov rax,rcx + adc rdx,0 + mov QWORD PTR[24+rdi],r11 + mov r8,rdx + + mul rbp + add r14,rax + mov rax,rcx + adc rdx,0 + add r14,r8 + adc rdx,0 + mov r8,rdx + + mul rsi + add r15,rax + mov rax,rcx + adc rdx,0 + add r15,r8 + adc rdx,0 + mov rcx,rdx + + mul rax + xor r11,r11 + add r9,rax + mov rax,rbx + add r12,r12 + adc r13,r13 + 
adc r11,0 + add r12,r9 + adc r13,rdx + adc r11,0 + mov QWORD PTR[32+rdi],r12 + + + mul rbp + add r15,rax + mov rax,rbx + adc rdx,0 + mov QWORD PTR[40+rdi],r13 + mov r8,rdx + + mul rsi + add rcx,rax + mov rax,rbx + adc rdx,0 + add rcx,r8 + adc rdx,0 + mov rbx,rdx + + mul rax + xor r12,r12 + add r11,rax + mov rax,rbp + add r14,r14 + adc r15,r15 + adc r12,0 + add r14,r11 + adc r15,rdx + mov QWORD PTR[48+rdi],r14 + adc r12,0 + mov QWORD PTR[56+rdi],r15 + + + mul rsi + add rbx,rax + mov rax,rbp + adc rdx,0 + mov rbp,rdx + + mul rax + xor r13,r13 + add r12,rax + mov rax,rsi + add rcx,rcx + adc rbx,rbx + adc r13,0 + add rcx,r12 + adc rbx,rdx + mov QWORD PTR[64+rdi],rcx + adc r13,0 + mov QWORD PTR[72+rdi],rbx + + + mul rax + add rax,r13 + add rbp,rbp + adc rdx,0 + add rax,rbp + adc rdx,0 + mov QWORD PTR[80+rdi],rax + mov QWORD PTR[88+rdi],rdx + + DB 0F3h,0C3h ;repret +__sqrq_384 ENDP + +PUBLIC sqr_mont_384 + + +ALIGN 32 +sqr_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqr_mont_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8*15 + +$L$SEH_body_sqr_mont_384:: + + + mov QWORD PTR[96+rsp],rcx + mov QWORD PTR[104+rsp],rdx + mov QWORD PTR[112+rsp],rdi + + mov rdi,rsp + call __sqrq_384 + + lea rsi,QWORD PTR[rsp] + mov rcx,QWORD PTR[96+rsp] + mov rbx,QWORD PTR[104+rsp] + mov rdi,QWORD PTR[112+rsp] + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + lea r8,QWORD PTR[120+rsp] + mov r15,QWORD PTR[120+rsp] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_sqr_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqr_mont_384:: +sqr_mont_384 ENDP + + + +PUBLIC redc_mont_384 + + +ALIGN 32 +redc_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_redc_mont_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_redc_mont_384:: + + + mov rbx,rdx + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_redc_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_redc_mont_384:: +redc_mont_384 ENDP + + + + +PUBLIC from_mont_384 + + +ALIGN 32 +from_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_from_mont_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_from_mont_384:: + + + mov rbx,rdx + call __mulq_by_1_mont_384 + + + + + + mov rcx,r15 + mov rdx,r8 + mov rbp,r9 + + sub r14,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + mov r13,r10 + sbb r8,QWORD PTR[16+rbx] + sbb r9,QWORD PTR[24+rbx] + sbb r10,QWORD PTR[32+rbx] + mov rsi,r11 + sbb r11,QWORD PTR[40+rbx] + + cmovc r14,rax + cmovc r15,rcx + cmovc r8,rdx + mov QWORD 
PTR[rdi],r14 + cmovc r9,rbp + mov QWORD PTR[8+rdi],r15 + cmovc r10,r13 + mov QWORD PTR[16+rdi],r8 + cmovc r11,rsi + mov QWORD PTR[24+rdi],r9 + mov QWORD PTR[32+rdi],r10 + mov QWORD PTR[40+rdi],r11 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_from_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_from_mont_384:: +from_mont_384 ENDP + +ALIGN 32 +__mulq_by_1_mont_384 PROC PRIVATE + DB 243,15,30,250 + mov rax,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov r14,rax + imul rax,rcx + mov r8,rax + + mul QWORD PTR[rbx] + add r14,rax + mov rax,r8 + adc r14,rdx + + mul QWORD PTR[8+rbx] + add r9,rax + mov rax,r8 + adc rdx,0 + add r9,r14 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[16+rbx] + add r10,rax + mov rax,r8 + adc rdx,0 + add r10,r14 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[24+rbx] + add r11,rax + mov rax,r8 + adc rdx,0 + mov r15,r9 + imul r9,rcx + add r11,r14 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[32+rbx] + add r12,rax + mov rax,r8 + adc rdx,0 + add r12,r14 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[40+rbx] + add r13,rax + mov rax,r9 + adc rdx,0 + add r13,r14 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[rbx] + add r15,rax + mov rax,r9 + adc r15,rdx + + mul QWORD PTR[8+rbx] + add r10,rax + mov rax,r9 + adc rdx,0 + add r10,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[16+rbx] + add r11,rax + mov rax,r9 + adc rdx,0 + add r11,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[24+rbx] + add r12,rax + mov rax,r9 + adc rdx,0 + mov r8,r10 + imul r10,rcx + add r12,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[32+rbx] + add r13,rax + mov rax,r9 + adc rdx,0 + add r13,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[40+rbx] + add r14,rax + mov rax,r10 + adc rdx,0 + add r14,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[rbx] + add r8,rax + mov rax,r10 + adc r8,rdx + + mul QWORD PTR[8+rbx] + add r11,rax + mov rax,r10 + adc rdx,0 + add r11,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[16+rbx] + add r12,rax + mov rax,r10 + adc rdx,0 + add r12,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[24+rbx] + add r13,rax + mov rax,r10 + adc rdx,0 + mov r9,r11 + imul r11,rcx + add r13,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[32+rbx] + add r14,rax + mov rax,r10 + adc rdx,0 + add r14,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[40+rbx] + add r15,rax + mov rax,r11 + adc rdx,0 + add r15,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[rbx] + add r9,rax + mov rax,r11 + adc r9,rdx + + mul QWORD PTR[8+rbx] + add r12,rax + mov rax,r11 + adc rdx,0 + add r12,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[16+rbx] + add r13,rax + mov rax,r11 + adc rdx,0 + add r13,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[24+rbx] + add r14,rax + mov rax,r11 + adc rdx,0 + mov r10,r12 + imul r12,rcx + add r14,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[32+rbx] + add r15,rax + mov rax,r11 + adc rdx,0 + add r15,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[40+rbx] + add r8,rax + mov rax,r12 + adc rdx,0 + add r8,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[rbx] + add r10,rax + mov rax,r12 + adc r10,rdx + + mul QWORD PTR[8+rbx] + add r13,rax + mov rax,r12 + adc rdx,0 + add r13,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[16+rbx] + add r14,rax + mov rax,r12 + adc rdx,0 + add 
r14,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[24+rbx] + add r15,rax + mov rax,r12 + adc rdx,0 + mov r11,r13 + imul r13,rcx + add r15,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[32+rbx] + add r8,rax + mov rax,r12 + adc rdx,0 + add r8,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[40+rbx] + add r9,rax + mov rax,r13 + adc rdx,0 + add r9,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[rbx] + add r11,rax + mov rax,r13 + adc r11,rdx + + mul QWORD PTR[8+rbx] + add r14,rax + mov rax,r13 + adc rdx,0 + add r14,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[16+rbx] + add r15,rax + mov rax,r13 + adc rdx,0 + add r15,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[24+rbx] + add r8,rax + mov rax,r13 + adc rdx,0 + add r8,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[32+rbx] + add r9,rax + mov rax,r13 + adc rdx,0 + add r9,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[40+rbx] + add r10,rax + mov rax,r14 + adc rdx,0 + add r10,r11 + adc rdx,0 + mov r11,rdx + DB 0F3h,0C3h ;repret +__mulq_by_1_mont_384 ENDP + + +ALIGN 32 +__redc_tail_mont_384 PROC PRIVATE + DB 243,15,30,250 + add r14,QWORD PTR[48+rsi] + mov rax,r14 + adc r15,QWORD PTR[56+rsi] + adc r8,QWORD PTR[64+rsi] + adc r9,QWORD PTR[72+rsi] + mov rcx,r15 + adc r10,QWORD PTR[80+rsi] + adc r11,QWORD PTR[88+rsi] + sbb r12,r12 + + + + + mov rdx,r8 + mov rbp,r9 + + sub r14,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + mov r13,r10 + sbb r8,QWORD PTR[16+rbx] + sbb r9,QWORD PTR[24+rbx] + sbb r10,QWORD PTR[32+rbx] + mov rsi,r11 + sbb r11,QWORD PTR[40+rbx] + sbb r12,0 + + cmovc r14,rax + cmovc r15,rcx + cmovc r8,rdx + mov QWORD PTR[rdi],r14 + cmovc r9,rbp + mov QWORD PTR[8+rdi],r15 + cmovc r10,r13 + mov QWORD PTR[16+rdi],r8 + cmovc r11,rsi + mov QWORD PTR[24+rdi],r9 + mov QWORD PTR[32+rdi],r10 + mov QWORD PTR[40+rdi],r11 + + DB 0F3h,0C3h ;repret +__redc_tail_mont_384 ENDP + +PUBLIC sgn0_pty_mont_384 + + +ALIGN 32 +sgn0_pty_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sgn0_pty_mont_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_sgn0_pty_mont_384:: + + + mov rbx,rsi + lea rsi,QWORD PTR[rdi] + mov rcx,rdx + call __mulq_by_1_mont_384 + + xor rax,rax + mov r13,r14 + add r14,r14 + adc r15,r15 + adc r8,r8 + adc r9,r9 + adc r10,r10 + adc r11,r11 + adc rax,0 + + sub r14,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + sbb r8,QWORD PTR[16+rbx] + sbb r9,QWORD PTR[24+rbx] + sbb r10,QWORD PTR[32+rbx] + sbb r11,QWORD PTR[40+rbx] + sbb rax,0 + + not rax + and r13,1 + and rax,2 + or rax,r13 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sgn0_pty_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sgn0_pty_mont_384:: +sgn0_pty_mont_384 ENDP + +PUBLIC sgn0_pty_mont_384x + + +ALIGN 32 +sgn0_pty_mont_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sgn0_pty_mont_384x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_sgn0_pty_mont_384x:: + + + mov rbx,rsi + lea rsi,QWORD PTR[48+rdi] + mov rcx,rdx + call __mulq_by_1_mont_384 + + mov r12,r14 + or 
r14,r15 + or r14,r8 + or r14,r9 + or r14,r10 + or r14,r11 + + lea rsi,QWORD PTR[rdi] + xor rdi,rdi + mov r13,r12 + add r12,r12 + adc r15,r15 + adc r8,r8 + adc r9,r9 + adc r10,r10 + adc r11,r11 + adc rdi,0 + + sub r12,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + sbb r8,QWORD PTR[16+rbx] + sbb r9,QWORD PTR[24+rbx] + sbb r10,QWORD PTR[32+rbx] + sbb r11,QWORD PTR[40+rbx] + sbb rdi,0 + + mov QWORD PTR[rsp],r14 + not rdi + and r13,1 + and rdi,2 + or rdi,r13 + + call __mulq_by_1_mont_384 + + mov r12,r14 + or r14,r15 + or r14,r8 + or r14,r9 + or r14,r10 + or r14,r11 + + xor rax,rax + mov r13,r12 + add r12,r12 + adc r15,r15 + adc r8,r8 + adc r9,r9 + adc r10,r10 + adc r11,r11 + adc rax,0 + + sub r12,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + sbb r8,QWORD PTR[16+rbx] + sbb r9,QWORD PTR[24+rbx] + sbb r10,QWORD PTR[32+rbx] + sbb r11,QWORD PTR[40+rbx] + sbb rax,0 + + mov r12,QWORD PTR[rsp] + + not rax + + test r14,r14 + cmovz r13,rdi + + test r12,r12 + cmovnz rax,rdi + + and r13,1 + and rax,2 + or rax,r13 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sgn0_pty_mont_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sgn0_pty_mont_384x:: +sgn0_pty_mont_384x ENDP +PUBLIC mul_mont_384 + + +ALIGN 32 +mul_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_mont_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD PTR[40+rsp] + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push r8 + +$L$SEH_body_mul_mont_384:: + + + mov rax,QWORD PTR[rdx] + mov r14,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov r12,QWORD PTR[16+rsi] + mov r13,QWORD PTR[24+rsi] + mov rbx,rdx +DB 102,72,15,110,199 + + call __mulq_mont_384 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_mul_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_mont_384:: +mul_mont_384 ENDP + +ALIGN 32 +__mulq_mont_384 PROC PRIVATE + DB 243,15,30,250 + mov rdi,rax + mul r14 + mov r8,rax + mov rax,rdi + mov r9,rdx + + mul r15 + add r9,rax + mov rax,rdi + adc rdx,0 + mov r10,rdx + + mul r12 + add r10,rax + mov rax,rdi + adc rdx,0 + mov r11,rdx + + mov rbp,r8 + imul r8,QWORD PTR[8+rsp] + + mul r13 + add r11,rax + mov rax,rdi + adc rdx,0 + mov r12,rdx + + mul QWORD PTR[32+rsi] + add r12,rax + mov rax,rdi + adc rdx,0 + mov r13,rdx + + mul QWORD PTR[40+rsi] + add r13,rax + mov rax,r8 + adc rdx,0 + xor r15,r15 + mov r14,rdx + + mul QWORD PTR[rcx] + add rbp,rax + mov rax,r8 + adc rbp,rdx + + mul QWORD PTR[8+rcx] + add r9,rax + mov rax,r8 + adc rdx,0 + add r9,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rcx] + add r10,rax + mov rax,r8 + adc rdx,0 + add r10,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rcx] + add r11,rbp + adc rdx,0 + add r11,rax + mov rax,r8 + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[32+rcx] + add r12,rax + mov rax,r8 + adc rdx,0 + add r12,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[40+rcx] + add r13,rax + mov rax,QWORD PTR[8+rbx] + adc rdx,0 + add r13,rbp + adc r14,rdx + adc r15,0 + + mov rdi,rax + mul 
QWORD PTR[rsi] + add r9,rax + mov rax,rdi + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[8+rsi] + add r10,rax + mov rax,rdi + adc rdx,0 + add r10,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[16+rsi] + add r11,rax + mov rax,rdi + adc rdx,0 + add r11,r8 + adc rdx,0 + mov r8,rdx + + mov rbp,r9 + imul r9,QWORD PTR[8+rsp] + + mul QWORD PTR[24+rsi] + add r12,rax + mov rax,rdi + adc rdx,0 + add r12,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[32+rsi] + add r13,rax + mov rax,rdi + adc rdx,0 + add r13,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[40+rsi] + add r14,r8 + adc rdx,0 + xor r8,r8 + add r14,rax + mov rax,r9 + adc r15,rdx + adc r8,0 + + mul QWORD PTR[rcx] + add rbp,rax + mov rax,r9 + adc rbp,rdx + + mul QWORD PTR[8+rcx] + add r10,rax + mov rax,r9 + adc rdx,0 + add r10,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rcx] + add r11,rax + mov rax,r9 + adc rdx,0 + add r11,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rcx] + add r12,rbp + adc rdx,0 + add r12,rax + mov rax,r9 + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[32+rcx] + add r13,rax + mov rax,r9 + adc rdx,0 + add r13,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[40+rcx] + add r14,rax + mov rax,QWORD PTR[16+rbx] + adc rdx,0 + add r14,rbp + adc r15,rdx + adc r8,0 + + mov rdi,rax + mul QWORD PTR[rsi] + add r10,rax + mov rax,rdi + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[8+rsi] + add r11,rax + mov rax,rdi + adc rdx,0 + add r11,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[16+rsi] + add r12,rax + mov rax,rdi + adc rdx,0 + add r12,r9 + adc rdx,0 + mov r9,rdx + + mov rbp,r10 + imul r10,QWORD PTR[8+rsp] + + mul QWORD PTR[24+rsi] + add r13,rax + mov rax,rdi + adc rdx,0 + add r13,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[32+rsi] + add r14,rax + mov rax,rdi + adc rdx,0 + add r14,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[40+rsi] + add r15,r9 + adc rdx,0 + xor r9,r9 + add r15,rax + mov rax,r10 + adc r8,rdx + adc r9,0 + + mul QWORD PTR[rcx] + add rbp,rax + mov rax,r10 + adc rbp,rdx + + mul QWORD PTR[8+rcx] + add r11,rax + mov rax,r10 + adc rdx,0 + add r11,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rcx] + add r12,rax + mov rax,r10 + adc rdx,0 + add r12,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rcx] + add r13,rbp + adc rdx,0 + add r13,rax + mov rax,r10 + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[32+rcx] + add r14,rax + mov rax,r10 + adc rdx,0 + add r14,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[40+rcx] + add r15,rax + mov rax,QWORD PTR[24+rbx] + adc rdx,0 + add r15,rbp + adc r8,rdx + adc r9,0 + + mov rdi,rax + mul QWORD PTR[rsi] + add r11,rax + mov rax,rdi + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[8+rsi] + add r12,rax + mov rax,rdi + adc rdx,0 + add r12,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[16+rsi] + add r13,rax + mov rax,rdi + adc rdx,0 + add r13,r10 + adc rdx,0 + mov r10,rdx + + mov rbp,r11 + imul r11,QWORD PTR[8+rsp] + + mul QWORD PTR[24+rsi] + add r14,rax + mov rax,rdi + adc rdx,0 + add r14,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[32+rsi] + add r15,rax + mov rax,rdi + adc rdx,0 + add r15,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[40+rsi] + add r8,r10 + adc rdx,0 + xor r10,r10 + add r8,rax + mov rax,r11 + adc r9,rdx + adc r10,0 + + mul QWORD PTR[rcx] + add rbp,rax + mov rax,r11 + adc rbp,rdx + + mul QWORD PTR[8+rcx] + add r12,rax + mov rax,r11 + adc rdx,0 + add r12,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rcx] + add r13,rax + mov rax,r11 + adc rdx,0 + add r13,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rcx] + add r14,rbp + adc rdx,0 + add r14,rax + mov rax,r11 + adc rdx,0 
+ mov rbp,rdx + + mul QWORD PTR[32+rcx] + add r15,rax + mov rax,r11 + adc rdx,0 + add r15,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[40+rcx] + add r8,rax + mov rax,QWORD PTR[32+rbx] + adc rdx,0 + add r8,rbp + adc r9,rdx + adc r10,0 + + mov rdi,rax + mul QWORD PTR[rsi] + add r12,rax + mov rax,rdi + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[8+rsi] + add r13,rax + mov rax,rdi + adc rdx,0 + add r13,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[16+rsi] + add r14,rax + mov rax,rdi + adc rdx,0 + add r14,r11 + adc rdx,0 + mov r11,rdx + + mov rbp,r12 + imul r12,QWORD PTR[8+rsp] + + mul QWORD PTR[24+rsi] + add r15,rax + mov rax,rdi + adc rdx,0 + add r15,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[32+rsi] + add r8,rax + mov rax,rdi + adc rdx,0 + add r8,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[40+rsi] + add r9,r11 + adc rdx,0 + xor r11,r11 + add r9,rax + mov rax,r12 + adc r10,rdx + adc r11,0 + + mul QWORD PTR[rcx] + add rbp,rax + mov rax,r12 + adc rbp,rdx + + mul QWORD PTR[8+rcx] + add r13,rax + mov rax,r12 + adc rdx,0 + add r13,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rcx] + add r14,rax + mov rax,r12 + adc rdx,0 + add r14,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rcx] + add r15,rbp + adc rdx,0 + add r15,rax + mov rax,r12 + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[32+rcx] + add r8,rax + mov rax,r12 + adc rdx,0 + add r8,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[40+rcx] + add r9,rax + mov rax,QWORD PTR[40+rbx] + adc rdx,0 + add r9,rbp + adc r10,rdx + adc r11,0 + + mov rdi,rax + mul QWORD PTR[rsi] + add r13,rax + mov rax,rdi + adc rdx,0 + mov r12,rdx + + mul QWORD PTR[8+rsi] + add r14,rax + mov rax,rdi + adc rdx,0 + add r14,r12 + adc rdx,0 + mov r12,rdx + + mul QWORD PTR[16+rsi] + add r15,rax + mov rax,rdi + adc rdx,0 + add r15,r12 + adc rdx,0 + mov r12,rdx + + mov rbp,r13 + imul r13,QWORD PTR[8+rsp] + + mul QWORD PTR[24+rsi] + add r8,rax + mov rax,rdi + adc rdx,0 + add r8,r12 + adc rdx,0 + mov r12,rdx + + mul QWORD PTR[32+rsi] + add r9,rax + mov rax,rdi + adc rdx,0 + add r9,r12 + adc rdx,0 + mov r12,rdx + + mul QWORD PTR[40+rsi] + add r10,r12 + adc rdx,0 + xor r12,r12 + add r10,rax + mov rax,r13 + adc r11,rdx + adc r12,0 + + mul QWORD PTR[rcx] + add rbp,rax + mov rax,r13 + adc rbp,rdx + + mul QWORD PTR[8+rcx] + add r14,rax + mov rax,r13 + adc rdx,0 + add r14,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rcx] + add r15,rax + mov rax,r13 + adc rdx,0 + add r15,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rcx] + add r8,rbp + adc rdx,0 + add r8,rax + mov rax,r13 + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[32+rcx] + add r9,rax + mov rax,r13 + adc rdx,0 + add r9,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[40+rcx] + add r10,rax + mov rax,r14 + adc rdx,0 + add r10,rbp + adc r11,rdx + adc r12,0 + + + + +DB 102,72,15,126,199 + sub r14,QWORD PTR[rcx] + mov rdx,r15 + sbb r15,QWORD PTR[8+rcx] + mov rbx,r8 + sbb r8,QWORD PTR[16+rcx] + mov rsi,r9 + sbb r9,QWORD PTR[24+rcx] + mov rbp,r10 + sbb r10,QWORD PTR[32+rcx] + mov r13,r11 + sbb r11,QWORD PTR[40+rcx] + sbb r12,0 + + cmovc r14,rax + cmovc r15,rdx + cmovc r8,rbx + mov QWORD PTR[rdi],r14 + cmovc r9,rsi + mov QWORD PTR[8+rdi],r15 + cmovc r10,rbp + mov QWORD PTR[16+rdi],r8 + cmovc r11,r13 + mov QWORD PTR[24+rdi],r9 + mov QWORD PTR[32+rdi],r10 + mov QWORD PTR[40+rdi],r11 + + DB 0F3h,0C3h ;repret +__mulq_mont_384 ENDP +PUBLIC sqr_n_mul_mont_384 + + +ALIGN 32 +sqr_n_mul_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp 
+$L$SEH_begin_sqr_n_mul_mont_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD PTR[40+rsp] + mov r9,QWORD PTR[48+rsp] + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8*17 + +$L$SEH_body_sqr_n_mul_mont_384:: + + + mov QWORD PTR[rsp],r8 + mov QWORD PTR[8+rsp],rcx +DB 102,72,15,110,199 + lea rdi,QWORD PTR[32+rsp] + mov QWORD PTR[24+rsp],r9 + movq xmm2,QWORD PTR[r9] + +$L$oop_sqr_384:: + movd xmm1,edx + + call __sqrq_384 + + lea rsi,QWORD PTR[rdi] + mov rcx,QWORD PTR[rsp] + mov rbx,QWORD PTR[8+rsp] + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + movd edx,xmm1 + lea rsi,QWORD PTR[rdi] + dec edx + jnz $L$oop_sqr_384 + +DB 102,72,15,126,208 + mov rcx,rbx + mov rbx,QWORD PTR[24+rsp] + + + + + + + mov r12,r8 + mov r13,r9 + + call __mulq_mont_384 + + lea r8,QWORD PTR[136+rsp] + mov r15,QWORD PTR[136+rsp] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_sqr_n_mul_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqr_n_mul_mont_384:: +sqr_n_mul_mont_384 ENDP + +PUBLIC sqr_n_mul_mont_383 + + +ALIGN 32 +sqr_n_mul_mont_383 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqr_n_mul_mont_383:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD PTR[40+rsp] + mov r9,QWORD PTR[48+rsp] + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8*17 + +$L$SEH_body_sqr_n_mul_mont_383:: + + + mov QWORD PTR[rsp],r8 + mov QWORD PTR[8+rsp],rcx +DB 102,72,15,110,199 + lea rdi,QWORD PTR[32+rsp] + mov QWORD PTR[24+rsp],r9 + movq xmm2,QWORD PTR[r9] + +$L$oop_sqr_383:: + movd xmm1,edx + + call __sqrq_384 + + lea rsi,QWORD PTR[rdi] + mov rcx,QWORD PTR[rsp] + mov rbx,QWORD PTR[8+rsp] + call __mulq_by_1_mont_384 + + movd edx,xmm1 + add r14,QWORD PTR[48+rsi] + adc r15,QWORD PTR[56+rsi] + adc r8,QWORD PTR[64+rsi] + adc r9,QWORD PTR[72+rsi] + adc r10,QWORD PTR[80+rsi] + adc r11,QWORD PTR[88+rsi] + lea rsi,QWORD PTR[rdi] + + mov QWORD PTR[rdi],r14 + mov QWORD PTR[8+rdi],r15 + mov QWORD PTR[16+rdi],r8 + mov QWORD PTR[24+rdi],r9 + mov QWORD PTR[32+rdi],r10 + mov QWORD PTR[40+rdi],r11 + + dec edx + jnz $L$oop_sqr_383 + +DB 102,72,15,126,208 + mov rcx,rbx + mov rbx,QWORD PTR[24+rsp] + + + + + + + mov r12,r8 + mov r13,r9 + + call __mulq_mont_384 + + lea r8,QWORD PTR[136+rsp] + mov r15,QWORD PTR[136+rsp] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_sqr_n_mul_mont_383:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqr_n_mul_mont_383:: +sqr_n_mul_mont_383 ENDP + +ALIGN 32 +__mulq_mont_383_nonred PROC PRIVATE + DB 243,15,30,250 + mov rbp,rax + mul r14 + mov r8,rax + mov rax,rbp + mov r9,rdx + + mul r15 + add r9,rax + mov rax,rbp + adc rdx,0 + mov r10,rdx + + mul r12 + add r10,rax + mov rax,rbp + adc rdx,0 + mov r11,rdx + + mov r15,r8 + imul r8,QWORD PTR[8+rsp] + + mul r13 + add r11,rax + mov rax,rbp + adc rdx,0 + mov r12,rdx + + mul QWORD PTR[32+rsi] + add r12,rax + mov rax,rbp + adc rdx,0 + mov r13,rdx + + mul QWORD PTR[40+rsi] + add r13,rax + mov rax,r8 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[rcx] + 
add r15,rax + mov rax,r8 + adc r15,rdx + + mul QWORD PTR[8+rcx] + add r9,rax + mov rax,r8 + adc rdx,0 + add r9,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[16+rcx] + add r10,rax + mov rax,r8 + adc rdx,0 + add r10,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[24+rcx] + add r11,r15 + adc rdx,0 + add r11,rax + mov rax,r8 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[32+rcx] + add r12,rax + mov rax,r8 + adc rdx,0 + add r12,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[40+rcx] + add r13,rax + mov rax,QWORD PTR[8+rbx] + adc rdx,0 + add r13,r15 + adc r14,rdx + + mov rbp,rax + mul QWORD PTR[rsi] + add r9,rax + mov rax,rbp + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[8+rsi] + add r10,rax + mov rax,rbp + adc rdx,0 + add r10,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[16+rsi] + add r11,rax + mov rax,rbp + adc rdx,0 + add r11,r15 + adc rdx,0 + mov r15,rdx + + mov r8,r9 + imul r9,QWORD PTR[8+rsp] + + mul QWORD PTR[24+rsi] + add r12,rax + mov rax,rbp + adc rdx,0 + add r12,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[32+rsi] + add r13,rax + mov rax,rbp + adc rdx,0 + add r13,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[40+rsi] + add r14,r15 + adc rdx,0 + add r14,rax + mov rax,r9 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[rcx] + add r8,rax + mov rax,r9 + adc r8,rdx + + mul QWORD PTR[8+rcx] + add r10,rax + mov rax,r9 + adc rdx,0 + add r10,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[16+rcx] + add r11,rax + mov rax,r9 + adc rdx,0 + add r11,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[24+rcx] + add r12,r8 + adc rdx,0 + add r12,rax + mov rax,r9 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[32+rcx] + add r13,rax + mov rax,r9 + adc rdx,0 + add r13,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[40+rcx] + add r14,rax + mov rax,QWORD PTR[16+rbx] + adc rdx,0 + add r14,r8 + adc r15,rdx + + mov rbp,rax + mul QWORD PTR[rsi] + add r10,rax + mov rax,rbp + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[8+rsi] + add r11,rax + mov rax,rbp + adc rdx,0 + add r11,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[16+rsi] + add r12,rax + mov rax,rbp + adc rdx,0 + add r12,r8 + adc rdx,0 + mov r8,rdx + + mov r9,r10 + imul r10,QWORD PTR[8+rsp] + + mul QWORD PTR[24+rsi] + add r13,rax + mov rax,rbp + adc rdx,0 + add r13,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[32+rsi] + add r14,rax + mov rax,rbp + adc rdx,0 + add r14,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[40+rsi] + add r15,r8 + adc rdx,0 + add r15,rax + mov rax,r10 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[rcx] + add r9,rax + mov rax,r10 + adc r9,rdx + + mul QWORD PTR[8+rcx] + add r11,rax + mov rax,r10 + adc rdx,0 + add r11,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[16+rcx] + add r12,rax + mov rax,r10 + adc rdx,0 + add r12,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[24+rcx] + add r13,r9 + adc rdx,0 + add r13,rax + mov rax,r10 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[32+rcx] + add r14,rax + mov rax,r10 + adc rdx,0 + add r14,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[40+rcx] + add r15,rax + mov rax,QWORD PTR[24+rbx] + adc rdx,0 + add r15,r9 + adc r8,rdx + + mov rbp,rax + mul QWORD PTR[rsi] + add r11,rax + mov rax,rbp + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[8+rsi] + add r12,rax + mov rax,rbp + adc rdx,0 + add r12,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[16+rsi] + add r13,rax + mov rax,rbp + adc rdx,0 + add r13,r9 + adc rdx,0 + mov r9,rdx + + mov r10,r11 + imul r11,QWORD PTR[8+rsp] + + mul QWORD PTR[24+rsi] + add r14,rax + mov rax,rbp + adc rdx,0 + add r14,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[32+rsi] + add r15,rax + mov rax,rbp + adc rdx,0 + add 
r15,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[40+rsi] + add r8,r9 + adc rdx,0 + add r8,rax + mov rax,r11 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[rcx] + add r10,rax + mov rax,r11 + adc r10,rdx + + mul QWORD PTR[8+rcx] + add r12,rax + mov rax,r11 + adc rdx,0 + add r12,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[16+rcx] + add r13,rax + mov rax,r11 + adc rdx,0 + add r13,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[24+rcx] + add r14,r10 + adc rdx,0 + add r14,rax + mov rax,r11 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[32+rcx] + add r15,rax + mov rax,r11 + adc rdx,0 + add r15,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[40+rcx] + add r8,rax + mov rax,QWORD PTR[32+rbx] + adc rdx,0 + add r8,r10 + adc r9,rdx + + mov rbp,rax + mul QWORD PTR[rsi] + add r12,rax + mov rax,rbp + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[8+rsi] + add r13,rax + mov rax,rbp + adc rdx,0 + add r13,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[16+rsi] + add r14,rax + mov rax,rbp + adc rdx,0 + add r14,r10 + adc rdx,0 + mov r10,rdx + + mov r11,r12 + imul r12,QWORD PTR[8+rsp] + + mul QWORD PTR[24+rsi] + add r15,rax + mov rax,rbp + adc rdx,0 + add r15,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[32+rsi] + add r8,rax + mov rax,rbp + adc rdx,0 + add r8,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[40+rsi] + add r9,r10 + adc rdx,0 + add r9,rax + mov rax,r12 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[rcx] + add r11,rax + mov rax,r12 + adc r11,rdx + + mul QWORD PTR[8+rcx] + add r13,rax + mov rax,r12 + adc rdx,0 + add r13,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[16+rcx] + add r14,rax + mov rax,r12 + adc rdx,0 + add r14,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[24+rcx] + add r15,r11 + adc rdx,0 + add r15,rax + mov rax,r12 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[32+rcx] + add r8,rax + mov rax,r12 + adc rdx,0 + add r8,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[40+rcx] + add r9,rax + mov rax,QWORD PTR[40+rbx] + adc rdx,0 + add r9,r11 + adc r10,rdx + + mov rbp,rax + mul QWORD PTR[rsi] + add r13,rax + mov rax,rbp + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[8+rsi] + add r14,rax + mov rax,rbp + adc rdx,0 + add r14,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[16+rsi] + add r15,rax + mov rax,rbp + adc rdx,0 + add r15,r11 + adc rdx,0 + mov r11,rdx + + mov r12,r13 + imul r13,QWORD PTR[8+rsp] + + mul QWORD PTR[24+rsi] + add r8,rax + mov rax,rbp + adc rdx,0 + add r8,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[32+rsi] + add r9,rax + mov rax,rbp + adc rdx,0 + add r9,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[40+rsi] + add r10,r11 + adc rdx,0 + add r10,rax + mov rax,r13 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[rcx] + add r12,rax + mov rax,r13 + adc r12,rdx + + mul QWORD PTR[8+rcx] + add r14,rax + mov rax,r13 + adc rdx,0 + add r14,r12 + adc rdx,0 + mov r12,rdx + + mul QWORD PTR[16+rcx] + add r15,rax + mov rax,r13 + adc rdx,0 + add r15,r12 + adc rdx,0 + mov r12,rdx + + mul QWORD PTR[24+rcx] + add r8,r12 + adc rdx,0 + add r8,rax + mov rax,r13 + adc rdx,0 + mov r12,rdx + + mul QWORD PTR[32+rcx] + add r9,rax + mov rax,r13 + adc rdx,0 + add r9,r12 + adc rdx,0 + mov r12,rdx + + mul QWORD PTR[40+rcx] + add r10,rax + mov rax,r14 + adc rdx,0 + add r10,r12 + adc r11,rdx + DB 0F3h,0C3h ;repret +__mulq_mont_383_nonred ENDP +PUBLIC sqr_mont_382x + + +ALIGN 32 +sqr_mont_382x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqr_mont_382x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + 
push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,136 + +$L$SEH_body_sqr_mont_382x:: + + + mov QWORD PTR[rsp],rcx + mov rcx,rdx + mov QWORD PTR[16+rsp],rsi + mov QWORD PTR[24+rsp],rdi + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov r14,r8 + add r8,QWORD PTR[48+rsi] + mov r15,r9 + adc r9,QWORD PTR[56+rsi] + mov rax,r10 + adc r10,QWORD PTR[64+rsi] + mov rdx,r11 + adc r11,QWORD PTR[72+rsi] + mov rbx,r12 + adc r12,QWORD PTR[80+rsi] + mov rbp,r13 + adc r13,QWORD PTR[88+rsi] + + sub r14,QWORD PTR[48+rsi] + sbb r15,QWORD PTR[56+rsi] + sbb rax,QWORD PTR[64+rsi] + sbb rdx,QWORD PTR[72+rsi] + sbb rbx,QWORD PTR[80+rsi] + sbb rbp,QWORD PTR[88+rsi] + sbb rdi,rdi + + mov QWORD PTR[((32+0))+rsp],r8 + mov QWORD PTR[((32+8))+rsp],r9 + mov QWORD PTR[((32+16))+rsp],r10 + mov QWORD PTR[((32+24))+rsp],r11 + mov QWORD PTR[((32+32))+rsp],r12 + mov QWORD PTR[((32+40))+rsp],r13 + + mov QWORD PTR[((32+48))+rsp],r14 + mov QWORD PTR[((32+56))+rsp],r15 + mov QWORD PTR[((32+64))+rsp],rax + mov QWORD PTR[((32+72))+rsp],rdx + mov QWORD PTR[((32+80))+rsp],rbx + mov QWORD PTR[((32+88))+rsp],rbp + mov QWORD PTR[((32+96))+rsp],rdi + + + + lea rbx,QWORD PTR[48+rsi] + + mov rax,QWORD PTR[48+rsi] + mov r14,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov r12,QWORD PTR[16+rsi] + mov r13,QWORD PTR[24+rsi] + + mov rdi,QWORD PTR[24+rsp] + call __mulq_mont_383_nonred + add r14,r14 + adc r15,r15 + adc r8,r8 + adc r9,r9 + adc r10,r10 + adc r11,r11 + + mov QWORD PTR[48+rdi],r14 + mov QWORD PTR[56+rdi],r15 + mov QWORD PTR[64+rdi],r8 + mov QWORD PTR[72+rdi],r9 + mov QWORD PTR[80+rdi],r10 + mov QWORD PTR[88+rdi],r11 + + lea rsi,QWORD PTR[32+rsp] + lea rbx,QWORD PTR[((32+48))+rsp] + + mov rax,QWORD PTR[((32+48))+rsp] + mov r14,QWORD PTR[((32+0))+rsp] + mov r15,QWORD PTR[((32+8))+rsp] + mov r12,QWORD PTR[((32+16))+rsp] + mov r13,QWORD PTR[((32+24))+rsp] + + call __mulq_mont_383_nonred + mov rsi,QWORD PTR[((32+96))+rsp] + mov r12,QWORD PTR[((32+0))+rsp] + mov r13,QWORD PTR[((32+8))+rsp] + and r12,rsi + mov rax,QWORD PTR[((32+16))+rsp] + and r13,rsi + mov rbx,QWORD PTR[((32+24))+rsp] + and rax,rsi + mov rbp,QWORD PTR[((32+32))+rsp] + and rbx,rsi + and rbp,rsi + and rsi,QWORD PTR[((32+40))+rsp] + + sub r14,r12 + mov r12,QWORD PTR[rcx] + sbb r15,r13 + mov r13,QWORD PTR[8+rcx] + sbb r8,rax + mov rax,QWORD PTR[16+rcx] + sbb r9,rbx + mov rbx,QWORD PTR[24+rcx] + sbb r10,rbp + mov rbp,QWORD PTR[32+rcx] + sbb r11,rsi + sbb rsi,rsi + + and r12,rsi + and r13,rsi + and rax,rsi + and rbx,rsi + and rbp,rsi + and rsi,QWORD PTR[40+rcx] + + add r14,r12 + adc r15,r13 + adc r8,rax + adc r9,rbx + adc r10,rbp + adc r11,rsi + + mov QWORD PTR[rdi],r14 + mov QWORD PTR[8+rdi],r15 + mov QWORD PTR[16+rdi],r8 + mov QWORD PTR[24+rdi],r9 + mov QWORD PTR[32+rdi],r10 + mov QWORD PTR[40+rdi],r11 + lea r8,QWORD PTR[136+rsp] + mov r15,QWORD PTR[r8] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_sqr_mont_382x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqr_mont_382x:: +sqr_mont_382x ENDP +.text$ ENDS +.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_mul_mont_384x + DD imagerel $L$SEH_body_mul_mont_384x + DD imagerel $L$SEH_info_mul_mont_384x_prologue + + DD imagerel $L$SEH_body_mul_mont_384x + DD imagerel 
$L$SEH_epilogue_mul_mont_384x + DD imagerel $L$SEH_info_mul_mont_384x_body + + DD imagerel $L$SEH_epilogue_mul_mont_384x + DD imagerel $L$SEH_end_mul_mont_384x + DD imagerel $L$SEH_info_mul_mont_384x_epilogue + + DD imagerel $L$SEH_begin_sqr_mont_384x + DD imagerel $L$SEH_body_sqr_mont_384x + DD imagerel $L$SEH_info_sqr_mont_384x_prologue + + DD imagerel $L$SEH_body_sqr_mont_384x + DD imagerel $L$SEH_epilogue_sqr_mont_384x + DD imagerel $L$SEH_info_sqr_mont_384x_body + + DD imagerel $L$SEH_epilogue_sqr_mont_384x + DD imagerel $L$SEH_end_sqr_mont_384x + DD imagerel $L$SEH_info_sqr_mont_384x_epilogue + + DD imagerel $L$SEH_begin_mul_382x + DD imagerel $L$SEH_body_mul_382x + DD imagerel $L$SEH_info_mul_382x_prologue + + DD imagerel $L$SEH_body_mul_382x + DD imagerel $L$SEH_epilogue_mul_382x + DD imagerel $L$SEH_info_mul_382x_body + + DD imagerel $L$SEH_epilogue_mul_382x + DD imagerel $L$SEH_end_mul_382x + DD imagerel $L$SEH_info_mul_382x_epilogue + + DD imagerel $L$SEH_begin_sqr_382x + DD imagerel $L$SEH_body_sqr_382x + DD imagerel $L$SEH_info_sqr_382x_prologue + + DD imagerel $L$SEH_body_sqr_382x + DD imagerel $L$SEH_epilogue_sqr_382x + DD imagerel $L$SEH_info_sqr_382x_body + + DD imagerel $L$SEH_epilogue_sqr_382x + DD imagerel $L$SEH_end_sqr_382x + DD imagerel $L$SEH_info_sqr_382x_epilogue + + DD imagerel $L$SEH_begin_mul_384 + DD imagerel $L$SEH_body_mul_384 + DD imagerel $L$SEH_info_mul_384_prologue + + DD imagerel $L$SEH_body_mul_384 + DD imagerel $L$SEH_epilogue_mul_384 + DD imagerel $L$SEH_info_mul_384_body + + DD imagerel $L$SEH_epilogue_mul_384 + DD imagerel $L$SEH_end_mul_384 + DD imagerel $L$SEH_info_mul_384_epilogue + + DD imagerel $L$SEH_begin_sqr_384 + DD imagerel $L$SEH_body_sqr_384 + DD imagerel $L$SEH_info_sqr_384_prologue + + DD imagerel $L$SEH_body_sqr_384 + DD imagerel $L$SEH_epilogue_sqr_384 + DD imagerel $L$SEH_info_sqr_384_body + + DD imagerel $L$SEH_epilogue_sqr_384 + DD imagerel $L$SEH_end_sqr_384 + DD imagerel $L$SEH_info_sqr_384_epilogue + + DD imagerel $L$SEH_begin_sqr_mont_384 + DD imagerel $L$SEH_body_sqr_mont_384 + DD imagerel $L$SEH_info_sqr_mont_384_prologue + + DD imagerel $L$SEH_body_sqr_mont_384 + DD imagerel $L$SEH_epilogue_sqr_mont_384 + DD imagerel $L$SEH_info_sqr_mont_384_body + + DD imagerel $L$SEH_epilogue_sqr_mont_384 + DD imagerel $L$SEH_end_sqr_mont_384 + DD imagerel $L$SEH_info_sqr_mont_384_epilogue + + DD imagerel $L$SEH_begin_redc_mont_384 + DD imagerel $L$SEH_body_redc_mont_384 + DD imagerel $L$SEH_info_redc_mont_384_prologue + + DD imagerel $L$SEH_body_redc_mont_384 + DD imagerel $L$SEH_epilogue_redc_mont_384 + DD imagerel $L$SEH_info_redc_mont_384_body + + DD imagerel $L$SEH_epilogue_redc_mont_384 + DD imagerel $L$SEH_end_redc_mont_384 + DD imagerel $L$SEH_info_redc_mont_384_epilogue + + DD imagerel $L$SEH_begin_from_mont_384 + DD imagerel $L$SEH_body_from_mont_384 + DD imagerel $L$SEH_info_from_mont_384_prologue + + DD imagerel $L$SEH_body_from_mont_384 + DD imagerel $L$SEH_epilogue_from_mont_384 + DD imagerel $L$SEH_info_from_mont_384_body + + DD imagerel $L$SEH_epilogue_from_mont_384 + DD imagerel $L$SEH_end_from_mont_384 + DD imagerel $L$SEH_info_from_mont_384_epilogue + + DD imagerel $L$SEH_begin_sgn0_pty_mont_384 + DD imagerel $L$SEH_body_sgn0_pty_mont_384 + DD imagerel $L$SEH_info_sgn0_pty_mont_384_prologue + + DD imagerel $L$SEH_body_sgn0_pty_mont_384 + DD imagerel $L$SEH_epilogue_sgn0_pty_mont_384 + DD imagerel $L$SEH_info_sgn0_pty_mont_384_body + + DD imagerel $L$SEH_epilogue_sgn0_pty_mont_384 + DD imagerel 
$L$SEH_end_sgn0_pty_mont_384 + DD imagerel $L$SEH_info_sgn0_pty_mont_384_epilogue + + DD imagerel $L$SEH_begin_sgn0_pty_mont_384x + DD imagerel $L$SEH_body_sgn0_pty_mont_384x + DD imagerel $L$SEH_info_sgn0_pty_mont_384x_prologue + + DD imagerel $L$SEH_body_sgn0_pty_mont_384x + DD imagerel $L$SEH_epilogue_sgn0_pty_mont_384x + DD imagerel $L$SEH_info_sgn0_pty_mont_384x_body + + DD imagerel $L$SEH_epilogue_sgn0_pty_mont_384x + DD imagerel $L$SEH_end_sgn0_pty_mont_384x + DD imagerel $L$SEH_info_sgn0_pty_mont_384x_epilogue + + DD imagerel $L$SEH_begin_mul_mont_384 + DD imagerel $L$SEH_body_mul_mont_384 + DD imagerel $L$SEH_info_mul_mont_384_prologue + + DD imagerel $L$SEH_body_mul_mont_384 + DD imagerel $L$SEH_epilogue_mul_mont_384 + DD imagerel $L$SEH_info_mul_mont_384_body + + DD imagerel $L$SEH_epilogue_mul_mont_384 + DD imagerel $L$SEH_end_mul_mont_384 + DD imagerel $L$SEH_info_mul_mont_384_epilogue + + DD imagerel $L$SEH_begin_sqr_n_mul_mont_384 + DD imagerel $L$SEH_body_sqr_n_mul_mont_384 + DD imagerel $L$SEH_info_sqr_n_mul_mont_384_prologue + + DD imagerel $L$SEH_body_sqr_n_mul_mont_384 + DD imagerel $L$SEH_epilogue_sqr_n_mul_mont_384 + DD imagerel $L$SEH_info_sqr_n_mul_mont_384_body + + DD imagerel $L$SEH_epilogue_sqr_n_mul_mont_384 + DD imagerel $L$SEH_end_sqr_n_mul_mont_384 + DD imagerel $L$SEH_info_sqr_n_mul_mont_384_epilogue + + DD imagerel $L$SEH_begin_sqr_n_mul_mont_383 + DD imagerel $L$SEH_body_sqr_n_mul_mont_383 + DD imagerel $L$SEH_info_sqr_n_mul_mont_383_prologue + + DD imagerel $L$SEH_body_sqr_n_mul_mont_383 + DD imagerel $L$SEH_epilogue_sqr_n_mul_mont_383 + DD imagerel $L$SEH_info_sqr_n_mul_mont_383_body + + DD imagerel $L$SEH_epilogue_sqr_n_mul_mont_383 + DD imagerel $L$SEH_end_sqr_n_mul_mont_383 + DD imagerel $L$SEH_info_sqr_n_mul_mont_383_epilogue + + DD imagerel $L$SEH_begin_sqr_mont_382x + DD imagerel $L$SEH_body_sqr_mont_382x + DD imagerel $L$SEH_info_sqr_mont_382x_prologue + + DD imagerel $L$SEH_body_sqr_mont_382x + DD imagerel $L$SEH_epilogue_sqr_mont_382x + DD imagerel $L$SEH_info_sqr_mont_382x_body + + DD imagerel $L$SEH_epilogue_sqr_mont_382x + DD imagerel $L$SEH_end_sqr_mont_382x + DD imagerel $L$SEH_info_sqr_mont_382x_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_mul_mont_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mul_mont_384x_body:: +DB 1,0,18,0 +DB 000h,0f4h,029h,000h +DB 000h,0e4h,02ah,000h +DB 000h,0d4h,02bh,000h +DB 000h,0c4h,02ch,000h +DB 000h,034h,02dh,000h +DB 000h,054h,02eh,000h +DB 000h,074h,030h,000h +DB 000h,064h,031h,000h +DB 000h,001h,02fh,000h +$L$SEH_info_mul_mont_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqr_mont_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqr_mont_384x_body:: +DB 1,0,18,0 +DB 000h,0f4h,011h,000h +DB 000h,0e4h,012h,000h +DB 000h,0d4h,013h,000h +DB 000h,0c4h,014h,000h +DB 000h,034h,015h,000h +DB 000h,054h,016h,000h +DB 000h,074h,018h,000h +DB 000h,064h,019h,000h +DB 000h,001h,017h,000h +$L$SEH_info_sqr_mont_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mul_382x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mul_382x_body:: +DB 1,0,18,0 +DB 000h,0f4h,011h,000h +DB 000h,0e4h,012h,000h +DB 000h,0d4h,013h,000h +DB 000h,0c4h,014h,000h +DB 000h,034h,015h,000h +DB 000h,054h,016h,000h +DB 000h,074h,018h,000h +DB 
000h,064h,019h,000h +DB 000h,001h,017h,000h +$L$SEH_info_mul_382x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqr_382x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqr_382x_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_sqr_382x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mul_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mul_384_body:: +DB 1,0,11,0 +DB 000h,0c4h,000h,000h +DB 000h,034h,001h,000h +DB 000h,054h,002h,000h +DB 000h,074h,004h,000h +DB 000h,064h,005h,000h +DB 000h,022h +DB 000h,000h,000h,000h,000h,000h +$L$SEH_info_mul_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqr_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqr_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_sqr_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqr_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqr_mont_384_body:: +DB 1,0,18,0 +DB 000h,0f4h,00fh,000h +DB 000h,0e4h,010h,000h +DB 000h,0d4h,011h,000h +DB 000h,0c4h,012h,000h +DB 000h,034h,013h,000h +DB 000h,054h,014h,000h +DB 000h,074h,016h,000h +DB 000h,064h,017h,000h +DB 000h,001h,015h,000h +$L$SEH_info_sqr_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_redc_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_redc_mont_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_redc_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_from_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_from_mont_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_from_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sgn0_pty_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sgn0_pty_mont_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_sgn0_pty_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + 
+$L$SEH_info_sgn0_pty_mont_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sgn0_pty_mont_384x_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_sgn0_pty_mont_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mul_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mul_mont_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_mul_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqr_n_mul_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqr_n_mul_mont_384_body:: +DB 1,0,18,0 +DB 000h,0f4h,011h,000h +DB 000h,0e4h,012h,000h +DB 000h,0d4h,013h,000h +DB 000h,0c4h,014h,000h +DB 000h,034h,015h,000h +DB 000h,054h,016h,000h +DB 000h,074h,018h,000h +DB 000h,064h,019h,000h +DB 000h,001h,017h,000h +$L$SEH_info_sqr_n_mul_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqr_n_mul_mont_383_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqr_n_mul_mont_383_body:: +DB 1,0,18,0 +DB 000h,0f4h,011h,000h +DB 000h,0e4h,012h,000h +DB 000h,0d4h,013h,000h +DB 000h,0c4h,014h,000h +DB 000h,034h,015h,000h +DB 000h,054h,016h,000h +DB 000h,074h,018h,000h +DB 000h,064h,019h,000h +DB 000h,001h,017h,000h +$L$SEH_info_sqr_n_mul_mont_383_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqr_mont_382x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqr_mont_382x_body:: +DB 1,0,18,0 +DB 000h,0f4h,011h,000h +DB 000h,0e4h,012h,000h +DB 000h,0d4h,013h,000h +DB 000h,0c4h,014h,000h +DB 000h,034h,015h,000h +DB 000h,054h,016h,000h +DB 000h,074h,018h,000h +DB 000h,064h,019h,000h +DB 000h,001h,017h,000h +$L$SEH_info_sqr_mont_382x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + + +.xdata ENDS +END diff --git a/build/win64/mulx_mont_256-x86_64.asm b/build/win64/mulx_mont_256-x86_64.asm new file mode 100644 index 00000000..83534c62 --- /dev/null +++ b/build/win64/mulx_mont_256-x86_64.asm @@ -0,0 +1,796 @@ +OPTION DOTNAME +.text$ SEGMENT ALIGN(256) 'CODE' + +PUBLIC mulx_mont_sparse_256 + + +ALIGN 32 +mulx_mont_sparse_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mulx_mont_sparse_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD PTR[40+rsp] + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_mulx_mont_sparse_256:: + + + mov rbx,rdx + mov rdx,QWORD PTR[rdx] + mov r14,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rbp,QWORD PTR[16+rsi] + mov r9,QWORD PTR[24+rsi] + lea rsi,QWORD PTR[((-128))+rsi] + lea rcx,QWORD PTR[((-128))+rcx] + + mulx r11,rax,r14 + call __mulx_mont_sparse_256 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov 
r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_mulx_mont_sparse_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mulx_mont_sparse_256:: +mulx_mont_sparse_256 ENDP + +PUBLIC sqrx_mont_sparse_256 + + +ALIGN 32 +sqrx_mont_sparse_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqrx_mont_sparse_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_sqrx_mont_sparse_256:: + + + mov rbx,rsi + mov r8,rcx + mov rcx,rdx + mov rdx,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rbp,QWORD PTR[16+rsi] + mov r9,QWORD PTR[24+rsi] + lea rsi,QWORD PTR[((-128))+rbx] + lea rcx,QWORD PTR[((-128))+rcx] + + mulx r11,rax,rdx + call __mulx_mont_sparse_256 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sqrx_mont_sparse_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqrx_mont_sparse_256:: +sqrx_mont_sparse_256 ENDP + +ALIGN 32 +__mulx_mont_sparse_256 PROC PRIVATE + DB 243,15,30,250 + mulx r12,r15,r15 + mulx r13,rbp,rbp + add r11,r15 + mulx r14,r9,r9 + mov rdx,QWORD PTR[8+rbx] + adc r12,rbp + adc r13,r9 + adc r14,0 + + mov r10,rax + imul rax,r8 + + + xor r15,r15 + mulx r9,rbp,QWORD PTR[((0+128))+rsi] + adox r11,rbp + adcx r12,r9 + + mulx r9,rbp,QWORD PTR[((8+128))+rsi] + adox r12,rbp + adcx r13,r9 + + mulx r9,rbp,QWORD PTR[((16+128))+rsi] + adox r13,rbp + adcx r14,r9 + + mulx r9,rbp,QWORD PTR[((24+128))+rsi] + mov rdx,rax + adox r14,rbp + adcx r9,r15 + adox r15,r9 + + + mulx rax,rbp,QWORD PTR[((0+128))+rcx] + adcx r10,rbp + adox rax,r11 + + mulx r9,rbp,QWORD PTR[((8+128))+rcx] + adcx rax,rbp + adox r12,r9 + + mulx r9,rbp,QWORD PTR[((16+128))+rcx] + adcx r12,rbp + adox r13,r9 + + mulx r9,rbp,QWORD PTR[((24+128))+rcx] + mov rdx,QWORD PTR[16+rbx] + adcx r13,rbp + adox r14,r9 + adcx r14,r10 + adox r15,r10 + adcx r15,r10 + adox r10,r10 + adc r10,0 + mov r11,rax + imul rax,r8 + + + xor rbp,rbp + mulx r9,rbp,QWORD PTR[((0+128))+rsi] + adox r12,rbp + adcx r13,r9 + + mulx r9,rbp,QWORD PTR[((8+128))+rsi] + adox r13,rbp + adcx r14,r9 + + mulx r9,rbp,QWORD PTR[((16+128))+rsi] + adox r14,rbp + adcx r15,r9 + + mulx r9,rbp,QWORD PTR[((24+128))+rsi] + mov rdx,rax + adox r15,rbp + adcx r9,r10 + adox r10,r9 + + + mulx rax,rbp,QWORD PTR[((0+128))+rcx] + adcx r11,rbp + adox rax,r12 + + mulx r9,rbp,QWORD PTR[((8+128))+rcx] + adcx rax,rbp + adox r13,r9 + + mulx r9,rbp,QWORD PTR[((16+128))+rcx] + adcx r13,rbp + adox r14,r9 + + mulx r9,rbp,QWORD PTR[((24+128))+rcx] + mov rdx,QWORD PTR[24+rbx] + adcx r14,rbp + adox r15,r9 + adcx r15,r11 + adox r10,r11 + adcx r10,r11 + adox r11,r11 + adc r11,0 + mov r12,rax + imul rax,r8 + + + xor rbp,rbp + mulx r9,rbp,QWORD PTR[((0+128))+rsi] + adox r13,rbp + adcx r14,r9 + + mulx r9,rbp,QWORD PTR[((8+128))+rsi] + adox r14,rbp + adcx r15,r9 + + mulx r9,rbp,QWORD PTR[((16+128))+rsi] + adox r15,rbp + adcx r10,r9 + + mulx r9,rbp,QWORD PTR[((24+128))+rsi] + mov rdx,rax + adox r10,rbp + adcx r9,r11 + adox r11,r9 + + + mulx rax,rbp,QWORD PTR[((0+128))+rcx] + adcx r12,rbp + adox rax,r13 + + 
mulx r9,rbp,QWORD PTR[((8+128))+rcx] + adcx rax,rbp + adox r14,r9 + + mulx r9,rbp,QWORD PTR[((16+128))+rcx] + adcx r14,rbp + adox r15,r9 + + mulx r9,rbp,QWORD PTR[((24+128))+rcx] + mov rdx,rax + adcx r15,rbp + adox r10,r9 + adcx r10,r12 + adox r11,r12 + adcx r11,r12 + adox r12,r12 + adc r12,0 + imul rdx,r8 + + + xor rbp,rbp + mulx r9,r13,QWORD PTR[((0+128))+rcx] + adcx r13,rax + adox r14,r9 + + mulx r9,rbp,QWORD PTR[((8+128))+rcx] + adcx r14,rbp + adox r15,r9 + + mulx r9,rbp,QWORD PTR[((16+128))+rcx] + adcx r15,rbp + adox r10,r9 + + mulx r9,rbp,QWORD PTR[((24+128))+rcx] + mov rdx,r14 + lea rcx,QWORD PTR[128+rcx] + adcx r10,rbp + adox r11,r9 + mov rax,r15 + adcx r11,r13 + adox r12,r13 + adc r12,0 + + + + + mov rbp,r10 + sub r14,QWORD PTR[rcx] + sbb r15,QWORD PTR[8+rcx] + sbb r10,QWORD PTR[16+rcx] + mov r9,r11 + sbb r11,QWORD PTR[24+rcx] + sbb r12,0 + + cmovc r14,rdx + cmovc r15,rax + cmovc r10,rbp + mov QWORD PTR[rdi],r14 + cmovc r11,r9 + mov QWORD PTR[8+rdi],r15 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + + DB 0F3h,0C3h ;repret +__mulx_mont_sparse_256 ENDP +PUBLIC fromx_mont_256 + + +ALIGN 32 +fromx_mont_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_fromx_mont_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_fromx_mont_256:: + + + mov rbx,rdx + call __mulx_by_1_mont_256 + + + + + + mov rdx,r15 + mov r12,r10 + mov r13,r11 + + sub r14,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + sbb r10,QWORD PTR[16+rbx] + sbb r11,QWORD PTR[24+rbx] + + cmovnc rax,r14 + cmovnc rdx,r15 + cmovnc r12,r10 + mov QWORD PTR[rdi],rax + cmovnc r13,r11 + mov QWORD PTR[8+rdi],rdx + mov QWORD PTR[16+rdi],r12 + mov QWORD PTR[24+rdi],r13 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_fromx_mont_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_fromx_mont_256:: +fromx_mont_256 ENDP + +PUBLIC redcx_mont_256 + + +ALIGN 32 +redcx_mont_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_redcx_mont_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_redcx_mont_256:: + + + mov rbx,rdx + call __mulx_by_1_mont_256 + + add r14,QWORD PTR[32+rsi] + adc r15,QWORD PTR[40+rsi] + mov rax,r14 + adc r10,QWORD PTR[48+rsi] + mov rdx,r15 + adc r11,QWORD PTR[56+rsi] + sbb rsi,rsi + + + + + mov r12,r10 + sub r14,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + sbb r10,QWORD PTR[16+rbx] + mov r13,r11 + sbb r11,QWORD PTR[24+rbx] + sbb rsi,0 + + cmovnc rax,r14 + cmovnc rdx,r15 + cmovnc r12,r10 + mov QWORD PTR[rdi],rax + cmovnc r13,r11 + mov QWORD PTR[8+rdi],rdx + mov QWORD PTR[16+rdi],r12 + mov QWORD PTR[24+rdi],r13 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_redcx_mont_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_redcx_mont_256:: 
+redcx_mont_256 ENDP + +ALIGN 32 +__mulx_by_1_mont_256 PROC PRIVATE + DB 243,15,30,250 + mov rax,QWORD PTR[rsi] + mov r11,QWORD PTR[8+rsi] + mov r12,QWORD PTR[16+rsi] + mov r13,QWORD PTR[24+rsi] + + mov r14,rax + imul rax,rcx + mov r10,rax + + mul QWORD PTR[rbx] + add r14,rax + mov rax,r10 + adc r14,rdx + + mul QWORD PTR[8+rbx] + add r11,rax + mov rax,r10 + adc rdx,0 + add r11,r14 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[16+rbx] + mov r15,r11 + imul r11,rcx + add r12,rax + mov rax,r10 + adc rdx,0 + add r12,r14 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[24+rbx] + add r13,rax + mov rax,r11 + adc rdx,0 + add r13,r14 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[rbx] + add r15,rax + mov rax,r11 + adc r15,rdx + + mul QWORD PTR[8+rbx] + add r12,rax + mov rax,r11 + adc rdx,0 + add r12,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[16+rbx] + mov r10,r12 + imul r12,rcx + add r13,rax + mov rax,r11 + adc rdx,0 + add r13,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[24+rbx] + add r14,rax + mov rax,r12 + adc rdx,0 + add r14,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[rbx] + add r10,rax + mov rax,r12 + adc r10,rdx + + mul QWORD PTR[8+rbx] + add r13,rax + mov rax,r12 + adc rdx,0 + add r13,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[16+rbx] + mov r11,r13 + imul r13,rcx + add r14,rax + mov rax,r12 + adc rdx,0 + add r14,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[24+rbx] + add r15,rax + mov rax,r13 + adc rdx,0 + add r15,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[rbx] + add r11,rax + mov rax,r13 + adc r11,rdx + + mul QWORD PTR[8+rbx] + add r14,rax + mov rax,r13 + adc rdx,0 + add r14,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[16+rbx] + add r15,rax + mov rax,r13 + adc rdx,0 + add r15,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[24+rbx] + add r10,rax + mov rax,r14 + adc rdx,0 + add r10,r11 + adc rdx,0 + mov r11,rdx + DB 0F3h,0C3h ;repret +__mulx_by_1_mont_256 ENDP +.text$ ENDS +.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_mulx_mont_sparse_256 + DD imagerel $L$SEH_body_mulx_mont_sparse_256 + DD imagerel $L$SEH_info_mulx_mont_sparse_256_prologue + + DD imagerel $L$SEH_body_mulx_mont_sparse_256 + DD imagerel $L$SEH_epilogue_mulx_mont_sparse_256 + DD imagerel $L$SEH_info_mulx_mont_sparse_256_body + + DD imagerel $L$SEH_epilogue_mulx_mont_sparse_256 + DD imagerel $L$SEH_end_mulx_mont_sparse_256 + DD imagerel $L$SEH_info_mulx_mont_sparse_256_epilogue + + DD imagerel $L$SEH_begin_sqrx_mont_sparse_256 + DD imagerel $L$SEH_body_sqrx_mont_sparse_256 + DD imagerel $L$SEH_info_sqrx_mont_sparse_256_prologue + + DD imagerel $L$SEH_body_sqrx_mont_sparse_256 + DD imagerel $L$SEH_epilogue_sqrx_mont_sparse_256 + DD imagerel $L$SEH_info_sqrx_mont_sparse_256_body + + DD imagerel $L$SEH_epilogue_sqrx_mont_sparse_256 + DD imagerel $L$SEH_end_sqrx_mont_sparse_256 + DD imagerel $L$SEH_info_sqrx_mont_sparse_256_epilogue + + DD imagerel $L$SEH_begin_fromx_mont_256 + DD imagerel $L$SEH_body_fromx_mont_256 + DD imagerel $L$SEH_info_fromx_mont_256_prologue + + DD imagerel $L$SEH_body_fromx_mont_256 + DD imagerel $L$SEH_epilogue_fromx_mont_256 + DD imagerel $L$SEH_info_fromx_mont_256_body + + DD imagerel $L$SEH_epilogue_fromx_mont_256 + DD imagerel $L$SEH_end_fromx_mont_256 + DD imagerel $L$SEH_info_fromx_mont_256_epilogue + + DD imagerel $L$SEH_begin_redcx_mont_256 + DD imagerel $L$SEH_body_redcx_mont_256 + DD imagerel $L$SEH_info_redcx_mont_256_prologue + + DD imagerel $L$SEH_body_redcx_mont_256 + DD imagerel $L$SEH_epilogue_redcx_mont_256 + DD imagerel $L$SEH_info_redcx_mont_256_body 
+ + DD imagerel $L$SEH_epilogue_redcx_mont_256 + DD imagerel $L$SEH_end_redcx_mont_256 + DD imagerel $L$SEH_info_redcx_mont_256_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_mulx_mont_sparse_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mulx_mont_sparse_256_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_mulx_mont_sparse_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqrx_mont_sparse_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqrx_mont_sparse_256_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_sqrx_mont_sparse_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_fromx_mont_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_fromx_mont_256_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_fromx_mont_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_redcx_mont_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_redcx_mont_256_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_redcx_mont_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + + +.xdata ENDS +END diff --git a/build/win64/mulx_mont_384-x86_64.asm b/build/win64/mulx_mont_384-x86_64.asm new file mode 100644 index 00000000..bd2c0912 --- /dev/null +++ b/build/win64/mulx_mont_384-x86_64.asm @@ -0,0 +1,3587 @@ +OPTION DOTNAME +.text$ SEGMENT ALIGN(256) 'CODE' + + + + + + + + +ALIGN 32 +__sub_mod_384x384 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + mov r14,QWORD PTR[48+rsi] + + sub r8,QWORD PTR[rdx] + mov r15,QWORD PTR[56+rsi] + sbb r9,QWORD PTR[8+rdx] + mov rax,QWORD PTR[64+rsi] + sbb r10,QWORD PTR[16+rdx] + mov rbx,QWORD PTR[72+rsi] + sbb r11,QWORD PTR[24+rdx] + mov rbp,QWORD PTR[80+rsi] + sbb r12,QWORD PTR[32+rdx] + mov rsi,QWORD PTR[88+rsi] + sbb r13,QWORD PTR[40+rdx] + mov QWORD PTR[rdi],r8 + sbb r14,QWORD PTR[48+rdx] + mov r8,QWORD PTR[rcx] + mov QWORD PTR[8+rdi],r9 + sbb r15,QWORD PTR[56+rdx] + mov r9,QWORD PTR[8+rcx] + mov QWORD PTR[16+rdi],r10 + sbb rax,QWORD PTR[64+rdx] + mov r10,QWORD PTR[16+rcx] + mov QWORD PTR[24+rdi],r11 + sbb rbx,QWORD PTR[72+rdx] + mov r11,QWORD PTR[24+rcx] + mov QWORD PTR[32+rdi],r12 + sbb rbp,QWORD PTR[80+rdx] + mov r12,QWORD PTR[32+rcx] + mov QWORD PTR[40+rdi],r13 + sbb rsi,QWORD PTR[88+rdx] + mov r13,QWORD 
PTR[40+rcx] + sbb rdx,rdx + + and r8,rdx + and r9,rdx + and r10,rdx + and r11,rdx + and r12,rdx + and r13,rdx + + add r14,r8 + adc r15,r9 + mov QWORD PTR[48+rdi],r14 + adc rax,r10 + mov QWORD PTR[56+rdi],r15 + adc rbx,r11 + mov QWORD PTR[64+rdi],rax + adc rbp,r12 + mov QWORD PTR[72+rdi],rbx + adc rsi,r13 + mov QWORD PTR[80+rdi],rbp + mov QWORD PTR[88+rdi],rsi + + DB 0F3h,0C3h ;repret +__sub_mod_384x384 ENDP + + +ALIGN 32 +__add_mod_384 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + add r8,QWORD PTR[rdx] + adc r9,QWORD PTR[8+rdx] + adc r10,QWORD PTR[16+rdx] + mov r14,r8 + adc r11,QWORD PTR[24+rdx] + mov r15,r9 + adc r12,QWORD PTR[32+rdx] + mov rax,r10 + adc r13,QWORD PTR[40+rdx] + mov rbx,r11 + sbb rdx,rdx + + sub r8,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rcx] + mov rbp,r12 + sbb r10,QWORD PTR[16+rcx] + sbb r11,QWORD PTR[24+rcx] + sbb r12,QWORD PTR[32+rcx] + mov rsi,r13 + sbb r13,QWORD PTR[40+rcx] + sbb rdx,0 + + cmovc r8,r14 + cmovc r9,r15 + cmovc r10,rax + mov QWORD PTR[rdi],r8 + cmovc r11,rbx + mov QWORD PTR[8+rdi],r9 + cmovc r12,rbp + mov QWORD PTR[16+rdi],r10 + cmovc r13,rsi + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + DB 0F3h,0C3h ;repret +__add_mod_384 ENDP + + +ALIGN 32 +__sub_mod_384 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + +__sub_mod_384_a_is_loaded:: + sub r8,QWORD PTR[rdx] + mov r14,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rdx] + mov r15,QWORD PTR[8+rcx] + sbb r10,QWORD PTR[16+rdx] + mov rax,QWORD PTR[16+rcx] + sbb r11,QWORD PTR[24+rdx] + mov rbx,QWORD PTR[24+rcx] + sbb r12,QWORD PTR[32+rdx] + mov rbp,QWORD PTR[32+rcx] + sbb r13,QWORD PTR[40+rdx] + mov rsi,QWORD PTR[40+rcx] + sbb rdx,rdx + + and r14,rdx + and r15,rdx + and rax,rdx + and rbx,rdx + and rbp,rdx + and rsi,rdx + + add r8,r14 + adc r9,r15 + mov QWORD PTR[rdi],r8 + adc r10,rax + mov QWORD PTR[8+rdi],r9 + adc r11,rbx + mov QWORD PTR[16+rdi],r10 + adc r12,rbp + mov QWORD PTR[24+rdi],r11 + adc r13,rsi + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + DB 0F3h,0C3h ;repret +__sub_mod_384 ENDP +PUBLIC mulx_mont_384x + + +ALIGN 32 +mulx_mont_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mulx_mont_384x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD PTR[40+rsp] + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,328 + +$L$SEH_body_mulx_mont_384x:: + + + mov rbx,rdx + mov QWORD PTR[32+rsp],rdi + mov QWORD PTR[24+rsp],rsi + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[8+rsp],rcx + mov QWORD PTR[rsp],r8 + + + + + lea rdi,QWORD PTR[40+rsp] + call __mulx_384 + + + lea rbx,QWORD PTR[48+rbx] + lea rsi,QWORD PTR[((128+48))+rsi] + lea rdi,QWORD PTR[96+rdi] + call __mulx_384 + + + mov rcx,QWORD PTR[8+rsp] + lea rsi,QWORD PTR[rbx] + lea rdx,QWORD PTR[((-48))+rbx] + lea rdi,QWORD PTR[((40+192+48))+rsp] + call __add_mod_384 + + mov rsi,QWORD PTR[24+rsp] + lea rdx,QWORD PTR[48+rsi] + lea rdi,QWORD PTR[((-48))+rdi] + call __add_mod_384 + + lea rbx,QWORD PTR[rdi] + lea rsi,QWORD PTR[48+rdi] + call __mulx_384 + + + lea rsi,QWORD PTR[rdi] + lea rdx,QWORD PTR[40+rsp] + mov rcx,QWORD PTR[8+rsp] + call __sub_mod_384x384 + + lea rsi,QWORD PTR[rdi] + lea 
rdx,QWORD PTR[((-96))+rdi] + call __sub_mod_384x384 + + + lea rsi,QWORD PTR[40+rsp] + lea rdx,QWORD PTR[((40+96))+rsp] + lea rdi,QWORD PTR[40+rsp] + call __sub_mod_384x384 + + lea rbx,QWORD PTR[rcx] + + + lea rsi,QWORD PTR[40+rsp] + mov rcx,QWORD PTR[rsp] + mov rdi,QWORD PTR[32+rsp] + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + + lea rsi,QWORD PTR[((40+192))+rsp] + mov rcx,QWORD PTR[rsp] + lea rdi,QWORD PTR[48+rdi] + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + lea r8,QWORD PTR[328+rsp] + mov r15,QWORD PTR[r8] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_mulx_mont_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mulx_mont_384x:: +mulx_mont_384x ENDP +PUBLIC sqrx_mont_384x + + +ALIGN 32 +sqrx_mont_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqrx_mont_384x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,136 + +$L$SEH_body_sqrx_mont_384x:: + + + mov QWORD PTR[rsp],rcx + mov rcx,rdx + + mov QWORD PTR[16+rsp],rsi +DB 102,72,15,110,199 + + + lea rdx,QWORD PTR[48+rsi] + lea rdi,QWORD PTR[32+rsp] + call __add_mod_384 + + + mov rsi,QWORD PTR[16+rsp] + lea rdx,QWORD PTR[48+rsi] + lea rdi,QWORD PTR[((32+48))+rsp] + call __sub_mod_384 + + + mov rsi,QWORD PTR[16+rsp] + lea rbx,QWORD PTR[48+rsi] + + mov rdx,QWORD PTR[48+rsi] + mov r14,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rax,QWORD PTR[16+rsi] + mov r12,QWORD PTR[24+rsi] + mov rdi,QWORD PTR[32+rsi] + mov rbp,QWORD PTR[40+rsi] + lea rsi,QWORD PTR[((-128))+rsi] + lea rcx,QWORD PTR[((-128))+rcx] + + mulx r9,r8,r14 + call __mulx_mont_384 + add rdx,rdx + adc r15,r15 + adc rax,rax + mov r8,rdx + adc r12,r12 + mov r9,r15 + adc rdi,rdi + mov r10,rax + adc rbp,rbp + mov r11,r12 + sbb rsi,rsi + + sub rdx,QWORD PTR[rcx] + sbb r15,QWORD PTR[8+rcx] + mov r13,rdi + sbb rax,QWORD PTR[16+rcx] + sbb r12,QWORD PTR[24+rcx] + sbb rdi,QWORD PTR[32+rcx] + mov r14,rbp + sbb rbp,QWORD PTR[40+rcx] + sbb rsi,0 + + cmovc rdx,r8 + cmovc r15,r9 + cmovc rax,r10 + mov QWORD PTR[48+rbx],rdx + cmovc r12,r11 + mov QWORD PTR[56+rbx],r15 + cmovc rdi,r13 + mov QWORD PTR[64+rbx],rax + cmovc rbp,r14 + mov QWORD PTR[72+rbx],r12 + mov QWORD PTR[80+rbx],rdi + mov QWORD PTR[88+rbx],rbp + + lea rsi,QWORD PTR[32+rsp] + lea rbx,QWORD PTR[((32+48))+rsp] + + mov rdx,QWORD PTR[((32+48))+rsp] + mov r14,QWORD PTR[((32+0))+rsp] + mov r15,QWORD PTR[((32+8))+rsp] + mov rax,QWORD PTR[((32+16))+rsp] + mov r12,QWORD PTR[((32+24))+rsp] + mov rdi,QWORD PTR[((32+32))+rsp] + mov rbp,QWORD PTR[((32+40))+rsp] + lea rsi,QWORD PTR[((-128))+rsi] + lea rcx,QWORD PTR[((-128))+rcx] + + mulx r9,r8,r14 + call __mulx_mont_384 + + lea r8,QWORD PTR[136+rsp] + mov r15,QWORD PTR[r8] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_sqrx_mont_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqrx_mont_384x:: +sqrx_mont_384x ENDP + +PUBLIC mulx_382x + + +ALIGN 32 +mulx_382x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp 
+$L$SEH_begin_mulx_382x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,136 + +$L$SEH_body_mulx_382x:: + + + lea rdi,QWORD PTR[96+rdi] + mov QWORD PTR[rsp],rsi + mov QWORD PTR[8+rsp],rdx + mov QWORD PTR[16+rsp],rdi + mov QWORD PTR[24+rsp],rcx + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + add r8,QWORD PTR[48+rsi] + adc r9,QWORD PTR[56+rsi] + adc r10,QWORD PTR[64+rsi] + adc r11,QWORD PTR[72+rsi] + adc r12,QWORD PTR[80+rsi] + adc r13,QWORD PTR[88+rsi] + + mov QWORD PTR[((32+0))+rsp],r8 + mov QWORD PTR[((32+8))+rsp],r9 + mov QWORD PTR[((32+16))+rsp],r10 + mov QWORD PTR[((32+24))+rsp],r11 + mov QWORD PTR[((32+32))+rsp],r12 + mov QWORD PTR[((32+40))+rsp],r13 + + + mov r8,QWORD PTR[rdx] + mov r9,QWORD PTR[8+rdx] + mov r10,QWORD PTR[16+rdx] + mov r11,QWORD PTR[24+rdx] + mov r12,QWORD PTR[32+rdx] + mov r13,QWORD PTR[40+rdx] + + add r8,QWORD PTR[48+rdx] + adc r9,QWORD PTR[56+rdx] + adc r10,QWORD PTR[64+rdx] + adc r11,QWORD PTR[72+rdx] + adc r12,QWORD PTR[80+rdx] + adc r13,QWORD PTR[88+rdx] + + mov QWORD PTR[((32+48))+rsp],r8 + mov QWORD PTR[((32+56))+rsp],r9 + mov QWORD PTR[((32+64))+rsp],r10 + mov QWORD PTR[((32+72))+rsp],r11 + mov QWORD PTR[((32+80))+rsp],r12 + mov QWORD PTR[((32+88))+rsp],r13 + + + lea rsi,QWORD PTR[((32+0))+rsp] + lea rbx,QWORD PTR[((32+48))+rsp] + call __mulx_384 + + + mov rsi,QWORD PTR[rsp] + mov rbx,QWORD PTR[8+rsp] + lea rdi,QWORD PTR[((-96))+rdi] + call __mulx_384 + + + lea rsi,QWORD PTR[((48+128))+rsi] + lea rbx,QWORD PTR[48+rbx] + lea rdi,QWORD PTR[32+rsp] + call __mulx_384 + + + mov rsi,QWORD PTR[16+rsp] + lea rdx,QWORD PTR[32+rsp] + mov rcx,QWORD PTR[24+rsp] + mov rdi,rsi + call __sub_mod_384x384 + + + lea rsi,QWORD PTR[rdi] + lea rdx,QWORD PTR[((-96))+rdi] + call __sub_mod_384x384 + + + lea rsi,QWORD PTR[((-96))+rdi] + lea rdx,QWORD PTR[32+rsp] + lea rdi,QWORD PTR[((-96))+rdi] + call __sub_mod_384x384 + + lea r8,QWORD PTR[136+rsp] + mov r15,QWORD PTR[r8] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_mulx_382x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mulx_382x:: +mulx_382x ENDP +PUBLIC sqrx_382x + + +ALIGN 32 +sqrx_382x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqrx_382x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rsi + +$L$SEH_body_sqrx_382x:: + + + mov rcx,rdx + + + mov r14,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rax,QWORD PTR[16+rsi] + mov rbx,QWORD PTR[24+rsi] + mov rbp,QWORD PTR[32+rsi] + mov rdx,QWORD PTR[40+rsi] + + mov r8,r14 + add r14,QWORD PTR[48+rsi] + mov r9,r15 + adc r15,QWORD PTR[56+rsi] + mov r10,rax + adc rax,QWORD PTR[64+rsi] + mov r11,rbx + adc rbx,QWORD PTR[72+rsi] + mov r12,rbp + adc rbp,QWORD PTR[80+rsi] + mov r13,rdx + adc rdx,QWORD PTR[88+rsi] + + mov QWORD PTR[rdi],r14 + mov QWORD PTR[8+rdi],r15 + mov QWORD PTR[16+rdi],rax + mov QWORD PTR[24+rdi],rbx + mov QWORD PTR[32+rdi],rbp + mov QWORD PTR[40+rdi],rdx + + + lea rdx,QWORD PTR[48+rsi] + lea rdi,QWORD PTR[48+rdi] + call __sub_mod_384_a_is_loaded + + + lea rsi,QWORD PTR[rdi] + 
lea rbx,QWORD PTR[((-48))+rdi] + lea rdi,QWORD PTR[((-48))+rdi] + call __mulx_384 + + + mov rsi,QWORD PTR[rsp] + lea rbx,QWORD PTR[48+rsi] + lea rdi,QWORD PTR[96+rdi] + call __mulx_384 + + mov r8,QWORD PTR[rdi] + mov r9,QWORD PTR[8+rdi] + mov r10,QWORD PTR[16+rdi] + mov r11,QWORD PTR[24+rdi] + mov r12,QWORD PTR[32+rdi] + mov r13,QWORD PTR[40+rdi] + mov r14,QWORD PTR[48+rdi] + mov r15,QWORD PTR[56+rdi] + mov rax,QWORD PTR[64+rdi] + mov rbx,QWORD PTR[72+rdi] + mov rbp,QWORD PTR[80+rdi] + add r8,r8 + mov rdx,QWORD PTR[88+rdi] + adc r9,r9 + mov QWORD PTR[rdi],r8 + adc r10,r10 + mov QWORD PTR[8+rdi],r9 + adc r11,r11 + mov QWORD PTR[16+rdi],r10 + adc r12,r12 + mov QWORD PTR[24+rdi],r11 + adc r13,r13 + mov QWORD PTR[32+rdi],r12 + adc r14,r14 + mov QWORD PTR[40+rdi],r13 + adc r15,r15 + mov QWORD PTR[48+rdi],r14 + adc rax,rax + mov QWORD PTR[56+rdi],r15 + adc rbx,rbx + mov QWORD PTR[64+rdi],rax + adc rbp,rbp + mov QWORD PTR[72+rdi],rbx + adc rdx,rdx + mov QWORD PTR[80+rdi],rbp + mov QWORD PTR[88+rdi],rdx + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sqrx_382x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqrx_382x:: +sqrx_382x ENDP +PUBLIC mulx_384 + + +ALIGN 32 +mulx_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mulx_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + +$L$SEH_body_mulx_384:: + + + mov rbx,rdx + call __mulx_384 + + mov r15,QWORD PTR[rsp] + + mov r14,QWORD PTR[8+rsp] + + mov r13,QWORD PTR[16+rsp] + + mov r12,QWORD PTR[24+rsp] + + mov rbx,QWORD PTR[32+rsp] + + mov rbp,QWORD PTR[40+rsp] + + lea rsp,QWORD PTR[48+rsp] + +$L$SEH_epilogue_mulx_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mulx_384:: +mulx_384 ENDP + + +ALIGN 32 +__mulx_384 PROC PRIVATE + DB 243,15,30,250 + mov rdx,QWORD PTR[rbx] + mov r14,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + lea rsi,QWORD PTR[((-128))+rsi] + + mulx rcx,r9,r14 + xor rbp,rbp + + mulx rax,r8,r15 + adcx r8,rcx + mov QWORD PTR[rdi],r9 + + mulx rcx,r9,r10 + adcx r9,rax + + mulx rax,r10,r11 + adcx r10,rcx + + mulx rcx,r11,r12 + adcx r11,rax + + mulx r13,r12,r13 + mov rdx,QWORD PTR[8+rbx] + adcx r12,rcx + adcx r13,rbp + mulx rcx,rax,r14 + adcx rax,r8 + adox r9,rcx + mov QWORD PTR[8+rdi],rax + + mulx rcx,r8,r15 + adcx r8,r9 + adox r10,rcx + + mulx rax,r9,QWORD PTR[((128+16))+rsi] + adcx r9,r10 + adox r11,rax + + mulx rcx,r10,QWORD PTR[((128+24))+rsi] + adcx r10,r11 + adox r12,rcx + + mulx rax,r11,QWORD PTR[((128+32))+rsi] + adcx r11,r12 + adox rax,r13 + + mulx r13,r12,QWORD PTR[((128+40))+rsi] + mov rdx,QWORD PTR[16+rbx] + adcx r12,rax + adox r13,rbp + adcx r13,rbp + mulx rcx,rax,r14 + adcx rax,r8 + adox r9,rcx + mov QWORD PTR[16+rdi],rax + + mulx rcx,r8,r15 + adcx r8,r9 + adox r10,rcx + + mulx rax,r9,QWORD PTR[((128+16))+rsi] + adcx r9,r10 + adox r11,rax + + mulx rcx,r10,QWORD PTR[((128+24))+rsi] + adcx r10,r11 + adox r12,rcx + + mulx rax,r11,QWORD PTR[((128+32))+rsi] + adcx r11,r12 + adox rax,r13 + + mulx r13,r12,QWORD PTR[((128+40))+rsi] + mov rdx,QWORD PTR[24+rbx] + 
adcx r12,rax + adox r13,rbp + adcx r13,rbp + mulx rcx,rax,r14 + adcx rax,r8 + adox r9,rcx + mov QWORD PTR[24+rdi],rax + + mulx rcx,r8,r15 + adcx r8,r9 + adox r10,rcx + + mulx rax,r9,QWORD PTR[((128+16))+rsi] + adcx r9,r10 + adox r11,rax + + mulx rcx,r10,QWORD PTR[((128+24))+rsi] + adcx r10,r11 + adox r12,rcx + + mulx rax,r11,QWORD PTR[((128+32))+rsi] + adcx r11,r12 + adox rax,r13 + + mulx r13,r12,QWORD PTR[((128+40))+rsi] + mov rdx,QWORD PTR[32+rbx] + adcx r12,rax + adox r13,rbp + adcx r13,rbp + mulx rcx,rax,r14 + adcx rax,r8 + adox r9,rcx + mov QWORD PTR[32+rdi],rax + + mulx rcx,r8,r15 + adcx r8,r9 + adox r10,rcx + + mulx rax,r9,QWORD PTR[((128+16))+rsi] + adcx r9,r10 + adox r11,rax + + mulx rcx,r10,QWORD PTR[((128+24))+rsi] + adcx r10,r11 + adox r12,rcx + + mulx rax,r11,QWORD PTR[((128+32))+rsi] + adcx r11,r12 + adox rax,r13 + + mulx r13,r12,QWORD PTR[((128+40))+rsi] + mov rdx,QWORD PTR[40+rbx] + adcx r12,rax + adox r13,rbp + adcx r13,rbp + mulx rcx,rax,r14 + adcx rax,r8 + adox r9,rcx + mov QWORD PTR[40+rdi],rax + + mulx rcx,r8,r15 + adcx r8,r9 + adox r10,rcx + + mulx rax,r9,QWORD PTR[((128+16))+rsi] + adcx r9,r10 + adox r11,rax + + mulx rcx,r10,QWORD PTR[((128+24))+rsi] + adcx r10,r11 + adox r12,rcx + + mulx rax,r11,QWORD PTR[((128+32))+rsi] + adcx r11,r12 + adox rax,r13 + + mulx r13,r12,QWORD PTR[((128+40))+rsi] + mov rdx,rax + adcx r12,rax + adox r13,rbp + adcx r13,rbp + mov QWORD PTR[48+rdi],r8 + mov QWORD PTR[56+rdi],r9 + mov QWORD PTR[64+rdi],r10 + mov QWORD PTR[72+rdi],r11 + mov QWORD PTR[80+rdi],r12 + mov QWORD PTR[88+rdi],r13 + + DB 0F3h,0C3h ;repret +__mulx_384 ENDP +PUBLIC sqrx_384 + + +ALIGN 32 +sqrx_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqrx_384:: + mov rdi,rcx + mov rsi,rdx + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rdi + +$L$SEH_body_sqrx_384:: + + + call __sqrx_384 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sqrx_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqrx_384:: +sqrx_384 ENDP + +ALIGN 32 +__sqrx_384 PROC PRIVATE + DB 243,15,30,250 + mov rdx,QWORD PTR[rsi] + mov r14,QWORD PTR[8+rsi] + mov r15,QWORD PTR[16+rsi] + mov rcx,QWORD PTR[24+rsi] + mov rbx,QWORD PTR[32+rsi] +DB 102,72,15,110,199 + + + mulx rdi,r8,r14 + mov rbp,QWORD PTR[40+rsi] + mulx rax,r9,r15 + add r9,rdi + mulx rdi,r10,rcx + adc r10,rax + mulx rax,r11,rbx + adc r11,rdi + mulx r13,r12,rbp + mov rdx,r14 + adc r12,rax + adc r13,0 + + + xor r14,r14 + mulx rax,rdi,r15 + adcx r10,rdi + adox r11,rax + + mulx rax,rdi,rcx + adcx r11,rdi + adox r12,rax + + mulx rax,rdi,rbx + adcx r12,rdi + adox r13,rax + + mulx rax,rdi,rbp + mov rdx,r15 + adcx r13,rdi + adox rax,r14 + adcx r14,rax + + + xor r15,r15 + mulx rax,rdi,rcx + adcx r12,rdi + adox r13,rax + + mulx rax,rdi,rbx + adcx r13,rdi + adox r14,rax + + mulx rax,rdi,rbp + mov rdx,rcx + adcx r14,rdi + adox rax,r15 + adcx r15,rax + + + xor rcx,rcx + mulx rax,rdi,rbx + adcx r14,rdi + adox r15,rax + + mulx rax,rdi,rbp + mov rdx,rbx + adcx r15,rdi + adox rax,rcx + adcx rcx,rax + + + mulx rbx,rdi,rbp + mov rdx,QWORD PTR[rsi] + add rcx,rdi +DB 102,72,15,126,199 + adc rbx,0 + + + xor rbp,rbp + adcx r8,r8 + adcx r9,r9 + adcx r10,r10 + adcx r11,r11 + adcx r12,r12 + + + mulx rax,rdx,rdx 
+ mov QWORD PTR[rdi],rdx + mov rdx,QWORD PTR[8+rsi] + adox r8,rax + mov QWORD PTR[8+rdi],r8 + + mulx rax,r8,rdx + mov rdx,QWORD PTR[16+rsi] + adox r9,r8 + adox r10,rax + mov QWORD PTR[16+rdi],r9 + mov QWORD PTR[24+rdi],r10 + + mulx r9,r8,rdx + mov rdx,QWORD PTR[24+rsi] + adox r11,r8 + adox r12,r9 + adcx r13,r13 + adcx r14,r14 + mov QWORD PTR[32+rdi],r11 + mov QWORD PTR[40+rdi],r12 + + mulx r9,r8,rdx + mov rdx,QWORD PTR[32+rsi] + adox r13,r8 + adox r14,r9 + adcx r15,r15 + adcx rcx,rcx + mov QWORD PTR[48+rdi],r13 + mov QWORD PTR[56+rdi],r14 + + mulx r9,r8,rdx + mov rdx,QWORD PTR[40+rsi] + adox r15,r8 + adox rcx,r9 + adcx rbx,rbx + adcx rbp,rbp + mov QWORD PTR[64+rdi],r15 + mov QWORD PTR[72+rdi],rcx + + mulx r9,r8,rdx + adox rbx,r8 + adox rbp,r9 + + mov QWORD PTR[80+rdi],rbx + mov QWORD PTR[88+rdi],rbp + + DB 0F3h,0C3h ;repret +__sqrx_384 ENDP + + + +PUBLIC redcx_mont_384 + + +ALIGN 32 +redcx_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_redcx_mont_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_redcx_mont_384:: + + + mov rbx,rdx + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_redcx_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_redcx_mont_384:: +redcx_mont_384 ENDP + + + + +PUBLIC fromx_mont_384 + + +ALIGN 32 +fromx_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_fromx_mont_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_fromx_mont_384:: + + + mov rbx,rdx + call __mulx_by_1_mont_384 + + + + + mov rax,r14 + mov rcx,r15 + mov rdx,r8 + mov rbp,r9 + + sub r14,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + mov r13,r10 + sbb r8,QWORD PTR[16+rbx] + sbb r9,QWORD PTR[24+rbx] + sbb r10,QWORD PTR[32+rbx] + mov rsi,r11 + sbb r11,QWORD PTR[40+rbx] + + cmovc r14,rax + cmovc r15,rcx + cmovc r8,rdx + mov QWORD PTR[rdi],r14 + cmovc r9,rbp + mov QWORD PTR[8+rdi],r15 + cmovc r10,r13 + mov QWORD PTR[16+rdi],r8 + cmovc r11,rsi + mov QWORD PTR[24+rdi],r9 + mov QWORD PTR[32+rdi],r10 + mov QWORD PTR[40+rdi],r11 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_fromx_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_fromx_mont_384:: +fromx_mont_384 ENDP + +ALIGN 32 +__mulx_by_1_mont_384 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov rdx,rcx + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + imul rdx,r8 + + + xor r14,r14 + mulx rbp,rax,QWORD PTR[rbx] + adcx r8,rax + adox r9,rbp + + mulx rbp,rax,QWORD PTR[8+rbx] + adcx r9,rax + adox r10,rbp + + mulx rbp,rax,QWORD PTR[16+rbx] + adcx r10,rax + adox r11,rbp + + mulx rbp,rax,QWORD PTR[24+rbx] + adcx 
r11,rax + adox r12,rbp + + mulx rbp,rax,QWORD PTR[32+rbx] + adcx r12,rax + adox r13,rbp + + mulx rbp,rax,QWORD PTR[40+rbx] + mov rdx,rcx + adcx r13,rax + adox rbp,r14 + adcx r14,rbp + imul rdx,r9 + + + xor r15,r15 + mulx rbp,rax,QWORD PTR[rbx] + adcx r9,rax + adox r10,rbp + + mulx rbp,rax,QWORD PTR[8+rbx] + adcx r10,rax + adox r11,rbp + + mulx rbp,rax,QWORD PTR[16+rbx] + adcx r11,rax + adox r12,rbp + + mulx rbp,rax,QWORD PTR[24+rbx] + adcx r12,rax + adox r13,rbp + + mulx rbp,rax,QWORD PTR[32+rbx] + adcx r13,rax + adox r14,rbp + + mulx rbp,rax,QWORD PTR[40+rbx] + mov rdx,rcx + adcx r14,rax + adox rbp,r15 + adcx r15,rbp + imul rdx,r10 + + + xor r8,r8 + mulx rbp,rax,QWORD PTR[rbx] + adcx r10,rax + adox r11,rbp + + mulx rbp,rax,QWORD PTR[8+rbx] + adcx r11,rax + adox r12,rbp + + mulx rbp,rax,QWORD PTR[16+rbx] + adcx r12,rax + adox r13,rbp + + mulx rbp,rax,QWORD PTR[24+rbx] + adcx r13,rax + adox r14,rbp + + mulx rbp,rax,QWORD PTR[32+rbx] + adcx r14,rax + adox r15,rbp + + mulx rbp,rax,QWORD PTR[40+rbx] + mov rdx,rcx + adcx r15,rax + adox rbp,r8 + adcx r8,rbp + imul rdx,r11 + + + xor r9,r9 + mulx rbp,rax,QWORD PTR[rbx] + adcx r11,rax + adox r12,rbp + + mulx rbp,rax,QWORD PTR[8+rbx] + adcx r12,rax + adox r13,rbp + + mulx rbp,rax,QWORD PTR[16+rbx] + adcx r13,rax + adox r14,rbp + + mulx rbp,rax,QWORD PTR[24+rbx] + adcx r14,rax + adox r15,rbp + + mulx rbp,rax,QWORD PTR[32+rbx] + adcx r15,rax + adox r8,rbp + + mulx rbp,rax,QWORD PTR[40+rbx] + mov rdx,rcx + adcx r8,rax + adox rbp,r9 + adcx r9,rbp + imul rdx,r12 + + + xor r10,r10 + mulx rbp,rax,QWORD PTR[rbx] + adcx r12,rax + adox r13,rbp + + mulx rbp,rax,QWORD PTR[8+rbx] + adcx r13,rax + adox r14,rbp + + mulx rbp,rax,QWORD PTR[16+rbx] + adcx r14,rax + adox r15,rbp + + mulx rbp,rax,QWORD PTR[24+rbx] + adcx r15,rax + adox r8,rbp + + mulx rbp,rax,QWORD PTR[32+rbx] + adcx r8,rax + adox r9,rbp + + mulx rbp,rax,QWORD PTR[40+rbx] + mov rdx,rcx + adcx r9,rax + adox rbp,r10 + adcx r10,rbp + imul rdx,r13 + + + xor r11,r11 + mulx rbp,rax,QWORD PTR[rbx] + adcx r13,rax + adox r14,rbp + + mulx rbp,rax,QWORD PTR[8+rbx] + adcx r14,rax + adox r15,rbp + + mulx rbp,rax,QWORD PTR[16+rbx] + adcx r15,rax + adox r8,rbp + + mulx rbp,rax,QWORD PTR[24+rbx] + adcx r8,rax + adox r9,rbp + + mulx rbp,rax,QWORD PTR[32+rbx] + adcx r9,rax + adox r10,rbp + + mulx rbp,rax,QWORD PTR[40+rbx] + mov rdx,rcx + adcx r10,rax + adox rbp,r11 + adcx r11,rbp + DB 0F3h,0C3h ;repret +__mulx_by_1_mont_384 ENDP + + +ALIGN 32 +__redc_tail_mont_384 PROC PRIVATE + DB 243,15,30,250 + add r14,QWORD PTR[48+rsi] + mov rax,r14 + adc r15,QWORD PTR[56+rsi] + adc r8,QWORD PTR[64+rsi] + adc r9,QWORD PTR[72+rsi] + mov rcx,r15 + adc r10,QWORD PTR[80+rsi] + adc r11,QWORD PTR[88+rsi] + sbb r12,r12 + + + + + mov rdx,r8 + mov rbp,r9 + + sub r14,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + mov r13,r10 + sbb r8,QWORD PTR[16+rbx] + sbb r9,QWORD PTR[24+rbx] + sbb r10,QWORD PTR[32+rbx] + mov rsi,r11 + sbb r11,QWORD PTR[40+rbx] + sbb r12,0 + + cmovc r14,rax + cmovc r15,rcx + cmovc r8,rdx + mov QWORD PTR[rdi],r14 + cmovc r9,rbp + mov QWORD PTR[8+rdi],r15 + cmovc r10,r13 + mov QWORD PTR[16+rdi],r8 + cmovc r11,rsi + mov QWORD PTR[24+rdi],r9 + mov QWORD PTR[32+rdi],r10 + mov QWORD PTR[40+rdi],r11 + + DB 0F3h,0C3h ;repret +__redc_tail_mont_384 ENDP + +PUBLIC sgn0x_pty_mont_384 + + +ALIGN 32 +sgn0x_pty_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sgn0x_pty_mont_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + 
push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_sgn0x_pty_mont_384:: + + + mov rbx,rsi + lea rsi,QWORD PTR[rdi] + mov rcx,rdx + call __mulx_by_1_mont_384 + + xor rax,rax + mov r13,r14 + add r14,r14 + adc r15,r15 + adc r8,r8 + adc r9,r9 + adc r10,r10 + adc r11,r11 + adc rax,0 + + sub r14,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + sbb r8,QWORD PTR[16+rbx] + sbb r9,QWORD PTR[24+rbx] + sbb r10,QWORD PTR[32+rbx] + sbb r11,QWORD PTR[40+rbx] + sbb rax,0 + + not rax + and r13,1 + and rax,2 + or rax,r13 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sgn0x_pty_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sgn0x_pty_mont_384:: +sgn0x_pty_mont_384 ENDP + +PUBLIC sgn0x_pty_mont_384x + + +ALIGN 32 +sgn0x_pty_mont_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sgn0x_pty_mont_384x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_sgn0x_pty_mont_384x:: + + + mov rbx,rsi + lea rsi,QWORD PTR[48+rdi] + mov rcx,rdx + call __mulx_by_1_mont_384 + + mov r12,r14 + or r14,r15 + or r14,r8 + or r14,r9 + or r14,r10 + or r14,r11 + + lea rsi,QWORD PTR[rdi] + xor rdi,rdi + mov r13,r12 + add r12,r12 + adc r15,r15 + adc r8,r8 + adc r9,r9 + adc r10,r10 + adc r11,r11 + adc rdi,0 + + sub r12,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + sbb r8,QWORD PTR[16+rbx] + sbb r9,QWORD PTR[24+rbx] + sbb r10,QWORD PTR[32+rbx] + sbb r11,QWORD PTR[40+rbx] + sbb rdi,0 + + mov QWORD PTR[rsp],r14 + not rdi + and r13,1 + and rdi,2 + or rdi,r13 + + call __mulx_by_1_mont_384 + + mov r12,r14 + or r14,r15 + or r14,r8 + or r14,r9 + or r14,r10 + or r14,r11 + + xor rax,rax + mov r13,r12 + add r12,r12 + adc r15,r15 + adc r8,r8 + adc r9,r9 + adc r10,r10 + adc r11,r11 + adc rax,0 + + sub r12,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + sbb r8,QWORD PTR[16+rbx] + sbb r9,QWORD PTR[24+rbx] + sbb r10,QWORD PTR[32+rbx] + sbb r11,QWORD PTR[40+rbx] + sbb rax,0 + + mov r12,QWORD PTR[rsp] + + not rax + + test r14,r14 + cmovz r13,rdi + + test r12,r12 + cmovnz rax,rdi + + and r13,1 + and rax,2 + or rax,r13 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sgn0x_pty_mont_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sgn0x_pty_mont_384x:: +sgn0x_pty_mont_384x ENDP +PUBLIC mulx_mont_384 + + +ALIGN 32 +mulx_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mulx_mont_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD PTR[40+rsp] + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + lea rsp,QWORD PTR[((-24))+rsp] + +$L$SEH_body_mulx_mont_384:: + + + mov rbx,rdx + mov rdx,QWORD PTR[rdx] + mov r14,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rax,QWORD PTR[16+rsi] + mov r12,QWORD PTR[24+rsi] +DB 102,72,15,110,199 + mov rdi,QWORD PTR[32+rsi] + mov rbp,QWORD PTR[40+rsi] + lea rsi,QWORD PTR[((-128))+rsi] + lea rcx,QWORD 
PTR[((-128))+rcx] + mov QWORD PTR[rsp],r8 + + mulx r9,r8,r14 + call __mulx_mont_384 + + mov r15,QWORD PTR[24+rsp] + + mov r14,QWORD PTR[32+rsp] + + mov r13,QWORD PTR[40+rsp] + + mov r12,QWORD PTR[48+rsp] + + mov rbx,QWORD PTR[56+rsp] + + mov rbp,QWORD PTR[64+rsp] + + lea rsp,QWORD PTR[72+rsp] + +$L$SEH_epilogue_mulx_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mulx_mont_384:: +mulx_mont_384 ENDP + +ALIGN 32 +__mulx_mont_384 PROC PRIVATE + DB 243,15,30,250 + + mulx r10,r14,r15 + mulx r11,r15,rax + add r9,r14 + mulx r12,rax,r12 + adc r10,r15 + mulx r13,rdi,rdi + adc r11,rax + mulx r14,rbp,rbp + mov rdx,QWORD PTR[8+rbx] + adc r12,rdi + adc r13,rbp + adc r14,0 + xor r15,r15 + + mov QWORD PTR[16+rsp],r8 + imul r8,QWORD PTR[8+rsp] + + + xor rax,rax + mulx rbp,rdi,QWORD PTR[((0+128))+rsi] + adox r9,rdi + adcx r10,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rsi] + adox r10,rdi + adcx r11,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rsi] + adox r11,rdi + adcx r12,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rsi] + adox r12,rdi + adcx r13,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rsi] + adox r13,rdi + adcx r14,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rsi] + mov rdx,r8 + adox r14,rdi + adcx r15,rbp + adox r15,rax + adox rax,rax + + + xor r8,r8 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx rdi,QWORD PTR[16+rsp] + adox r9,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r9,rdi + adox r10,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r10,rdi + adox r11,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx r11,rdi + adox r12,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx r12,rdi + adox r13,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + mov rdx,QWORD PTR[16+rbx] + adcx r13,rdi + adox r14,rbp + adcx r14,r8 + adox r15,r8 + adcx r15,r8 + adox rax,r8 + adcx rax,r8 + mov QWORD PTR[16+rsp],r9 + imul r9,QWORD PTR[8+rsp] + + + xor r8,r8 + mulx rbp,rdi,QWORD PTR[((0+128))+rsi] + adox r10,rdi + adcx r11,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rsi] + adox r11,rdi + adcx r12,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rsi] + adox r12,rdi + adcx r13,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rsi] + adox r13,rdi + adcx r14,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rsi] + adox r14,rdi + adcx r15,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rsi] + mov rdx,r9 + adox r15,rdi + adcx rax,rbp + adox rax,r8 + adox r8,r8 + + + xor r9,r9 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx rdi,QWORD PTR[16+rsp] + adox r10,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r10,rdi + adox r11,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r11,rdi + adox r12,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx r12,rdi + adox r13,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx r13,rdi + adox r14,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + mov rdx,QWORD PTR[24+rbx] + adcx r14,rdi + adox r15,rbp + adcx r15,r9 + adox rax,r9 + adcx rax,r9 + adox r8,r9 + adcx r8,r9 + mov QWORD PTR[16+rsp],r10 + imul r10,QWORD PTR[8+rsp] + + + xor r9,r9 + mulx rbp,rdi,QWORD PTR[((0+128))+rsi] + adox r11,rdi + adcx r12,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rsi] + adox r12,rdi + adcx r13,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rsi] + adox r13,rdi + adcx r14,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rsi] + adox r14,rdi + adcx r15,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rsi] + adox r15,rdi + adcx rax,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rsi] + mov rdx,r10 + adox rax,rdi + adcx r8,rbp + adox r8,r9 + adox r9,r9 + + + xor r10,r10 + mulx rbp,rdi,QWORD 
PTR[((0+128))+rcx] + adcx rdi,QWORD PTR[16+rsp] + adox r11,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r11,rdi + adox r12,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r12,rdi + adox r13,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx r13,rdi + adox r14,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx r14,rdi + adox r15,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + mov rdx,QWORD PTR[32+rbx] + adcx r15,rdi + adox rax,rbp + adcx rax,r10 + adox r8,r10 + adcx r8,r10 + adox r9,r10 + adcx r9,r10 + mov QWORD PTR[16+rsp],r11 + imul r11,QWORD PTR[8+rsp] + + + xor r10,r10 + mulx rbp,rdi,QWORD PTR[((0+128))+rsi] + adox r12,rdi + adcx r13,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rsi] + adox r13,rdi + adcx r14,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rsi] + adox r14,rdi + adcx r15,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rsi] + adox r15,rdi + adcx rax,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rsi] + adox rax,rdi + adcx r8,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rsi] + mov rdx,r11 + adox r8,rdi + adcx r9,rbp + adox r9,r10 + adox r10,r10 + + + xor r11,r11 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx rdi,QWORD PTR[16+rsp] + adox r12,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r12,rdi + adox r13,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r13,rdi + adox r14,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx r14,rdi + adox r15,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx r15,rdi + adox rax,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + mov rdx,QWORD PTR[40+rbx] + adcx rax,rdi + adox r8,rbp + adcx r8,r11 + adox r9,r11 + adcx r9,r11 + adox r10,r11 + adcx r10,r11 + mov QWORD PTR[16+rsp],r12 + imul r12,QWORD PTR[8+rsp] + + + xor r11,r11 + mulx rbp,rdi,QWORD PTR[((0+128))+rsi] + adox r13,rdi + adcx r14,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rsi] + adox r14,rdi + adcx r15,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rsi] + adox r15,rdi + adcx rax,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rsi] + adox rax,rdi + adcx r8,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rsi] + adox r8,rdi + adcx r9,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rsi] + mov rdx,r12 + adox r9,rdi + adcx r10,rbp + adox r10,r11 + adox r11,r11 + + + xor r12,r12 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx rdi,QWORD PTR[16+rsp] + adox r13,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r13,rdi + adox r14,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r14,rdi + adox r15,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx r15,rdi + adox rax,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx rax,rdi + adox r8,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + mov rdx,r13 + adcx r8,rdi + adox r9,rbp + adcx r9,r12 + adox r10,r12 + adcx r10,r12 + adox r11,r12 + adcx r11,r12 + imul rdx,QWORD PTR[8+rsp] +DB 102,72,15,126,195 + + + xor r12,r12 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx r13,rdi + adox r14,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r14,rdi + adox r15,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r15,rdi + adox rax,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx rax,rdi + adox r8,rbp + mov r13,r15 + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx r8,rdi + adox r9,rbp + mov rsi,rax + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + adcx r9,rdi + adox r10,rbp + mov rdx,r14 + adcx r10,r12 + adox r11,r12 + lea rcx,QWORD PTR[128+rcx] + mov r12,r8 + adc r11,0 + + + + + sub r14,QWORD PTR[rcx] + sbb r15,QWORD PTR[8+rcx] + mov rdi,r9 + sbb rax,QWORD PTR[16+rcx] + sbb r8,QWORD PTR[24+rcx] + sbb r9,QWORD PTR[32+rcx] + mov rbp,r10 + sbb r10,QWORD 
PTR[40+rcx] + sbb r11,0 + + cmovnc rdx,r14 + cmovc r15,r13 + cmovc rax,rsi + cmovnc r12,r8 + mov QWORD PTR[rbx],rdx + cmovnc rdi,r9 + mov QWORD PTR[8+rbx],r15 + cmovnc rbp,r10 + mov QWORD PTR[16+rbx],rax + mov QWORD PTR[24+rbx],r12 + mov QWORD PTR[32+rbx],rdi + mov QWORD PTR[40+rbx],rbp + + DB 0F3h,0C3h ;repret + +__mulx_mont_384 ENDP +PUBLIC sqrx_mont_384 + + +ALIGN 32 +sqrx_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqrx_mont_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + lea rsp,QWORD PTR[((-24))+rsp] + +$L$SEH_body_sqrx_mont_384:: + + + mov r8,rcx + lea rcx,QWORD PTR[((-128))+rdx] + mov rdx,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rax,QWORD PTR[16+rsi] + mov r12,QWORD PTR[24+rsi] +DB 102,72,15,110,199 + mov rdi,QWORD PTR[32+rsi] + mov rbp,QWORD PTR[40+rsi] + + lea rbx,QWORD PTR[rsi] + mov QWORD PTR[rsp],r8 + lea rsi,QWORD PTR[((-128))+rsi] + + mulx r9,r8,rdx + call __mulx_mont_384 + + mov r15,QWORD PTR[24+rsp] + + mov r14,QWORD PTR[32+rsp] + + mov r13,QWORD PTR[40+rsp] + + mov r12,QWORD PTR[48+rsp] + + mov rbx,QWORD PTR[56+rsp] + + mov rbp,QWORD PTR[64+rsp] + + lea rsp,QWORD PTR[72+rsp] + +$L$SEH_epilogue_sqrx_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqrx_mont_384:: +sqrx_mont_384 ENDP + +PUBLIC sqrx_n_mul_mont_384 + + +ALIGN 32 +sqrx_n_mul_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqrx_n_mul_mont_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD PTR[40+rsp] + mov r9,QWORD PTR[48+rsp] + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + lea rsp,QWORD PTR[((-24))+rsp] + +$L$SEH_body_sqrx_n_mul_mont_384:: + + + mov r10,rdx + mov rdx,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rax,QWORD PTR[16+rsi] + mov rbx,rsi + mov r12,QWORD PTR[24+rsi] +DB 102,72,15,110,199 + mov rdi,QWORD PTR[32+rsi] + mov rbp,QWORD PTR[40+rsi] + + mov QWORD PTR[rsp],r8 + mov QWORD PTR[16+rsp],r9 + movq xmm2,QWORD PTR[r9] + +$L$oop_sqrx_384:: + movd xmm1,r10d + lea rsi,QWORD PTR[((-128))+rbx] + lea rcx,QWORD PTR[((-128))+rcx] + + mulx r9,r8,rdx + call __mulx_mont_384 + + movd r10d,xmm1 + dec r10d + jnz $L$oop_sqrx_384 + + mov r14,rdx +DB 102,72,15,126,210 + lea rsi,QWORD PTR[((-128))+rbx] + mov rbx,QWORD PTR[16+rsp] + lea rcx,QWORD PTR[((-128))+rcx] + + mulx r9,r8,r14 + call __mulx_mont_384 + + mov r15,QWORD PTR[24+rsp] + + mov r14,QWORD PTR[32+rsp] + + mov r13,QWORD PTR[40+rsp] + + mov r12,QWORD PTR[48+rsp] + + mov rbx,QWORD PTR[56+rsp] + + mov rbp,QWORD PTR[64+rsp] + + lea rsp,QWORD PTR[72+rsp] + +$L$SEH_epilogue_sqrx_n_mul_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqrx_n_mul_mont_384:: +sqrx_n_mul_mont_384 ENDP + +PUBLIC sqrx_n_mul_mont_383 + + +ALIGN 32 +sqrx_n_mul_mont_383 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqrx_n_mul_mont_383:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD PTR[40+rsp] + mov r9,QWORD PTR[48+rsp] + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + lea rsp,QWORD PTR[((-24))+rsp] + +$L$SEH_body_sqrx_n_mul_mont_383:: + + + mov r10,rdx + mov 
rdx,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rax,QWORD PTR[16+rsi] + mov rbx,rsi + mov r12,QWORD PTR[24+rsi] +DB 102,72,15,110,199 + mov rdi,QWORD PTR[32+rsi] + mov rbp,QWORD PTR[40+rsi] + + mov QWORD PTR[rsp],r8 + mov QWORD PTR[16+rsp],r9 + movq xmm2,QWORD PTR[r9] + lea rcx,QWORD PTR[((-128))+rcx] + +$L$oop_sqrx_383:: + movd xmm1,r10d + lea rsi,QWORD PTR[((-128))+rbx] + + mulx r9,r8,rdx + call __mulx_mont_383_nonred + + movd r10d,xmm1 + dec r10d + jnz $L$oop_sqrx_383 + + mov r14,rdx +DB 102,72,15,126,210 + lea rsi,QWORD PTR[((-128))+rbx] + mov rbx,QWORD PTR[16+rsp] + + mulx r9,r8,r14 + call __mulx_mont_384 + + mov r15,QWORD PTR[24+rsp] + + mov r14,QWORD PTR[32+rsp] + + mov r13,QWORD PTR[40+rsp] + + mov r12,QWORD PTR[48+rsp] + + mov rbx,QWORD PTR[56+rsp] + + mov rbp,QWORD PTR[64+rsp] + + lea rsp,QWORD PTR[72+rsp] + +$L$SEH_epilogue_sqrx_n_mul_mont_383:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqrx_n_mul_mont_383:: +sqrx_n_mul_mont_383 ENDP + +ALIGN 32 +__mulx_mont_383_nonred PROC PRIVATE + DB 243,15,30,250 + + mulx r10,r14,r15 + mulx r11,r15,rax + add r9,r14 + mulx r12,rax,r12 + adc r10,r15 + mulx r13,rdi,rdi + adc r11,rax + mulx r14,rbp,rbp + mov rdx,QWORD PTR[8+rbx] + adc r12,rdi + adc r13,rbp + adc r14,0 + mov rax,r8 + imul r8,QWORD PTR[8+rsp] + + + xor r15,r15 + mulx rbp,rdi,QWORD PTR[((0+128))+rsi] + adox r9,rdi + adcx r10,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rsi] + adox r10,rdi + adcx r11,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rsi] + adox r11,rdi + adcx r12,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rsi] + adox r12,rdi + adcx r13,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rsi] + adox r13,rdi + adcx r14,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rsi] + mov rdx,r8 + adox r14,rdi + adcx rbp,r15 + adox r15,rbp + + + xor r8,r8 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx rax,rdi + adox r9,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r9,rdi + adox r10,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r10,rdi + adox r11,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx r11,rdi + adox r12,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx r12,rdi + adox r13,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + mov rdx,QWORD PTR[16+rbx] + adcx r13,rdi + adox r14,rbp + adcx r14,rax + adox r15,rax + adcx r15,rax + mov r8,r9 + imul r9,QWORD PTR[8+rsp] + + + xor rax,rax + mulx rbp,rdi,QWORD PTR[((0+128))+rsi] + adox r10,rdi + adcx r11,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rsi] + adox r11,rdi + adcx r12,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rsi] + adox r12,rdi + adcx r13,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rsi] + adox r13,rdi + adcx r14,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rsi] + adox r14,rdi + adcx r15,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rsi] + mov rdx,r9 + adox r15,rdi + adcx rbp,rax + adox rax,rbp + + + xor r9,r9 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx r8,rdi + adox r10,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r10,rdi + adox r11,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r11,rdi + adox r12,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx r12,rdi + adox r13,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx r13,rdi + adox r14,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + mov rdx,QWORD PTR[24+rbx] + adcx r14,rdi + adox r15,rbp + adcx r15,r8 + adox rax,r8 + adcx rax,r8 + mov r9,r10 + imul r10,QWORD PTR[8+rsp] + + + xor r8,r8 + mulx rbp,rdi,QWORD PTR[((0+128))+rsi] + adox r11,rdi + adcx r12,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rsi] + 
adox r12,rdi + adcx r13,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rsi] + adox r13,rdi + adcx r14,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rsi] + adox r14,rdi + adcx r15,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rsi] + adox r15,rdi + adcx rax,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rsi] + mov rdx,r10 + adox rax,rdi + adcx rbp,r8 + adox r8,rbp + + + xor r10,r10 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx r9,rdi + adox r11,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r11,rdi + adox r12,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r12,rdi + adox r13,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx r13,rdi + adox r14,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx r14,rdi + adox r15,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + mov rdx,QWORD PTR[32+rbx] + adcx r15,rdi + adox rax,rbp + adcx rax,r9 + adox r8,r9 + adcx r8,r9 + mov r10,r11 + imul r11,QWORD PTR[8+rsp] + + + xor r9,r9 + mulx rbp,rdi,QWORD PTR[((0+128))+rsi] + adox r12,rdi + adcx r13,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rsi] + adox r13,rdi + adcx r14,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rsi] + adox r14,rdi + adcx r15,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rsi] + adox r15,rdi + adcx rax,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rsi] + adox rax,rdi + adcx r8,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rsi] + mov rdx,r11 + adox r8,rdi + adcx rbp,r9 + adox r9,rbp + + + xor r11,r11 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx r10,rdi + adox r12,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r12,rdi + adox r13,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r13,rdi + adox r14,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx r14,rdi + adox r15,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx r15,rdi + adox rax,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + mov rdx,QWORD PTR[40+rbx] + adcx rax,rdi + adox r8,rbp + adcx r8,r10 + adox r9,r10 + adcx r9,r10 + mov r11,r12 + imul r12,QWORD PTR[8+rsp] + + + xor r10,r10 + mulx rbp,rdi,QWORD PTR[((0+128))+rsi] + adox r13,rdi + adcx r14,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rsi] + adox r14,rdi + adcx r15,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rsi] + adox r15,rdi + adcx rax,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rsi] + adox rax,rdi + adcx r8,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rsi] + adox r8,rdi + adcx r9,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rsi] + mov rdx,r12 + adox r9,rdi + adcx rbp,r10 + adox r10,rbp + + + xor r12,r12 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx r11,rdi + adox r13,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r13,rdi + adox r14,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r14,rdi + adox r15,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx r15,rdi + adox rax,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx rax,rdi + adox r8,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + mov rdx,r13 + adcx r8,rdi + adox r9,rbp + adcx r9,r11 + adox r10,r11 + adcx r10,r11 + imul rdx,QWORD PTR[8+rsp] +DB 102,72,15,126,195 + + + xor r12,r12 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx r13,rdi + adox r14,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r14,rdi + adox r15,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r15,rdi + adox rax,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx rax,rdi + adox r8,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx r8,rdi + adox r9,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + mov rdx,r14 + adcx r9,rdi + adox r10,rbp + adc r10,0 + mov r12,r8 + + mov QWORD PTR[rbx],r14 + mov QWORD PTR[8+rbx],r15 + mov QWORD PTR[16+rbx],rax + mov 
rdi,r9 + mov QWORD PTR[24+rbx],r8 + mov QWORD PTR[32+rbx],r9 + mov QWORD PTR[40+rbx],r10 + mov rbp,r10 + + DB 0F3h,0C3h ;repret + +__mulx_mont_383_nonred ENDP +PUBLIC sqrx_mont_382x + + +ALIGN 32 +sqrx_mont_382x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqrx_mont_382x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,136 + +$L$SEH_body_sqrx_mont_382x:: + + + mov QWORD PTR[rsp],rcx + mov rcx,rdx + mov QWORD PTR[16+rsp],rsi + movq xmm0,rdi + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov r14,r8 + add r8,QWORD PTR[48+rsi] + mov r15,r9 + adc r9,QWORD PTR[56+rsi] + mov rax,r10 + adc r10,QWORD PTR[64+rsi] + mov rdx,r11 + adc r11,QWORD PTR[72+rsi] + mov rbx,r12 + adc r12,QWORD PTR[80+rsi] + mov rbp,r13 + adc r13,QWORD PTR[88+rsi] + + sub r14,QWORD PTR[48+rsi] + sbb r15,QWORD PTR[56+rsi] + sbb rax,QWORD PTR[64+rsi] + sbb rdx,QWORD PTR[72+rsi] + sbb rbx,QWORD PTR[80+rsi] + sbb rbp,QWORD PTR[88+rsi] + sbb rdi,rdi + + mov QWORD PTR[((32+0))+rsp],r8 + mov QWORD PTR[((32+8))+rsp],r9 + mov QWORD PTR[((32+16))+rsp],r10 + mov QWORD PTR[((32+24))+rsp],r11 + mov QWORD PTR[((32+32))+rsp],r12 + mov QWORD PTR[((32+40))+rsp],r13 + + mov QWORD PTR[((32+48))+rsp],r14 + mov QWORD PTR[((32+56))+rsp],r15 + mov QWORD PTR[((32+64))+rsp],rax + mov QWORD PTR[((32+72))+rsp],rdx + mov QWORD PTR[((32+80))+rsp],rbx + mov QWORD PTR[((32+88))+rsp],rbp + mov QWORD PTR[((32+96))+rsp],rdi + + + + lea rbx,QWORD PTR[48+rsi] + + mov rdx,QWORD PTR[48+rsi] + mov r14,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rax,QWORD PTR[16+rsi] + mov r12,QWORD PTR[24+rsi] + mov rdi,QWORD PTR[32+rsi] + mov rbp,QWORD PTR[40+rsi] + lea rsi,QWORD PTR[((-128))+rsi] + lea rcx,QWORD PTR[((-128))+rcx] + + mulx r9,r8,r14 + call __mulx_mont_383_nonred + add rdx,rdx + adc r15,r15 + adc rax,rax + adc r12,r12 + adc rdi,rdi + adc rbp,rbp + + mov QWORD PTR[48+rbx],rdx + mov QWORD PTR[56+rbx],r15 + mov QWORD PTR[64+rbx],rax + mov QWORD PTR[72+rbx],r12 + mov QWORD PTR[80+rbx],rdi + mov QWORD PTR[88+rbx],rbp + + lea rsi,QWORD PTR[((32-128))+rsp] + lea rbx,QWORD PTR[((32+48))+rsp] + + mov rdx,QWORD PTR[((32+48))+rsp] + mov r14,QWORD PTR[((32+0))+rsp] + mov r15,QWORD PTR[((32+8))+rsp] + mov rax,QWORD PTR[((32+16))+rsp] + mov r12,QWORD PTR[((32+24))+rsp] + mov rdi,QWORD PTR[((32+32))+rsp] + mov rbp,QWORD PTR[((32+40))+rsp] + + + + mulx r9,r8,r14 + call __mulx_mont_383_nonred + mov r14,QWORD PTR[((32+96))+rsp] + lea rcx,QWORD PTR[128+rcx] + mov r8,QWORD PTR[((32+0))+rsp] + and r8,r14 + mov r9,QWORD PTR[((32+8))+rsp] + and r9,r14 + mov r10,QWORD PTR[((32+16))+rsp] + and r10,r14 + mov r11,QWORD PTR[((32+24))+rsp] + and r11,r14 + mov r13,QWORD PTR[((32+32))+rsp] + and r13,r14 + and r14,QWORD PTR[((32+40))+rsp] + + sub rdx,r8 + mov r8,QWORD PTR[rcx] + sbb r15,r9 + mov r9,QWORD PTR[8+rcx] + sbb rax,r10 + mov r10,QWORD PTR[16+rcx] + sbb r12,r11 + mov r11,QWORD PTR[24+rcx] + sbb rdi,r13 + mov r13,QWORD PTR[32+rcx] + sbb rbp,r14 + sbb r14,r14 + + and r8,r14 + and r9,r14 + and r10,r14 + and r11,r14 + and r13,r14 + and r14,QWORD PTR[40+rcx] + + add rdx,r8 + adc r15,r9 + adc rax,r10 + adc r12,r11 + adc rdi,r13 + adc rbp,r14 + + mov QWORD PTR[rbx],rdx + mov QWORD PTR[8+rbx],r15 + mov QWORD PTR[16+rbx],rax + mov QWORD PTR[24+rbx],r12 + mov QWORD PTR[32+rbx],rdi + mov 
QWORD PTR[40+rbx],rbp + lea r8,QWORD PTR[136+rsp] + mov r15,QWORD PTR[r8] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_sqrx_mont_382x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqrx_mont_382x:: +sqrx_mont_382x ENDP +.text$ ENDS +.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_mulx_mont_384x + DD imagerel $L$SEH_body_mulx_mont_384x + DD imagerel $L$SEH_info_mulx_mont_384x_prologue + + DD imagerel $L$SEH_body_mulx_mont_384x + DD imagerel $L$SEH_epilogue_mulx_mont_384x + DD imagerel $L$SEH_info_mulx_mont_384x_body + + DD imagerel $L$SEH_epilogue_mulx_mont_384x + DD imagerel $L$SEH_end_mulx_mont_384x + DD imagerel $L$SEH_info_mulx_mont_384x_epilogue + + DD imagerel $L$SEH_begin_sqrx_mont_384x + DD imagerel $L$SEH_body_sqrx_mont_384x + DD imagerel $L$SEH_info_sqrx_mont_384x_prologue + + DD imagerel $L$SEH_body_sqrx_mont_384x + DD imagerel $L$SEH_epilogue_sqrx_mont_384x + DD imagerel $L$SEH_info_sqrx_mont_384x_body + + DD imagerel $L$SEH_epilogue_sqrx_mont_384x + DD imagerel $L$SEH_end_sqrx_mont_384x + DD imagerel $L$SEH_info_sqrx_mont_384x_epilogue + + DD imagerel $L$SEH_begin_mulx_382x + DD imagerel $L$SEH_body_mulx_382x + DD imagerel $L$SEH_info_mulx_382x_prologue + + DD imagerel $L$SEH_body_mulx_382x + DD imagerel $L$SEH_epilogue_mulx_382x + DD imagerel $L$SEH_info_mulx_382x_body + + DD imagerel $L$SEH_epilogue_mulx_382x + DD imagerel $L$SEH_end_mulx_382x + DD imagerel $L$SEH_info_mulx_382x_epilogue + + DD imagerel $L$SEH_begin_sqrx_382x + DD imagerel $L$SEH_body_sqrx_382x + DD imagerel $L$SEH_info_sqrx_382x_prologue + + DD imagerel $L$SEH_body_sqrx_382x + DD imagerel $L$SEH_epilogue_sqrx_382x + DD imagerel $L$SEH_info_sqrx_382x_body + + DD imagerel $L$SEH_epilogue_sqrx_382x + DD imagerel $L$SEH_end_sqrx_382x + DD imagerel $L$SEH_info_sqrx_382x_epilogue + + DD imagerel $L$SEH_begin_mulx_384 + DD imagerel $L$SEH_body_mulx_384 + DD imagerel $L$SEH_info_mulx_384_prologue + + DD imagerel $L$SEH_body_mulx_384 + DD imagerel $L$SEH_epilogue_mulx_384 + DD imagerel $L$SEH_info_mulx_384_body + + DD imagerel $L$SEH_epilogue_mulx_384 + DD imagerel $L$SEH_end_mulx_384 + DD imagerel $L$SEH_info_mulx_384_epilogue + + DD imagerel $L$SEH_begin_sqrx_384 + DD imagerel $L$SEH_body_sqrx_384 + DD imagerel $L$SEH_info_sqrx_384_prologue + + DD imagerel $L$SEH_body_sqrx_384 + DD imagerel $L$SEH_epilogue_sqrx_384 + DD imagerel $L$SEH_info_sqrx_384_body + + DD imagerel $L$SEH_epilogue_sqrx_384 + DD imagerel $L$SEH_end_sqrx_384 + DD imagerel $L$SEH_info_sqrx_384_epilogue + + DD imagerel $L$SEH_begin_redcx_mont_384 + DD imagerel $L$SEH_body_redcx_mont_384 + DD imagerel $L$SEH_info_redcx_mont_384_prologue + + DD imagerel $L$SEH_body_redcx_mont_384 + DD imagerel $L$SEH_epilogue_redcx_mont_384 + DD imagerel $L$SEH_info_redcx_mont_384_body + + DD imagerel $L$SEH_epilogue_redcx_mont_384 + DD imagerel $L$SEH_end_redcx_mont_384 + DD imagerel $L$SEH_info_redcx_mont_384_epilogue + + DD imagerel $L$SEH_begin_fromx_mont_384 + DD imagerel $L$SEH_body_fromx_mont_384 + DD imagerel $L$SEH_info_fromx_mont_384_prologue + + DD imagerel $L$SEH_body_fromx_mont_384 + DD imagerel $L$SEH_epilogue_fromx_mont_384 + DD imagerel $L$SEH_info_fromx_mont_384_body + + DD imagerel $L$SEH_epilogue_fromx_mont_384 + DD imagerel $L$SEH_end_fromx_mont_384 + DD imagerel $L$SEH_info_fromx_mont_384_epilogue + + DD imagerel 
$L$SEH_begin_sgn0x_pty_mont_384 + DD imagerel $L$SEH_body_sgn0x_pty_mont_384 + DD imagerel $L$SEH_info_sgn0x_pty_mont_384_prologue + + DD imagerel $L$SEH_body_sgn0x_pty_mont_384 + DD imagerel $L$SEH_epilogue_sgn0x_pty_mont_384 + DD imagerel $L$SEH_info_sgn0x_pty_mont_384_body + + DD imagerel $L$SEH_epilogue_sgn0x_pty_mont_384 + DD imagerel $L$SEH_end_sgn0x_pty_mont_384 + DD imagerel $L$SEH_info_sgn0x_pty_mont_384_epilogue + + DD imagerel $L$SEH_begin_sgn0x_pty_mont_384x + DD imagerel $L$SEH_body_sgn0x_pty_mont_384x + DD imagerel $L$SEH_info_sgn0x_pty_mont_384x_prologue + + DD imagerel $L$SEH_body_sgn0x_pty_mont_384x + DD imagerel $L$SEH_epilogue_sgn0x_pty_mont_384x + DD imagerel $L$SEH_info_sgn0x_pty_mont_384x_body + + DD imagerel $L$SEH_epilogue_sgn0x_pty_mont_384x + DD imagerel $L$SEH_end_sgn0x_pty_mont_384x + DD imagerel $L$SEH_info_sgn0x_pty_mont_384x_epilogue + + DD imagerel $L$SEH_begin_mulx_mont_384 + DD imagerel $L$SEH_body_mulx_mont_384 + DD imagerel $L$SEH_info_mulx_mont_384_prologue + + DD imagerel $L$SEH_body_mulx_mont_384 + DD imagerel $L$SEH_epilogue_mulx_mont_384 + DD imagerel $L$SEH_info_mulx_mont_384_body + + DD imagerel $L$SEH_epilogue_mulx_mont_384 + DD imagerel $L$SEH_end_mulx_mont_384 + DD imagerel $L$SEH_info_mulx_mont_384_epilogue + + DD imagerel $L$SEH_begin_sqrx_mont_384 + DD imagerel $L$SEH_body_sqrx_mont_384 + DD imagerel $L$SEH_info_sqrx_mont_384_prologue + + DD imagerel $L$SEH_body_sqrx_mont_384 + DD imagerel $L$SEH_epilogue_sqrx_mont_384 + DD imagerel $L$SEH_info_sqrx_mont_384_body + + DD imagerel $L$SEH_epilogue_sqrx_mont_384 + DD imagerel $L$SEH_end_sqrx_mont_384 + DD imagerel $L$SEH_info_sqrx_mont_384_epilogue + + DD imagerel $L$SEH_begin_sqrx_n_mul_mont_384 + DD imagerel $L$SEH_body_sqrx_n_mul_mont_384 + DD imagerel $L$SEH_info_sqrx_n_mul_mont_384_prologue + + DD imagerel $L$SEH_body_sqrx_n_mul_mont_384 + DD imagerel $L$SEH_epilogue_sqrx_n_mul_mont_384 + DD imagerel $L$SEH_info_sqrx_n_mul_mont_384_body + + DD imagerel $L$SEH_epilogue_sqrx_n_mul_mont_384 + DD imagerel $L$SEH_end_sqrx_n_mul_mont_384 + DD imagerel $L$SEH_info_sqrx_n_mul_mont_384_epilogue + + DD imagerel $L$SEH_begin_sqrx_n_mul_mont_383 + DD imagerel $L$SEH_body_sqrx_n_mul_mont_383 + DD imagerel $L$SEH_info_sqrx_n_mul_mont_383_prologue + + DD imagerel $L$SEH_body_sqrx_n_mul_mont_383 + DD imagerel $L$SEH_epilogue_sqrx_n_mul_mont_383 + DD imagerel $L$SEH_info_sqrx_n_mul_mont_383_body + + DD imagerel $L$SEH_epilogue_sqrx_n_mul_mont_383 + DD imagerel $L$SEH_end_sqrx_n_mul_mont_383 + DD imagerel $L$SEH_info_sqrx_n_mul_mont_383_epilogue + + DD imagerel $L$SEH_begin_sqrx_mont_382x + DD imagerel $L$SEH_body_sqrx_mont_382x + DD imagerel $L$SEH_info_sqrx_mont_382x_prologue + + DD imagerel $L$SEH_body_sqrx_mont_382x + DD imagerel $L$SEH_epilogue_sqrx_mont_382x + DD imagerel $L$SEH_info_sqrx_mont_382x_body + + DD imagerel $L$SEH_epilogue_sqrx_mont_382x + DD imagerel $L$SEH_end_sqrx_mont_382x + DD imagerel $L$SEH_info_sqrx_mont_382x_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_mulx_mont_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mulx_mont_384x_body:: +DB 1,0,18,0 +DB 000h,0f4h,029h,000h +DB 000h,0e4h,02ah,000h +DB 000h,0d4h,02bh,000h +DB 000h,0c4h,02ch,000h +DB 000h,034h,02dh,000h +DB 000h,054h,02eh,000h +DB 000h,074h,030h,000h +DB 000h,064h,031h,000h +DB 000h,001h,02fh,000h +$L$SEH_info_mulx_mont_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + 
+$L$SEH_info_sqrx_mont_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqrx_mont_384x_body:: +DB 1,0,18,0 +DB 000h,0f4h,011h,000h +DB 000h,0e4h,012h,000h +DB 000h,0d4h,013h,000h +DB 000h,0c4h,014h,000h +DB 000h,034h,015h,000h +DB 000h,054h,016h,000h +DB 000h,074h,018h,000h +DB 000h,064h,019h,000h +DB 000h,001h,017h,000h +$L$SEH_info_sqrx_mont_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mulx_382x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mulx_382x_body:: +DB 1,0,18,0 +DB 000h,0f4h,011h,000h +DB 000h,0e4h,012h,000h +DB 000h,0d4h,013h,000h +DB 000h,0c4h,014h,000h +DB 000h,034h,015h,000h +DB 000h,054h,016h,000h +DB 000h,074h,018h,000h +DB 000h,064h,019h,000h +DB 000h,001h,017h,000h +$L$SEH_info_mulx_382x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqrx_382x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqrx_382x_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_sqrx_382x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mulx_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mulx_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,000h,000h +DB 000h,0e4h,001h,000h +DB 000h,0d4h,002h,000h +DB 000h,0c4h,003h,000h +DB 000h,034h,004h,000h +DB 000h,054h,005h,000h +DB 000h,074h,007h,000h +DB 000h,064h,008h,000h +DB 000h,052h +DB 000h,000h +$L$SEH_info_mulx_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqrx_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqrx_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_sqrx_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_redcx_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_redcx_mont_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_redcx_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_fromx_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_fromx_mont_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_fromx_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sgn0x_pty_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sgn0x_pty_mont_384_body:: +DB 
1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_sgn0x_pty_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sgn0x_pty_mont_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sgn0x_pty_mont_384x_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_sgn0x_pty_mont_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mulx_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mulx_mont_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,003h,000h +DB 000h,0e4h,004h,000h +DB 000h,0d4h,005h,000h +DB 000h,0c4h,006h,000h +DB 000h,034h,007h,000h +DB 000h,054h,008h,000h +DB 000h,074h,00ah,000h +DB 000h,064h,00bh,000h +DB 000h,082h +DB 000h,000h +$L$SEH_info_mulx_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqrx_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqrx_mont_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,003h,000h +DB 000h,0e4h,004h,000h +DB 000h,0d4h,005h,000h +DB 000h,0c4h,006h,000h +DB 000h,034h,007h,000h +DB 000h,054h,008h,000h +DB 000h,074h,00ah,000h +DB 000h,064h,00bh,000h +DB 000h,082h +DB 000h,000h +$L$SEH_info_sqrx_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqrx_n_mul_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqrx_n_mul_mont_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,003h,000h +DB 000h,0e4h,004h,000h +DB 000h,0d4h,005h,000h +DB 000h,0c4h,006h,000h +DB 000h,034h,007h,000h +DB 000h,054h,008h,000h +DB 000h,074h,00ah,000h +DB 000h,064h,00bh,000h +DB 000h,082h +DB 000h,000h +$L$SEH_info_sqrx_n_mul_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqrx_n_mul_mont_383_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqrx_n_mul_mont_383_body:: +DB 1,0,17,0 +DB 000h,0f4h,003h,000h +DB 000h,0e4h,004h,000h +DB 000h,0d4h,005h,000h +DB 000h,0c4h,006h,000h +DB 000h,034h,007h,000h +DB 000h,054h,008h,000h +DB 000h,074h,00ah,000h +DB 000h,064h,00bh,000h +DB 000h,082h +DB 000h,000h +$L$SEH_info_sqrx_n_mul_mont_383_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqrx_mont_382x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqrx_mont_382x_body:: +DB 1,0,18,0 +DB 000h,0f4h,011h,000h +DB 000h,0e4h,012h,000h +DB 000h,0d4h,013h,000h +DB 000h,0c4h,014h,000h +DB 000h,034h,015h,000h +DB 000h,054h,016h,000h +DB 000h,074h,018h,000h +DB 000h,064h,019h,000h +DB 000h,001h,017h,000h +$L$SEH_info_sqrx_mont_382x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + + +.xdata ENDS +END diff --git a/build/win64/sha256-x86_64.asm b/build/win64/sha256-x86_64.asm new file mode 100644 index 00000000..801bac44 --- /dev/null +++ 
b/build/win64/sha256-x86_64.asm @@ -0,0 +1,1570 @@ +OPTION DOTNAME +.text$ SEGMENT ALIGN(256) 'CODE' + +ALIGN 64 + +K256:: + DD 0428a2f98h,071374491h,0b5c0fbcfh,0e9b5dba5h + DD 03956c25bh,059f111f1h,0923f82a4h,0ab1c5ed5h + DD 0d807aa98h,012835b01h,0243185beh,0550c7dc3h + DD 072be5d74h,080deb1feh,09bdc06a7h,0c19bf174h + DD 0e49b69c1h,0efbe4786h,00fc19dc6h,0240ca1cch + DD 02de92c6fh,04a7484aah,05cb0a9dch,076f988dah + DD 0983e5152h,0a831c66dh,0b00327c8h,0bf597fc7h + DD 0c6e00bf3h,0d5a79147h,006ca6351h,014292967h + DD 027b70a85h,02e1b2138h,04d2c6dfch,053380d13h + DD 0650a7354h,0766a0abbh,081c2c92eh,092722c85h + DD 0a2bfe8a1h,0a81a664bh,0c24b8b70h,0c76c51a3h + DD 0d192e819h,0d6990624h,0f40e3585h,0106aa070h + DD 019a4c116h,01e376c08h,02748774ch,034b0bcb5h + DD 0391c0cb3h,04ed8aa4ah,05b9cca4fh,0682e6ff3h + DD 0748f82eeh,078a5636fh,084c87814h,08cc70208h + DD 090befffah,0a4506cebh,0bef9a3f7h,0c67178f2h + + DD 000010203h,004050607h,008090a0bh,00c0d0e0fh + DD 003020100h,00b0a0908h,0ffffffffh,0ffffffffh + DD 0ffffffffh,0ffffffffh,003020100h,00b0a0908h +DB 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97 +DB 110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54 +DB 52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121 +DB 32,64,100,111,116,45,97,115,109,0 +PUBLIC sha256_block_data_order_shaext + + +ALIGN 64 +sha256_block_data_order_shaext PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sha256_block_data_order_shaext:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + sub rsp,058h + + movaps XMMWORD PTR[(-88)+r11],xmm6 + + movaps XMMWORD PTR[(-72)+r11],xmm7 + + movaps XMMWORD PTR[(-56)+r11],xmm8 + + movaps XMMWORD PTR[(-40)+r11],xmm9 + + movaps XMMWORD PTR[(-24)+r11],xmm10 + +$L$SEH_body_sha256_block_data_order_shaext:: + + lea rcx,QWORD PTR[((K256+128))] + movdqu xmm1,XMMWORD PTR[rdi] + movdqu xmm2,XMMWORD PTR[16+rdi] + movdqa xmm7,XMMWORD PTR[((256-128))+rcx] + + pshufd xmm0,xmm1,01bh + pshufd xmm1,xmm1,0b1h + pshufd xmm2,xmm2,01bh + movdqa xmm8,xmm7 +DB 102,15,58,15,202,8 + punpcklqdq xmm2,xmm0 + jmp $L$oop_shaext + +ALIGN 16 +$L$oop_shaext:: + movdqu xmm3,XMMWORD PTR[rsi] + movdqu xmm4,XMMWORD PTR[16+rsi] + movdqu xmm5,XMMWORD PTR[32+rsi] +DB 102,15,56,0,223 + movdqu xmm6,XMMWORD PTR[48+rsi] + + movdqa xmm0,XMMWORD PTR[((0-128))+rcx] + paddd xmm0,xmm3 +DB 102,15,56,0,231 + movdqa xmm10,xmm2 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + nop + movdqa xmm9,xmm1 +DB 15,56,203,202 + + movdqa xmm0,XMMWORD PTR[((16-128))+rcx] + paddd xmm0,xmm4 +DB 102,15,56,0,239 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + lea rsi,QWORD PTR[64+rsi] +DB 15,56,204,220 +DB 15,56,203,202 + + movdqa xmm0,XMMWORD PTR[((32-128))+rcx] + paddd xmm0,xmm5 +DB 102,15,56,0,247 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm6 +DB 102,15,58,15,253,4 + nop + paddd xmm3,xmm7 +DB 15,56,204,229 +DB 15,56,203,202 + + movdqa xmm0,XMMWORD PTR[((48-128))+rcx] + paddd xmm0,xmm6 +DB 15,56,205,222 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm3 +DB 102,15,58,15,254,4 + nop + paddd xmm4,xmm7 +DB 15,56,204,238 +DB 15,56,203,202 + movdqa xmm0,XMMWORD PTR[((64-128))+rcx] + paddd xmm0,xmm3 +DB 15,56,205,227 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm4 +DB 102,15,58,15,251,4 + nop + paddd xmm5,xmm7 +DB 15,56,204,243 +DB 15,56,203,202 + movdqa xmm0,XMMWORD PTR[((80-128))+rcx] + paddd xmm0,xmm4 +DB 15,56,205,236 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm5 +DB 102,15,58,15,252,4 + nop + paddd xmm6,xmm7 +DB 15,56,204,220 
+DB 15,56,203,202 + movdqa xmm0,XMMWORD PTR[((96-128))+rcx] + paddd xmm0,xmm5 +DB 15,56,205,245 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm6 +DB 102,15,58,15,253,4 + nop + paddd xmm3,xmm7 +DB 15,56,204,229 +DB 15,56,203,202 + movdqa xmm0,XMMWORD PTR[((112-128))+rcx] + paddd xmm0,xmm6 +DB 15,56,205,222 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm3 +DB 102,15,58,15,254,4 + nop + paddd xmm4,xmm7 +DB 15,56,204,238 +DB 15,56,203,202 + movdqa xmm0,XMMWORD PTR[((128-128))+rcx] + paddd xmm0,xmm3 +DB 15,56,205,227 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm4 +DB 102,15,58,15,251,4 + nop + paddd xmm5,xmm7 +DB 15,56,204,243 +DB 15,56,203,202 + movdqa xmm0,XMMWORD PTR[((144-128))+rcx] + paddd xmm0,xmm4 +DB 15,56,205,236 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm5 +DB 102,15,58,15,252,4 + nop + paddd xmm6,xmm7 +DB 15,56,204,220 +DB 15,56,203,202 + movdqa xmm0,XMMWORD PTR[((160-128))+rcx] + paddd xmm0,xmm5 +DB 15,56,205,245 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm6 +DB 102,15,58,15,253,4 + nop + paddd xmm3,xmm7 +DB 15,56,204,229 +DB 15,56,203,202 + movdqa xmm0,XMMWORD PTR[((176-128))+rcx] + paddd xmm0,xmm6 +DB 15,56,205,222 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm3 +DB 102,15,58,15,254,4 + nop + paddd xmm4,xmm7 +DB 15,56,204,238 +DB 15,56,203,202 + movdqa xmm0,XMMWORD PTR[((192-128))+rcx] + paddd xmm0,xmm3 +DB 15,56,205,227 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm4 +DB 102,15,58,15,251,4 + nop + paddd xmm5,xmm7 +DB 15,56,204,243 +DB 15,56,203,202 + movdqa xmm0,XMMWORD PTR[((208-128))+rcx] + paddd xmm0,xmm4 +DB 15,56,205,236 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm5 +DB 102,15,58,15,252,4 +DB 15,56,203,202 + paddd xmm6,xmm7 + + movdqa xmm0,XMMWORD PTR[((224-128))+rcx] + paddd xmm0,xmm5 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh +DB 15,56,205,245 + movdqa xmm7,xmm8 +DB 15,56,203,202 + + movdqa xmm0,XMMWORD PTR[((240-128))+rcx] + paddd xmm0,xmm6 + nop +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + dec rdx + nop +DB 15,56,203,202 + + paddd xmm2,xmm10 + paddd xmm1,xmm9 + jnz $L$oop_shaext + + pshufd xmm2,xmm2,0b1h + pshufd xmm7,xmm1,01bh + pshufd xmm1,xmm1,0b1h + punpckhqdq xmm1,xmm2 +DB 102,15,58,15,215,8 + + movdqu XMMWORD PTR[rdi],xmm1 + movdqu XMMWORD PTR[16+rdi],xmm2 + movaps xmm6,XMMWORD PTR[((-88))+r11] + movaps xmm7,XMMWORD PTR[((-72))+r11] + movaps xmm8,XMMWORD PTR[((-56))+r11] + movaps xmm9,XMMWORD PTR[((-40))+r11] + movaps xmm10,XMMWORD PTR[((-24))+r11] + mov rsp,r11 + +$L$SEH_epilogue_sha256_block_data_order_shaext:: + mov rdi,QWORD PTR[8+r11] ;WIN64 epilogue + mov rsi,QWORD PTR[16+r11] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sha256_block_data_order_shaext:: +sha256_block_data_order_shaext ENDP +PUBLIC sha256_block_data_order + + +ALIGN 64 +sha256_block_data_order PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sha256_block_data_order:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + shl rdx,4 + sub rsp,104 + + lea rdx,QWORD PTR[rdx*4+rsi] + mov QWORD PTR[rsp],rdi + + mov QWORD PTR[16+rsp],rdx + movaps XMMWORD PTR[32+rsp],xmm6 + + movaps XMMWORD PTR[48+rsp],xmm7 + + movaps XMMWORD PTR[64+rsp],xmm8 + + movaps XMMWORD PTR[80+rsp],xmm9 + + mov rbp,rsp + +$L$SEH_body_sha256_block_data_order:: + + + lea rsp,QWORD PTR[((-64))+rsp] + mov eax,DWORD PTR[rdi] + and rsp,-64 + mov ebx,DWORD PTR[4+rdi] + mov ecx,DWORD 
PTR[8+rdi] + mov edx,DWORD PTR[12+rdi] + mov r8d,DWORD PTR[16+rdi] + mov r9d,DWORD PTR[20+rdi] + mov r10d,DWORD PTR[24+rdi] + mov r11d,DWORD PTR[28+rdi] + + + jmp $L$loop_ssse3 +ALIGN 16 +$L$loop_ssse3:: + movdqa xmm7,XMMWORD PTR[((K256+256))] + mov QWORD PTR[8+rbp],rsi + movdqu xmm0,XMMWORD PTR[rsi] + movdqu xmm1,XMMWORD PTR[16+rsi] + movdqu xmm2,XMMWORD PTR[32+rsi] +DB 102,15,56,0,199 + movdqu xmm3,XMMWORD PTR[48+rsi] + lea rsi,QWORD PTR[K256] +DB 102,15,56,0,207 + movdqa xmm4,XMMWORD PTR[rsi] + movdqa xmm5,XMMWORD PTR[16+rsi] +DB 102,15,56,0,215 + paddd xmm4,xmm0 + movdqa xmm6,XMMWORD PTR[32+rsi] +DB 102,15,56,0,223 + movdqa xmm7,XMMWORD PTR[48+rsi] + paddd xmm5,xmm1 + paddd xmm6,xmm2 + paddd xmm7,xmm3 + movdqa XMMWORD PTR[rsp],xmm4 + mov r14d,eax + movdqa XMMWORD PTR[16+rsp],xmm5 + mov edi,ebx + movdqa XMMWORD PTR[32+rsp],xmm6 + xor edi,ecx + movdqa XMMWORD PTR[48+rsp],xmm7 + mov r13d,r8d + jmp $L$ssse3_00_47 + +ALIGN 16 +$L$ssse3_00_47:: + sub rsi,-64 + ror r13d,14 + movdqa xmm4,xmm1 + mov eax,r14d + mov r12d,r9d + movdqa xmm7,xmm3 + ror r14d,9 + xor r13d,r8d + xor r12d,r10d + ror r13d,5 + xor r14d,eax +DB 102,15,58,15,224,4 + and r12d,r8d + xor r13d,r8d +DB 102,15,58,15,250,4 + add r11d,DWORD PTR[rsp] + mov r15d,eax + xor r12d,r10d + ror r14d,11 + movdqa xmm5,xmm4 + xor r15d,ebx + add r11d,r12d + movdqa xmm6,xmm4 + ror r13d,6 + and edi,r15d + psrld xmm4,3 + xor r14d,eax + add r11d,r13d + xor edi,ebx + paddd xmm0,xmm7 + ror r14d,2 + add edx,r11d + psrld xmm6,7 + add r11d,edi + mov r13d,edx + pshufd xmm7,xmm3,250 + add r14d,r11d + ror r13d,14 + pslld xmm5,14 + mov r11d,r14d + mov r12d,r8d + pxor xmm4,xmm6 + ror r14d,9 + xor r13d,edx + xor r12d,r9d + ror r13d,5 + psrld xmm6,11 + xor r14d,r11d + pxor xmm4,xmm5 + and r12d,edx + xor r13d,edx + pslld xmm5,11 + add r10d,DWORD PTR[4+rsp] + mov edi,r11d + pxor xmm4,xmm6 + xor r12d,r9d + ror r14d,11 + movdqa xmm6,xmm7 + xor edi,eax + add r10d,r12d + pxor xmm4,xmm5 + ror r13d,6 + and r15d,edi + xor r14d,r11d + psrld xmm7,10 + add r10d,r13d + xor r15d,eax + paddd xmm0,xmm4 + ror r14d,2 + add ecx,r10d + psrlq xmm6,17 + add r10d,r15d + mov r13d,ecx + add r14d,r10d + pxor xmm7,xmm6 + ror r13d,14 + mov r10d,r14d + mov r12d,edx + ror r14d,9 + psrlq xmm6,2 + xor r13d,ecx + xor r12d,r8d + pxor xmm7,xmm6 + ror r13d,5 + xor r14d,r10d + and r12d,ecx + pshufd xmm7,xmm7,128 + xor r13d,ecx + add r9d,DWORD PTR[8+rsp] + mov r15d,r10d + psrldq xmm7,8 + xor r12d,r8d + ror r14d,11 + xor r15d,r11d + add r9d,r12d + ror r13d,6 + paddd xmm0,xmm7 + and edi,r15d + xor r14d,r10d + add r9d,r13d + pshufd xmm7,xmm0,80 + xor edi,r11d + ror r14d,2 + add ebx,r9d + movdqa xmm6,xmm7 + add r9d,edi + mov r13d,ebx + psrld xmm7,10 + add r14d,r9d + ror r13d,14 + psrlq xmm6,17 + mov r9d,r14d + mov r12d,ecx + pxor xmm7,xmm6 + ror r14d,9 + xor r13d,ebx + xor r12d,edx + ror r13d,5 + xor r14d,r9d + psrlq xmm6,2 + and r12d,ebx + xor r13d,ebx + add r8d,DWORD PTR[12+rsp] + pxor xmm7,xmm6 + mov edi,r9d + xor r12d,edx + ror r14d,11 + pshufd xmm7,xmm7,8 + xor edi,r10d + add r8d,r12d + movdqa xmm6,XMMWORD PTR[rsi] + ror r13d,6 + and r15d,edi + pslldq xmm7,8 + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + paddd xmm0,xmm7 + ror r14d,2 + add eax,r8d + add r8d,r15d + paddd xmm6,xmm0 + mov r13d,eax + add r14d,r8d + movdqa XMMWORD PTR[rsp],xmm6 + ror r13d,14 + movdqa xmm4,xmm2 + mov r8d,r14d + mov r12d,ebx + movdqa xmm7,xmm0 + ror r14d,9 + xor r13d,eax + xor r12d,ecx + ror r13d,5 + xor r14d,r8d +DB 102,15,58,15,225,4 + and r12d,eax + xor r13d,eax +DB 102,15,58,15,251,4 + add edx,DWORD PTR[16+rsp] + 
mov r15d,r8d + xor r12d,ecx + ror r14d,11 + movdqa xmm5,xmm4 + xor r15d,r9d + add edx,r12d + movdqa xmm6,xmm4 + ror r13d,6 + and edi,r15d + psrld xmm4,3 + xor r14d,r8d + add edx,r13d + xor edi,r9d + paddd xmm1,xmm7 + ror r14d,2 + add r11d,edx + psrld xmm6,7 + add edx,edi + mov r13d,r11d + pshufd xmm7,xmm0,250 + add r14d,edx + ror r13d,14 + pslld xmm5,14 + mov edx,r14d + mov r12d,eax + pxor xmm4,xmm6 + ror r14d,9 + xor r13d,r11d + xor r12d,ebx + ror r13d,5 + psrld xmm6,11 + xor r14d,edx + pxor xmm4,xmm5 + and r12d,r11d + xor r13d,r11d + pslld xmm5,11 + add ecx,DWORD PTR[20+rsp] + mov edi,edx + pxor xmm4,xmm6 + xor r12d,ebx + ror r14d,11 + movdqa xmm6,xmm7 + xor edi,r8d + add ecx,r12d + pxor xmm4,xmm5 + ror r13d,6 + and r15d,edi + xor r14d,edx + psrld xmm7,10 + add ecx,r13d + xor r15d,r8d + paddd xmm1,xmm4 + ror r14d,2 + add r10d,ecx + psrlq xmm6,17 + add ecx,r15d + mov r13d,r10d + add r14d,ecx + pxor xmm7,xmm6 + ror r13d,14 + mov ecx,r14d + mov r12d,r11d + ror r14d,9 + psrlq xmm6,2 + xor r13d,r10d + xor r12d,eax + pxor xmm7,xmm6 + ror r13d,5 + xor r14d,ecx + and r12d,r10d + pshufd xmm7,xmm7,128 + xor r13d,r10d + add ebx,DWORD PTR[24+rsp] + mov r15d,ecx + psrldq xmm7,8 + xor r12d,eax + ror r14d,11 + xor r15d,edx + add ebx,r12d + ror r13d,6 + paddd xmm1,xmm7 + and edi,r15d + xor r14d,ecx + add ebx,r13d + pshufd xmm7,xmm1,80 + xor edi,edx + ror r14d,2 + add r9d,ebx + movdqa xmm6,xmm7 + add ebx,edi + mov r13d,r9d + psrld xmm7,10 + add r14d,ebx + ror r13d,14 + psrlq xmm6,17 + mov ebx,r14d + mov r12d,r10d + pxor xmm7,xmm6 + ror r14d,9 + xor r13d,r9d + xor r12d,r11d + ror r13d,5 + xor r14d,ebx + psrlq xmm6,2 + and r12d,r9d + xor r13d,r9d + add eax,DWORD PTR[28+rsp] + pxor xmm7,xmm6 + mov edi,ebx + xor r12d,r11d + ror r14d,11 + pshufd xmm7,xmm7,8 + xor edi,ecx + add eax,r12d + movdqa xmm6,XMMWORD PTR[16+rsi] + ror r13d,6 + and r15d,edi + pslldq xmm7,8 + xor r14d,ebx + add eax,r13d + xor r15d,ecx + paddd xmm1,xmm7 + ror r14d,2 + add r8d,eax + add eax,r15d + paddd xmm6,xmm1 + mov r13d,r8d + add r14d,eax + movdqa XMMWORD PTR[16+rsp],xmm6 + ror r13d,14 + movdqa xmm4,xmm3 + mov eax,r14d + mov r12d,r9d + movdqa xmm7,xmm1 + ror r14d,9 + xor r13d,r8d + xor r12d,r10d + ror r13d,5 + xor r14d,eax +DB 102,15,58,15,226,4 + and r12d,r8d + xor r13d,r8d +DB 102,15,58,15,248,4 + add r11d,DWORD PTR[32+rsp] + mov r15d,eax + xor r12d,r10d + ror r14d,11 + movdqa xmm5,xmm4 + xor r15d,ebx + add r11d,r12d + movdqa xmm6,xmm4 + ror r13d,6 + and edi,r15d + psrld xmm4,3 + xor r14d,eax + add r11d,r13d + xor edi,ebx + paddd xmm2,xmm7 + ror r14d,2 + add edx,r11d + psrld xmm6,7 + add r11d,edi + mov r13d,edx + pshufd xmm7,xmm1,250 + add r14d,r11d + ror r13d,14 + pslld xmm5,14 + mov r11d,r14d + mov r12d,r8d + pxor xmm4,xmm6 + ror r14d,9 + xor r13d,edx + xor r12d,r9d + ror r13d,5 + psrld xmm6,11 + xor r14d,r11d + pxor xmm4,xmm5 + and r12d,edx + xor r13d,edx + pslld xmm5,11 + add r10d,DWORD PTR[36+rsp] + mov edi,r11d + pxor xmm4,xmm6 + xor r12d,r9d + ror r14d,11 + movdqa xmm6,xmm7 + xor edi,eax + add r10d,r12d + pxor xmm4,xmm5 + ror r13d,6 + and r15d,edi + xor r14d,r11d + psrld xmm7,10 + add r10d,r13d + xor r15d,eax + paddd xmm2,xmm4 + ror r14d,2 + add ecx,r10d + psrlq xmm6,17 + add r10d,r15d + mov r13d,ecx + add r14d,r10d + pxor xmm7,xmm6 + ror r13d,14 + mov r10d,r14d + mov r12d,edx + ror r14d,9 + psrlq xmm6,2 + xor r13d,ecx + xor r12d,r8d + pxor xmm7,xmm6 + ror r13d,5 + xor r14d,r10d + and r12d,ecx + pshufd xmm7,xmm7,128 + xor r13d,ecx + add r9d,DWORD PTR[40+rsp] + mov r15d,r10d + psrldq xmm7,8 + xor r12d,r8d + ror r14d,11 + xor 
r15d,r11d + add r9d,r12d + ror r13d,6 + paddd xmm2,xmm7 + and edi,r15d + xor r14d,r10d + add r9d,r13d + pshufd xmm7,xmm2,80 + xor edi,r11d + ror r14d,2 + add ebx,r9d + movdqa xmm6,xmm7 + add r9d,edi + mov r13d,ebx + psrld xmm7,10 + add r14d,r9d + ror r13d,14 + psrlq xmm6,17 + mov r9d,r14d + mov r12d,ecx + pxor xmm7,xmm6 + ror r14d,9 + xor r13d,ebx + xor r12d,edx + ror r13d,5 + xor r14d,r9d + psrlq xmm6,2 + and r12d,ebx + xor r13d,ebx + add r8d,DWORD PTR[44+rsp] + pxor xmm7,xmm6 + mov edi,r9d + xor r12d,edx + ror r14d,11 + pshufd xmm7,xmm7,8 + xor edi,r10d + add r8d,r12d + movdqa xmm6,XMMWORD PTR[32+rsi] + ror r13d,6 + and r15d,edi + pslldq xmm7,8 + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + paddd xmm2,xmm7 + ror r14d,2 + add eax,r8d + add r8d,r15d + paddd xmm6,xmm2 + mov r13d,eax + add r14d,r8d + movdqa XMMWORD PTR[32+rsp],xmm6 + ror r13d,14 + movdqa xmm4,xmm0 + mov r8d,r14d + mov r12d,ebx + movdqa xmm7,xmm2 + ror r14d,9 + xor r13d,eax + xor r12d,ecx + ror r13d,5 + xor r14d,r8d +DB 102,15,58,15,227,4 + and r12d,eax + xor r13d,eax +DB 102,15,58,15,249,4 + add edx,DWORD PTR[48+rsp] + mov r15d,r8d + xor r12d,ecx + ror r14d,11 + movdqa xmm5,xmm4 + xor r15d,r9d + add edx,r12d + movdqa xmm6,xmm4 + ror r13d,6 + and edi,r15d + psrld xmm4,3 + xor r14d,r8d + add edx,r13d + xor edi,r9d + paddd xmm3,xmm7 + ror r14d,2 + add r11d,edx + psrld xmm6,7 + add edx,edi + mov r13d,r11d + pshufd xmm7,xmm2,250 + add r14d,edx + ror r13d,14 + pslld xmm5,14 + mov edx,r14d + mov r12d,eax + pxor xmm4,xmm6 + ror r14d,9 + xor r13d,r11d + xor r12d,ebx + ror r13d,5 + psrld xmm6,11 + xor r14d,edx + pxor xmm4,xmm5 + and r12d,r11d + xor r13d,r11d + pslld xmm5,11 + add ecx,DWORD PTR[52+rsp] + mov edi,edx + pxor xmm4,xmm6 + xor r12d,ebx + ror r14d,11 + movdqa xmm6,xmm7 + xor edi,r8d + add ecx,r12d + pxor xmm4,xmm5 + ror r13d,6 + and r15d,edi + xor r14d,edx + psrld xmm7,10 + add ecx,r13d + xor r15d,r8d + paddd xmm3,xmm4 + ror r14d,2 + add r10d,ecx + psrlq xmm6,17 + add ecx,r15d + mov r13d,r10d + add r14d,ecx + pxor xmm7,xmm6 + ror r13d,14 + mov ecx,r14d + mov r12d,r11d + ror r14d,9 + psrlq xmm6,2 + xor r13d,r10d + xor r12d,eax + pxor xmm7,xmm6 + ror r13d,5 + xor r14d,ecx + and r12d,r10d + pshufd xmm7,xmm7,128 + xor r13d,r10d + add ebx,DWORD PTR[56+rsp] + mov r15d,ecx + psrldq xmm7,8 + xor r12d,eax + ror r14d,11 + xor r15d,edx + add ebx,r12d + ror r13d,6 + paddd xmm3,xmm7 + and edi,r15d + xor r14d,ecx + add ebx,r13d + pshufd xmm7,xmm3,80 + xor edi,edx + ror r14d,2 + add r9d,ebx + movdqa xmm6,xmm7 + add ebx,edi + mov r13d,r9d + psrld xmm7,10 + add r14d,ebx + ror r13d,14 + psrlq xmm6,17 + mov ebx,r14d + mov r12d,r10d + pxor xmm7,xmm6 + ror r14d,9 + xor r13d,r9d + xor r12d,r11d + ror r13d,5 + xor r14d,ebx + psrlq xmm6,2 + and r12d,r9d + xor r13d,r9d + add eax,DWORD PTR[60+rsp] + pxor xmm7,xmm6 + mov edi,ebx + xor r12d,r11d + ror r14d,11 + pshufd xmm7,xmm7,8 + xor edi,ecx + add eax,r12d + movdqa xmm6,XMMWORD PTR[48+rsi] + ror r13d,6 + and r15d,edi + pslldq xmm7,8 + xor r14d,ebx + add eax,r13d + xor r15d,ecx + paddd xmm3,xmm7 + ror r14d,2 + add r8d,eax + add eax,r15d + paddd xmm6,xmm3 + mov r13d,r8d + add r14d,eax + movdqa XMMWORD PTR[48+rsp],xmm6 + cmp BYTE PTR[67+rsi],0 + jne $L$ssse3_00_47 + ror r13d,14 + mov eax,r14d + mov r12d,r9d + ror r14d,9 + xor r13d,r8d + xor r12d,r10d + ror r13d,5 + xor r14d,eax + and r12d,r8d + xor r13d,r8d + add r11d,DWORD PTR[rsp] + mov r15d,eax + xor r12d,r10d + ror r14d,11 + xor r15d,ebx + add r11d,r12d + ror r13d,6 + and edi,r15d + xor r14d,eax + add r11d,r13d + xor edi,ebx + ror r14d,2 + add 
edx,r11d + add r11d,edi + mov r13d,edx + add r14d,r11d + ror r13d,14 + mov r11d,r14d + mov r12d,r8d + ror r14d,9 + xor r13d,edx + xor r12d,r9d + ror r13d,5 + xor r14d,r11d + and r12d,edx + xor r13d,edx + add r10d,DWORD PTR[4+rsp] + mov edi,r11d + xor r12d,r9d + ror r14d,11 + xor edi,eax + add r10d,r12d + ror r13d,6 + and r15d,edi + xor r14d,r11d + add r10d,r13d + xor r15d,eax + ror r14d,2 + add ecx,r10d + add r10d,r15d + mov r13d,ecx + add r14d,r10d + ror r13d,14 + mov r10d,r14d + mov r12d,edx + ror r14d,9 + xor r13d,ecx + xor r12d,r8d + ror r13d,5 + xor r14d,r10d + and r12d,ecx + xor r13d,ecx + add r9d,DWORD PTR[8+rsp] + mov r15d,r10d + xor r12d,r8d + ror r14d,11 + xor r15d,r11d + add r9d,r12d + ror r13d,6 + and edi,r15d + xor r14d,r10d + add r9d,r13d + xor edi,r11d + ror r14d,2 + add ebx,r9d + add r9d,edi + mov r13d,ebx + add r14d,r9d + ror r13d,14 + mov r9d,r14d + mov r12d,ecx + ror r14d,9 + xor r13d,ebx + xor r12d,edx + ror r13d,5 + xor r14d,r9d + and r12d,ebx + xor r13d,ebx + add r8d,DWORD PTR[12+rsp] + mov edi,r9d + xor r12d,edx + ror r14d,11 + xor edi,r10d + add r8d,r12d + ror r13d,6 + and r15d,edi + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + ror r14d,2 + add eax,r8d + add r8d,r15d + mov r13d,eax + add r14d,r8d + ror r13d,14 + mov r8d,r14d + mov r12d,ebx + ror r14d,9 + xor r13d,eax + xor r12d,ecx + ror r13d,5 + xor r14d,r8d + and r12d,eax + xor r13d,eax + add edx,DWORD PTR[16+rsp] + mov r15d,r8d + xor r12d,ecx + ror r14d,11 + xor r15d,r9d + add edx,r12d + ror r13d,6 + and edi,r15d + xor r14d,r8d + add edx,r13d + xor edi,r9d + ror r14d,2 + add r11d,edx + add edx,edi + mov r13d,r11d + add r14d,edx + ror r13d,14 + mov edx,r14d + mov r12d,eax + ror r14d,9 + xor r13d,r11d + xor r12d,ebx + ror r13d,5 + xor r14d,edx + and r12d,r11d + xor r13d,r11d + add ecx,DWORD PTR[20+rsp] + mov edi,edx + xor r12d,ebx + ror r14d,11 + xor edi,r8d + add ecx,r12d + ror r13d,6 + and r15d,edi + xor r14d,edx + add ecx,r13d + xor r15d,r8d + ror r14d,2 + add r10d,ecx + add ecx,r15d + mov r13d,r10d + add r14d,ecx + ror r13d,14 + mov ecx,r14d + mov r12d,r11d + ror r14d,9 + xor r13d,r10d + xor r12d,eax + ror r13d,5 + xor r14d,ecx + and r12d,r10d + xor r13d,r10d + add ebx,DWORD PTR[24+rsp] + mov r15d,ecx + xor r12d,eax + ror r14d,11 + xor r15d,edx + add ebx,r12d + ror r13d,6 + and edi,r15d + xor r14d,ecx + add ebx,r13d + xor edi,edx + ror r14d,2 + add r9d,ebx + add ebx,edi + mov r13d,r9d + add r14d,ebx + ror r13d,14 + mov ebx,r14d + mov r12d,r10d + ror r14d,9 + xor r13d,r9d + xor r12d,r11d + ror r13d,5 + xor r14d,ebx + and r12d,r9d + xor r13d,r9d + add eax,DWORD PTR[28+rsp] + mov edi,ebx + xor r12d,r11d + ror r14d,11 + xor edi,ecx + add eax,r12d + ror r13d,6 + and r15d,edi + xor r14d,ebx + add eax,r13d + xor r15d,ecx + ror r14d,2 + add r8d,eax + add eax,r15d + mov r13d,r8d + add r14d,eax + ror r13d,14 + mov eax,r14d + mov r12d,r9d + ror r14d,9 + xor r13d,r8d + xor r12d,r10d + ror r13d,5 + xor r14d,eax + and r12d,r8d + xor r13d,r8d + add r11d,DWORD PTR[32+rsp] + mov r15d,eax + xor r12d,r10d + ror r14d,11 + xor r15d,ebx + add r11d,r12d + ror r13d,6 + and edi,r15d + xor r14d,eax + add r11d,r13d + xor edi,ebx + ror r14d,2 + add edx,r11d + add r11d,edi + mov r13d,edx + add r14d,r11d + ror r13d,14 + mov r11d,r14d + mov r12d,r8d + ror r14d,9 + xor r13d,edx + xor r12d,r9d + ror r13d,5 + xor r14d,r11d + and r12d,edx + xor r13d,edx + add r10d,DWORD PTR[36+rsp] + mov edi,r11d + xor r12d,r9d + ror r14d,11 + xor edi,eax + add r10d,r12d + ror r13d,6 + and r15d,edi + xor r14d,r11d + add r10d,r13d + xor r15d,eax + ror r14d,2 + 
add ecx,r10d + add r10d,r15d + mov r13d,ecx + add r14d,r10d + ror r13d,14 + mov r10d,r14d + mov r12d,edx + ror r14d,9 + xor r13d,ecx + xor r12d,r8d + ror r13d,5 + xor r14d,r10d + and r12d,ecx + xor r13d,ecx + add r9d,DWORD PTR[40+rsp] + mov r15d,r10d + xor r12d,r8d + ror r14d,11 + xor r15d,r11d + add r9d,r12d + ror r13d,6 + and edi,r15d + xor r14d,r10d + add r9d,r13d + xor edi,r11d + ror r14d,2 + add ebx,r9d + add r9d,edi + mov r13d,ebx + add r14d,r9d + ror r13d,14 + mov r9d,r14d + mov r12d,ecx + ror r14d,9 + xor r13d,ebx + xor r12d,edx + ror r13d,5 + xor r14d,r9d + and r12d,ebx + xor r13d,ebx + add r8d,DWORD PTR[44+rsp] + mov edi,r9d + xor r12d,edx + ror r14d,11 + xor edi,r10d + add r8d,r12d + ror r13d,6 + and r15d,edi + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + ror r14d,2 + add eax,r8d + add r8d,r15d + mov r13d,eax + add r14d,r8d + ror r13d,14 + mov r8d,r14d + mov r12d,ebx + ror r14d,9 + xor r13d,eax + xor r12d,ecx + ror r13d,5 + xor r14d,r8d + and r12d,eax + xor r13d,eax + add edx,DWORD PTR[48+rsp] + mov r15d,r8d + xor r12d,ecx + ror r14d,11 + xor r15d,r9d + add edx,r12d + ror r13d,6 + and edi,r15d + xor r14d,r8d + add edx,r13d + xor edi,r9d + ror r14d,2 + add r11d,edx + add edx,edi + mov r13d,r11d + add r14d,edx + ror r13d,14 + mov edx,r14d + mov r12d,eax + ror r14d,9 + xor r13d,r11d + xor r12d,ebx + ror r13d,5 + xor r14d,edx + and r12d,r11d + xor r13d,r11d + add ecx,DWORD PTR[52+rsp] + mov edi,edx + xor r12d,ebx + ror r14d,11 + xor edi,r8d + add ecx,r12d + ror r13d,6 + and r15d,edi + xor r14d,edx + add ecx,r13d + xor r15d,r8d + ror r14d,2 + add r10d,ecx + add ecx,r15d + mov r13d,r10d + add r14d,ecx + ror r13d,14 + mov ecx,r14d + mov r12d,r11d + ror r14d,9 + xor r13d,r10d + xor r12d,eax + ror r13d,5 + xor r14d,ecx + and r12d,r10d + xor r13d,r10d + add ebx,DWORD PTR[56+rsp] + mov r15d,ecx + xor r12d,eax + ror r14d,11 + xor r15d,edx + add ebx,r12d + ror r13d,6 + and edi,r15d + xor r14d,ecx + add ebx,r13d + xor edi,edx + ror r14d,2 + add r9d,ebx + add ebx,edi + mov r13d,r9d + add r14d,ebx + ror r13d,14 + mov ebx,r14d + mov r12d,r10d + ror r14d,9 + xor r13d,r9d + xor r12d,r11d + ror r13d,5 + xor r14d,ebx + and r12d,r9d + xor r13d,r9d + add eax,DWORD PTR[60+rsp] + mov edi,ebx + xor r12d,r11d + ror r14d,11 + xor edi,ecx + add eax,r12d + ror r13d,6 + and r15d,edi + xor r14d,ebx + add eax,r13d + xor r15d,ecx + ror r14d,2 + add r8d,eax + add eax,r15d + mov r13d,r8d + add r14d,eax + mov rdi,QWORD PTR[rbp] + mov eax,r14d + mov rsi,QWORD PTR[8+rbp] + + add eax,DWORD PTR[rdi] + add ebx,DWORD PTR[4+rdi] + add ecx,DWORD PTR[8+rdi] + add edx,DWORD PTR[12+rdi] + add r8d,DWORD PTR[16+rdi] + add r9d,DWORD PTR[20+rdi] + add r10d,DWORD PTR[24+rdi] + add r11d,DWORD PTR[28+rdi] + + lea rsi,QWORD PTR[64+rsi] + cmp rsi,QWORD PTR[16+rbp] + + mov DWORD PTR[rdi],eax + mov DWORD PTR[4+rdi],ebx + mov DWORD PTR[8+rdi],ecx + mov DWORD PTR[12+rdi],edx + mov DWORD PTR[16+rdi],r8d + mov DWORD PTR[20+rdi],r9d + mov DWORD PTR[24+rdi],r10d + mov DWORD PTR[28+rdi],r11d + jb $L$loop_ssse3 + + xorps xmm0,xmm0 + lea r11,QWORD PTR[((104+48))+rbp] + + movaps XMMWORD PTR[rsp],xmm0 + movaps XMMWORD PTR[16+rsp],xmm0 + movaps XMMWORD PTR[32+rsp],xmm0 + movaps XMMWORD PTR[48+rsp],xmm0 + movaps xmm6,XMMWORD PTR[32+rbp] + movaps xmm7,XMMWORD PTR[48+rbp] + movaps xmm8,XMMWORD PTR[64+rbp] + movaps xmm9,XMMWORD PTR[80+rbp] + mov r15,QWORD PTR[104+rbp] + + mov r14,QWORD PTR[((-40))+r11] + + mov r13,QWORD PTR[((-32))+r11] + + mov r12,QWORD PTR[((-24))+r11] + + mov rbx,QWORD PTR[((-16))+r11] + + mov rbp,QWORD PTR[((-8))+r11] + 
+$L$SEH_epilogue_sha256_block_data_order:: + mov rdi,QWORD PTR[8+r11] ;WIN64 epilogue + mov rsi,QWORD PTR[16+r11] + + lea rsp,QWORD PTR[r11] + DB 0F3h,0C3h ;repret + +$L$SEH_end_sha256_block_data_order:: +sha256_block_data_order ENDP +PUBLIC sha256_emit + + +ALIGN 16 +sha256_emit PROC PUBLIC + DB 243,15,30,250 + mov r8,QWORD PTR[rdx] + mov r9,QWORD PTR[8+rdx] + mov r10,QWORD PTR[16+rdx] + bswap r8 + mov r11,QWORD PTR[24+rdx] + bswap r9 + mov DWORD PTR[4+rcx],r8d + bswap r10 + mov DWORD PTR[12+rcx],r9d + bswap r11 + mov DWORD PTR[20+rcx],r10d + shr r8,32 + mov DWORD PTR[28+rcx],r11d + shr r9,32 + mov DWORD PTR[rcx],r8d + shr r10,32 + mov DWORD PTR[8+rcx],r9d + shr r11,32 + mov DWORD PTR[16+rcx],r10d + mov DWORD PTR[24+rcx],r11d + DB 0F3h,0C3h ;repret +sha256_emit ENDP + +PUBLIC sha256_bcopy + + +ALIGN 16 +sha256_bcopy PROC PUBLIC + DB 243,15,30,250 + sub rcx,rdx +$L$oop_bcopy:: + movzx eax,BYTE PTR[rdx] + lea rdx,QWORD PTR[1+rdx] + mov BYTE PTR[((-1))+rdx*1+rcx],al + dec r8 + jnz $L$oop_bcopy + DB 0F3h,0C3h ;repret +sha256_bcopy ENDP + +PUBLIC sha256_hcopy + + +ALIGN 16 +sha256_hcopy PROC PUBLIC + DB 243,15,30,250 + mov r8,QWORD PTR[rdx] + mov r9,QWORD PTR[8+rdx] + mov r10,QWORD PTR[16+rdx] + mov r11,QWORD PTR[24+rdx] + mov QWORD PTR[rcx],r8 + mov QWORD PTR[8+rcx],r9 + mov QWORD PTR[16+rcx],r10 + mov QWORD PTR[24+rcx],r11 + DB 0F3h,0C3h ;repret +sha256_hcopy ENDP +.text$ ENDS +.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_sha256_block_data_order_shaext + DD imagerel $L$SEH_body_sha256_block_data_order_shaext + DD imagerel $L$SEH_info_sha256_block_data_order_shaext_prologue + + DD imagerel $L$SEH_body_sha256_block_data_order_shaext + DD imagerel $L$SEH_epilogue_sha256_block_data_order_shaext + DD imagerel $L$SEH_info_sha256_block_data_order_shaext_body + + DD imagerel $L$SEH_epilogue_sha256_block_data_order_shaext + DD imagerel $L$SEH_end_sha256_block_data_order_shaext + DD imagerel $L$SEH_info_sha256_block_data_order_shaext_epilogue + + DD imagerel $L$SEH_begin_sha256_block_data_order + DD imagerel $L$SEH_body_sha256_block_data_order + DD imagerel $L$SEH_info_sha256_block_data_order_prologue + + DD imagerel $L$SEH_body_sha256_block_data_order + DD imagerel $L$SEH_epilogue_sha256_block_data_order + DD imagerel $L$SEH_info_sha256_block_data_order_body + + DD imagerel $L$SEH_epilogue_sha256_block_data_order + DD imagerel $L$SEH_end_sha256_block_data_order + DD imagerel $L$SEH_info_sha256_block_data_order_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_sha256_block_data_order_shaext_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sha256_block_data_order_shaext_body:: +DB 1,0,15,0 +DB 000h,068h,000h,000h +DB 000h,078h,001h,000h +DB 000h,088h,002h,000h +DB 000h,098h,003h,000h +DB 000h,0a8h,004h,000h +DB 000h,074h,00ch,000h +DB 000h,064h,00dh,000h +DB 000h,0a2h +DB 000h,000h,000h,000h,000h,000h +$L$SEH_info_sha256_block_data_order_shaext_epilogue:: +DB 1,0,5,11 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,003h +DB 000h,000h + +$L$SEH_info_sha256_block_data_order_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sha256_block_data_order_body:: +DB 1,0,26,5 +DB 000h,068h,002h,000h +DB 000h,078h,003h,000h +DB 000h,088h,004h,000h +DB 000h,098h,005h,000h +DB 000h,0f4h,00dh,000h +DB 000h,0e4h,00eh,000h +DB 000h,0d4h,00fh,000h +DB 000h,0c4h,010h,000h +DB 000h,034h,011h,000h +DB 000h,074h,014h,000h +DB 000h,064h,015h,000h +DB 000h,003h +DB 000h,001h,012h,000h +DB 
000h,050h +$L$SEH_info_sha256_block_data_order_epilogue:: +DB 1,0,5,11 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,003h +DB 000h,000h + + +.xdata ENDS +END diff --git a/src/aggregate.c b/src/aggregate.c new file mode 100644 index 00000000..630db6ec --- /dev/null +++ b/src/aggregate.c @@ -0,0 +1,435 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * Usage pattern on single-processor system is + * + * blst_pairing_init(ctx); + * blst_pairing_aggregate_pk_in_g1(ctx, PK1, aggregated_signature, message1); + * blst_pairing_aggregate_pk_in_g1(ctx, PK2, NULL, message2); + * ... + * blst_pairing_commit(ctx); + * blst_pairing_finalverify(ctx, NULL); + * + *********************************************************************** + * Usage pattern on multi-processor system is + * + * blst_pairing_init(pk0); + * blst_pairing_init(pk1); + * ... + * start threads each processing a slice of PKs and messages: + * blst_pairing_aggregate_pk_in_g1(pkx, PK[], NULL, message[]); + * blst_pairing_commit(pkx); + * ... + * blst_fp12 gtsig; + * blst_aggregated_in_g2(>sig, aggregated_signature); + * join threads and merge their contexts: + * blst_pairing_merge(pk0, pk1); + * blst_pairing_merge(pk0, pk2); + * ... + * blst_pairing_finalverify(pk0, gtsig); + */ + +#ifndef N_MAX +# define N_MAX 8 +#endif + +typedef union { POINTonE1 e1; POINTonE2 e2; } AggregatedSignature; +typedef struct { + unsigned int min_sig_or_pk; + unsigned int nelems; + vec384fp12 GT; + AggregatedSignature AggrSign; + POINTonE2_affine Q[N_MAX]; + POINTonE1_affine P[N_MAX]; +} PAIRING; + +enum { AGGR_UNDEFINED = 0, AGGR_MIN_SIG, AGGR_MIN_PK, + AGGR_SIGN_SET = 0x10, AGGR_GT_SET = 0x20 }; + +size_t blst_pairing_sizeof() +{ return (sizeof(PAIRING) + 7) & ~(size_t)7; } + +void blst_pairing_init(PAIRING *ctx) +{ ctx->min_sig_or_pk = AGGR_UNDEFINED; ctx->nelems = 0; } + +#define FROM_AFFINE(out,in) do { \ + vec_copy((out)->X, in->X, 2*sizeof(in->X)), \ + vec_select((out)->Z, in->X, BLS12_381_Rx.p, sizeof(in->X), \ + vec_is_zero(in, 2*sizeof(in->X))); } while(0) + +static BLST_ERROR PAIRING_Aggregate_PK_in_G2(PAIRING *ctx, + const POINTonE2_affine *PK, + const POINTonE1 *signature, + int hash_or_encode, + const void *msg, size_t msg_len, + const void *DST, size_t DST_len, + const void *aug, size_t aug_len) +{ + if (ctx->min_sig_or_pk & AGGR_MIN_PK) + return BLST_AGGR_TYPE_MISMATCH; + + ctx->min_sig_or_pk |= AGGR_MIN_SIG; + + if (signature != NULL) { + if (!POINTonE1_in_G1(signature)) + return BLST_POINT_NOT_IN_GROUP; + + if (ctx->min_sig_or_pk & AGGR_SIGN_SET) { + POINTonE1_dadd_affine(&ctx->AggrSign.e1, &ctx->AggrSign.e1, + signature); + } else { + ctx->min_sig_or_pk |= AGGR_SIGN_SET; + FROM_AFFINE(&ctx->AggrSign.e1, signature); + } + } + + if (PK != NULL) { + size_t n; + POINTonE1 H[1]; + + if (hash_or_encode) + Hash_to_G1(H, msg, msg_len, DST, DST_len, aug, aug_len); + else + Encode_to_G1(H, msg, msg_len, DST, DST_len, aug, aug_len); + + POINTonE1_from_Jacobian(H, H); + + n = ctx->nelems; + vec_copy(ctx->Q + n, PK, sizeof(POINTonE2_affine)); + vec_copy(ctx->P + n, H, sizeof(POINTonE1_affine)); + if (++n == N_MAX) { + if (ctx->min_sig_or_pk & AGGR_GT_SET) { + vec384fp12 GT; + miller_loop_n(GT, ctx->Q, ctx->P, n); + mul_fp12(ctx->GT, ctx->GT, GT); + } else { + miller_loop_n(ctx->GT, ctx->Q, ctx->P, n); + ctx->min_sig_or_pk |= AGGR_GT_SET; + } + n = 0; + } + ctx->nelems = n; + } + + return BLST_SUCCESS; +} + +BLST_ERROR 
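+/* public entry point, a thin wrapper that simply forwards to PAIRING_Aggregate_PK_in_G2() above */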
blst_pairing_aggregate_pk_in_g2(PAIRING *ctx, + const POINTonE2_affine *PK, + const POINTonE1 *signature, + int hash_or_encode, + const void *msg, size_t msg_len, + const void *DST, size_t DST_len, + const void *aug, size_t aug_len) +{ return PAIRING_Aggregate_PK_in_G2(ctx, PK, signature, hash_or_encode, + msg, msg_len, DST, DST_len, aug, aug_len); +} + +static BLST_ERROR PAIRING_Aggregate_PK_in_G1(PAIRING *ctx, + const POINTonE1_affine *PK, + const POINTonE2 *signature, + int hash_or_encode, + const void *msg, size_t msg_len, + const void *DST, size_t DST_len, + const void *aug, size_t aug_len) +{ + if (ctx->min_sig_or_pk & AGGR_MIN_SIG) + return BLST_AGGR_TYPE_MISMATCH; + + ctx->min_sig_or_pk |= AGGR_MIN_PK; + + if (signature != NULL) { + if (!POINTonE2_in_G2(signature)) + return BLST_POINT_NOT_IN_GROUP; + + if (ctx->min_sig_or_pk & AGGR_SIGN_SET) { + POINTonE2_dadd_affine(&ctx->AggrSign.e2, &ctx->AggrSign.e2, + signature); + } else { + ctx->min_sig_or_pk |= AGGR_SIGN_SET; + FROM_AFFINE(&ctx->AggrSign.e2, signature); + } + } + + if (PK != NULL) { + size_t n; + POINTonE2 H[1]; + + if (hash_or_encode) + Hash_to_G2(H, msg, msg_len, DST, DST_len, aug, aug_len); + else + Encode_to_G2(H, msg, msg_len, DST, DST_len, aug, aug_len); + + POINTonE2_from_Jacobian(H, H); + + n = ctx->nelems; + vec_copy(ctx->Q + n, H, sizeof(POINTonE2_affine)); + vec_copy(ctx->P + n, PK, sizeof(POINTonE1_affine)); + if (++n == N_MAX) { + if (ctx->min_sig_or_pk & AGGR_GT_SET) { + vec384fp12 GT; + miller_loop_n(GT, ctx->Q, ctx->P, n); + mul_fp12(ctx->GT, ctx->GT, GT); + } else { + miller_loop_n(ctx->GT, ctx->Q, ctx->P, n); + ctx->min_sig_or_pk |= AGGR_GT_SET; + } + n = 0; + } + ctx->nelems = n; + } + + return BLST_SUCCESS; +} + +BLST_ERROR blst_pairing_aggregate_pk_in_g1(PAIRING *ctx, + const POINTonE1_affine *PK, + const POINTonE2 *signature, + int hash_or_encode, + const void *msg, size_t msg_len, + const void *DST, size_t DST_len, + const void *aug, size_t aug_len) +{ return PAIRING_Aggregate_PK_in_G1(ctx, PK, signature, hash_or_encode, + msg, msg_len, DST, DST_len, aug, aug_len); +} + +static void PAIRING_Commit(PAIRING *ctx) +{ + size_t n; + + if ((n = ctx->nelems)) { + if (ctx->min_sig_or_pk & AGGR_GT_SET) { + vec384fp12 GT; + miller_loop_n(GT, ctx->Q, ctx->P, n); + mul_fp12(ctx->GT, ctx->GT, GT); + } else { + miller_loop_n(ctx->GT, ctx->Q, ctx->P, n); + ctx->min_sig_or_pk |= AGGR_GT_SET; + } + ctx->nelems = 0; + } +} + +void blst_pairing_commit(PAIRING *ctx) +{ PAIRING_Commit(ctx); } + +BLST_ERROR blst_pairing_merge(PAIRING *ctx, const PAIRING *ctx1) +{ + if (ctx->min_sig_or_pk != AGGR_UNDEFINED + && (ctx->min_sig_or_pk & ctx1->min_sig_or_pk & 3) == 0) + return BLST_AGGR_TYPE_MISMATCH; + + /* context producers are expected to have called blst_pairing_commit */ + if (ctx->nelems || ctx1->nelems) + return BLST_AGGR_TYPE_MISMATCH; + + switch (ctx->min_sig_or_pk & 3) { + case AGGR_MIN_SIG: + if (ctx->min_sig_or_pk & ctx1->min_sig_or_pk & AGGR_SIGN_SET) { + POINTonE1_dadd(&ctx->AggrSign.e1, &ctx->AggrSign.e1, + &ctx1->AggrSign.e1, NULL); + } else if (ctx1->min_sig_or_pk & AGGR_SIGN_SET) { + ctx->min_sig_or_pk |= AGGR_SIGN_SET; + vec_copy(&ctx->AggrSign.e1, &ctx1->AggrSign.e1, + sizeof(ctx->AggrSign.e1)); + } + break; + case AGGR_MIN_PK: + if (ctx->min_sig_or_pk & ctx1->min_sig_or_pk & AGGR_SIGN_SET) { + POINTonE2_dadd(&ctx->AggrSign.e2, &ctx->AggrSign.e2, + &ctx1->AggrSign.e2, NULL); + } else if (ctx1->min_sig_or_pk & AGGR_SIGN_SET) { + ctx->min_sig_or_pk |= AGGR_SIGN_SET; + vec_copy(&ctx->AggrSign.e2, 
&ctx1->AggrSign.e2, + sizeof(ctx->AggrSign.e2)); + } + break; + case AGGR_UNDEFINED: + vec_copy(ctx, ctx1, sizeof(*ctx)); + return BLST_SUCCESS; + default: + return BLST_AGGR_TYPE_MISMATCH; + } + + if (ctx->min_sig_or_pk & ctx1->min_sig_or_pk & AGGR_GT_SET) { + mul_fp12(ctx->GT, ctx->GT, ctx1->GT); + } else if (ctx1->min_sig_or_pk & AGGR_GT_SET) { + ctx->min_sig_or_pk |= AGGR_GT_SET; + vec_copy(ctx->GT, ctx1->GT, sizeof(ctx->GT)); + } + + return BLST_SUCCESS; +} + +static limb_t PAIRING_FinalVerify(const PAIRING *ctx, const vec384fp12 GTsig) +{ + vec384fp12 GT; + + if (!(ctx->min_sig_or_pk & AGGR_GT_SET)) + return 0; + + if (GTsig != NULL) { + vec_copy(GT, GTsig, sizeof(GT)); + } else if (ctx->min_sig_or_pk & AGGR_SIGN_SET) { + AggregatedSignature AggrSign; + + switch (ctx->min_sig_or_pk & 3) { + case AGGR_MIN_SIG: + POINTonE1_from_Jacobian(&AggrSign.e1, &ctx->AggrSign.e1); + miller_loop_n(GT, (const POINTonE2_affine *)&BLS12_381_G2, + (const POINTonE1_affine *)&AggrSign.e1, 1); + break; + case AGGR_MIN_PK: + POINTonE2_from_Jacobian(&AggrSign.e2, &ctx->AggrSign.e2); + miller_loop_n(GT, (const POINTonE2_affine *)&AggrSign.e2, + (const POINTonE1_affine *)&BLS12_381_G1, 1); + break; + default: + return 0; + } + } else { + return 0; + } + + conjugate_fp12(GT); + mul_fp12(GT, GT, ctx->GT); + final_exp(GT, GT); + + /* return GT==1 */ + return vec_is_equal(GT[0][0], BLS12_381_Rx.p2, sizeof(GT[0][0])) & + vec_is_zero(GT[0][1], sizeof(GT) - sizeof(GT[0][0])); +} + +limb_t blst_pairing_finalverify(const PAIRING *ctx, const vec384fp12 GTsig) +{ return PAIRING_FinalVerify(ctx, GTsig); } + +/* + * PAIRING context-free entry points. + * + * To perform FastAggregateVerify, aggregate all public keys and + * signatures with corresponding blst_aggregate_in_g{12}, convert + * result to affine and call suitable blst_core_verify_pk_in_g{12} + * or blst_aggregated_in_g{12}... + */ +BLST_ERROR blst_aggregate_in_g1(POINTonE1 *out, const POINTonE1 *in, + const unsigned char *zwire) +{ + POINTonE1_affine P[1]; + + if (zwire[0] & 0x40) { /* infinity? */ + if (in == NULL) + vec_zero(out, sizeof(*out)); + return BLST_SUCCESS; + } + + if (zwire[0] & 0x80) { /* compressed? */ + BLST_ERROR ret = POINTonE1_Uncompress(P, zwire); + if (ret != BLST_SUCCESS) + return ret; + } else { + POINTonE1_Deserialize_BE(P, zwire); + if (!POINTonE1_affine_on_curve(P)) + return BLST_POINT_NOT_ON_CURVE; + } + + if (!POINTonE1_in_G1((POINTonE1 *)P)) + return BLST_POINT_NOT_IN_GROUP; + + if (in == NULL) { + vec_copy(out->X, P->X, 2*sizeof(out->X)); + vec_copy(out->Z, BLS12_381_Rx.p, sizeof(out->Z)); + } else { + POINTonE1_dadd_affine(out, in, (POINTonE1 *)P); + } + + return BLST_SUCCESS; +} + +BLST_ERROR blst_aggregate_in_g2(POINTonE2 *out, const POINTonE2 *in, + const unsigned char *zwire) +{ + POINTonE2_affine P[1]; + + if (zwire[0] & 0x40) { /* infinity? */ + if (in == NULL) + vec_zero(out, sizeof(*out)); + return BLST_SUCCESS; + } + + if (zwire[0] & 0x80) { /* compressed? 
*/ + BLST_ERROR ret = POINTonE2_Uncompress(P, zwire); + if (ret != BLST_SUCCESS) + return ret; + } else { + POINTonE2_Deserialize_BE(P, zwire); + if (!POINTonE2_affine_on_curve(P)) + return BLST_POINT_NOT_ON_CURVE; + } + + if (!POINTonE2_in_G2((POINTonE2 *)P)) + return BLST_POINT_NOT_IN_GROUP; + + if (in == NULL) { + vec_copy(out->X, P->X, 2*sizeof(out->X)); + vec_copy(out->Z, BLS12_381_Rx.p, sizeof(out->Z)); + } else { + POINTonE2_dadd_affine(out, in, (POINTonE2 *)P); + } + return BLST_SUCCESS; +} + +void blst_aggregated_in_g1(vec384fp12 ret, const POINTonE1_affine *sig) +{ miller_loop_n(ret, (const POINTonE2_affine *)&BLS12_381_G2, sig, 1); } + +void blst_aggregated_in_g2(vec384fp12 ret, const POINTonE2_affine *sig) +{ miller_loop_n(ret, sig, (const POINTonE1_affine *)&BLS12_381_G1, 1); } + +BLST_ERROR blst_core_verify_pk_in_g1(const POINTonE1_affine *pk, + const POINTonE2 *signature, + int hash_or_encode, + const void *msg, size_t msg_len, + const void *DST, size_t DST_len, + const void *aug, size_t aug_len) +{ + PAIRING ctx; + BLST_ERROR ret; + + ctx.min_sig_or_pk = AGGR_UNDEFINED; + ctx.nelems = 0; + + ret = PAIRING_Aggregate_PK_in_G1(&ctx, pk, signature, hash_or_encode, + msg, msg_len, DST, DST_len, aug, aug_len); + if (ret != BLST_SUCCESS) + return ret; + + PAIRING_Commit(&ctx); + + return PAIRING_FinalVerify(&ctx, NULL) ? BLST_SUCCESS : BLST_VERIFY_FAIL; +} + +BLST_ERROR blst_core_verify_pk_in_g2(const POINTonE2_affine *pk, + const POINTonE1 *signature, + int hash_or_encode, + const void *msg, size_t msg_len, + const void *DST, size_t DST_len, + const void *aug, size_t aug_len) +{ + PAIRING ctx; + BLST_ERROR ret; + + ctx.min_sig_or_pk = AGGR_UNDEFINED; + ctx.nelems = 0; + + ret = PAIRING_Aggregate_PK_in_G2(&ctx, pk, signature, hash_or_encode, + msg, msg_len, DST, DST_len, aug, aug_len); + if (ret != BLST_SUCCESS) + return ret; + + PAIRING_Commit(&ctx); + + return PAIRING_FinalVerify(&ctx, NULL) ? BLST_SUCCESS : BLST_VERIFY_FAIL; +} diff --git a/src/asm/add_mod_256-x86_64.pl b/src/asm/add_mod_256-x86_64.pl new file mode 100755 index 00000000..69d1dc98 --- /dev/null +++ b/src/asm/add_mod_256-x86_64.pl @@ -0,0 +1,392 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +# common argument layout +($r_ptr,$a_ptr,$b_org,$n_ptr) = ("%rdi","%rsi","%rdx","%rcx"); +$b_ptr = "%rbx"; + +{ ############################################################## 256 bits add +my @acc=map("%r$_",(8..11, "ax", "si", "bx", "bp", 12)); + +$code.=<<___; +.text + +.globl add_mod_256 +.hidden add_mod_256 +.type add_mod_256,\@function,4,"unwind" +.align 32 +add_mod_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + +.Loaded_a_add_mod_256: + add 8*0($b_org), @acc[0] + adc 8*1($b_org), @acc[1] + mov @acc[0], @acc[4] + adc 8*2($b_org), @acc[2] + mov @acc[1], @acc[5] + adc 8*3($b_org), @acc[3] + sbb $b_org, $b_org + + mov @acc[2], @acc[6] + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + mov @acc[3], @acc[7] + sbb 8*3($n_ptr), @acc[3] + sbb \$0, $b_org + + cmovc @acc[4], @acc[0] + cmovc @acc[5], @acc[1] + mov @acc[0], 8*0($r_ptr) + cmovc @acc[6], @acc[2] + mov @acc[1], 8*1($r_ptr) + cmovc @acc[7], @acc[3] + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + + mov 8(%rsp),%rbx +.cfi_restore %rbx + mov 16(%rsp),%rbp +.cfi_restore %rbp + lea 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size add_mod_256,.-add_mod_256 + +######################################################################## +.globl mul_by_3_mod_256 +.hidden mul_by_3_mod_256 +.type mul_by_3_mod_256,\@function,3,"unwind" +.align 32 +mul_by_3_mod_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 +.cfi_end_prologue + + mov $b_org,$n_ptr + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov $a_ptr,$b_org + mov 8*3($a_ptr), @acc[3] + + call __lshift_mod_256 + mov 0(%rsp),%r12 +.cfi_restore %r12 + jmp .Loaded_a_add_mod_256 + + mov 8(%rsp),%rbx +.cfi_restore %rbx + mov 16(%rsp),%rbp +.cfi_restore %rbp + lea 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size mul_by_3_mod_256,.-mul_by_3_mod_256 + +.type __lshift_mod_256,\@abi-omnipotent +.align 32 +__lshift_mod_256: + add @acc[0], @acc[0] + adc @acc[1], @acc[1] + mov @acc[0], @acc[4] + adc @acc[2], @acc[2] + mov @acc[1], @acc[5] + adc @acc[3], @acc[3] + sbb @acc[8], @acc[8] + + mov @acc[2], @acc[6] + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + mov @acc[3], @acc[7] + sbb 8*3($n_ptr), @acc[3] + sbb \$0, @acc[8] + + cmovc @acc[4], @acc[0] + cmovc @acc[5], @acc[1] + cmovc @acc[6], @acc[2] + cmovc @acc[7], @acc[3] + + ret +.size __lshift_mod_256,.-__lshift_mod_256 + +######################################################################## +.globl lshift_mod_256 +.hidden lshift_mod_256 +.type lshift_mod_256,\@function,4,"unwind" +.align 32 +lshift_mod_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 
+.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + +.Loop_lshift_mod_256: + call __lshift_mod_256 + dec %edx + jnz .Loop_lshift_mod_256 + + mov @acc[0], 8*0($r_ptr) + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + + mov 0(%rsp),%r12 +.cfi_restore %r12 + mov 8(%rsp),%rbx +.cfi_restore %rbx + mov 16(%rsp),%rbp +.cfi_restore %rbp + lea 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size lshift_mod_256,.-lshift_mod_256 + +######################################################################## +.globl rshift_mod_256 +.hidden rshift_mod_256 +.type rshift_mod_256,\@function,4,"unwind" +.align 32 +rshift_mod_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[7] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + +.Loop_rshift_mod_256: + mov @acc[7], @acc[0] + and \$1, @acc[7] + mov 8*0($n_ptr), @acc[4] + neg @acc[7] + mov 8*1($n_ptr), @acc[5] + mov 8*2($n_ptr), @acc[6] + + and @acc[7], @acc[4] + and @acc[7], @acc[5] + and @acc[7], @acc[6] + and 8*3($n_ptr), @acc[7] + + add @acc[4], @acc[0] + adc @acc[5], @acc[1] + adc @acc[6], @acc[2] + adc @acc[7], @acc[3] + sbb @acc[4], @acc[4] + + shr \$1, @acc[0] + mov @acc[1], @acc[7] + shr \$1, @acc[1] + mov @acc[2], @acc[6] + shr \$1, @acc[2] + mov @acc[3], @acc[5] + shr \$1, @acc[3] + + shl \$63, @acc[7] + shl \$63, @acc[6] + or @acc[0], @acc[7] + shl \$63, @acc[5] + or @acc[6], @acc[1] + shl \$63, @acc[4] + or @acc[5], @acc[2] + or @acc[4], @acc[3] + + dec %edx + jnz .Loop_rshift_mod_256 + + mov @acc[7], 8*0($r_ptr) + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + + mov 8(%rsp),%rbx +.cfi_restore %rbx + mov 16(%rsp),%rbp +.cfi_restore %rbp + lea 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size rshift_mod_256,.-rshift_mod_256 + +######################################################################## +.globl cneg_mod_256 +.hidden cneg_mod_256 +.type cneg_mod_256,\@function,4,"unwind" +.align 32 +cneg_mod_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[8] # load a[0:3] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov @acc[8], @acc[0] + mov 8*3($a_ptr), @acc[3] + or @acc[1], @acc[8] + or @acc[2], @acc[8] + or @acc[3], @acc[8] + mov \$-1, @acc[7] + + mov 8*0($n_ptr), @acc[4] # load n[0:3] + cmovnz @acc[7], @acc[8] # mask = a[0:3] ? -1 : 0 + mov 8*1($n_ptr), @acc[5] + mov 8*2($n_ptr), @acc[6] + and @acc[8], @acc[4] # n[0:3] &= mask + mov 8*3($n_ptr), @acc[7] + and @acc[8], @acc[5] + and @acc[8], @acc[6] + and @acc[8], @acc[7] + + sub @acc[0], @acc[4] # a[0:3] ? n[0:3]-a[0:3] : 0-0 + sbb @acc[1], @acc[5] + sbb @acc[2], @acc[6] + sbb @acc[3], @acc[7] + + or $b_org, $b_org # check condition flag + + cmovz @acc[0], @acc[4] # flag ? 
n[0:3]-a[0:3] : a[0:3] + cmovz @acc[1], @acc[5] + mov @acc[4], 8*0($r_ptr) + cmovz @acc[2], @acc[6] + mov @acc[5], 8*1($r_ptr) + cmovz @acc[3], @acc[7] + mov @acc[6], 8*2($r_ptr) + mov @acc[7], 8*3($r_ptr) + + mov 0(%rsp),%r12 +.cfi_restore %r12 + mov 8(%rsp),%rbx +.cfi_restore %rbx + mov 16(%rsp),%rbp +.cfi_restore %rbp + lea 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size cneg_mod_256,.-cneg_mod_256 + +######################################################################## +.globl sub_mod_256 +.hidden sub_mod_256 +.type sub_mod_256,\@function,4,"unwind" +.align 32 +sub_mod_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + + sub 8*0($b_org), @acc[0] + mov 8*0($n_ptr), @acc[4] + sbb 8*1($b_org), @acc[1] + mov 8*1($n_ptr), @acc[5] + sbb 8*2($b_org), @acc[2] + mov 8*2($n_ptr), @acc[6] + sbb 8*3($b_org), @acc[3] + mov 8*3($n_ptr), @acc[7] + sbb $b_org, $b_org + + and $b_org, @acc[4] + and $b_org, @acc[5] + and $b_org, @acc[6] + and $b_org, @acc[7] + + add @acc[4], @acc[0] + adc @acc[5], @acc[1] + mov @acc[0], 8*0($r_ptr) + adc @acc[6], @acc[2] + mov @acc[1], 8*1($r_ptr) + adc @acc[7], @acc[3] + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + + mov 8(%rsp),%rbx +.cfi_restore %rbx + mov 16(%rsp),%rbp +.cfi_restore %rbp + lea 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size sub_mod_256,.-sub_mod_256 +___ +} + +print $code; +close STDOUT; diff --git a/src/asm/add_mod_384-x86_64.pl b/src/asm/add_mod_384-x86_64.pl new file mode 100755 index 00000000..29ac43bc --- /dev/null +++ b/src/asm/add_mod_384-x86_64.pl @@ -0,0 +1,1420 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +# common argument layout +($r_ptr,$a_ptr,$b_org,$n_ptr,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8"); +$b_ptr = "%rbx"; + +{ ############################################################## 384 bits add +my @acc=map("%r$_",(8..15, "ax", "bx", "bp")); + push(@acc, $a_ptr); + +$code.=<<___; +.text +.extern BLS12_381_P +.hidden BLS12_381_P + +.globl add_mod_384 +.hidden add_mod_384 +.type add_mod_384,\@function,4,"unwind" +.align 32 +add_mod_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + call __add_mod_384 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size add_mod_384,.-add_mod_384 + +.type __add_mod_384,\@abi-omnipotent +.align 32 +__add_mod_384: + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + +__add_mod_384_a_is_loaded: + add 8*0($b_org), @acc[0] + adc 8*1($b_org), @acc[1] + adc 8*2($b_org), @acc[2] + mov @acc[0], @acc[6] + adc 8*3($b_org), @acc[3] + mov @acc[1], @acc[7] + adc 8*4($b_org), @acc[4] + mov @acc[2], @acc[8] + adc 8*5($b_org), @acc[5] + mov @acc[3], @acc[9] + sbb $b_org, $b_org + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + mov @acc[4], @acc[10] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], @acc[11] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, $b_org + + cmovc @acc[6], @acc[0] + cmovc @acc[7], @acc[1] + cmovc @acc[8], @acc[2] + mov @acc[0], 8*0($r_ptr) + cmovc @acc[9], @acc[3] + mov @acc[1], 8*1($r_ptr) + cmovc @acc[10], @acc[4] + mov @acc[2], 8*2($r_ptr) + cmovc @acc[11], @acc[5] + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + ret +.size __add_mod_384,.-__add_mod_384 + +.globl add_mod_384x +.hidden add_mod_384x +.type add_mod_384x,\@function,4,"unwind" +.align 32 +add_mod_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$24, %rsp +.cfi_adjust_cfa_offset 24 +.cfi_end_prologue + + mov $a_ptr, 8*0(%rsp) + mov $b_org, 8*1(%rsp) + lea 48($a_ptr), $a_ptr # a->im + lea 48($b_org), $b_org # b->im + lea 48($r_ptr), $r_ptr # ret->im + call __add_mod_384 # add_mod_384(ret->im, a->im, b->im, mod); + + mov 8*0(%rsp), $a_ptr # a->re + mov 8*1(%rsp), $b_org # b->re + lea -48($r_ptr), $r_ptr # ret->re + call __add_mod_384 # add_mod_384(ret->re, a->re, b->re, mod); + + mov 24+8*0(%rsp),%r15 +.cfi_restore %r15 + mov 24+8*1(%rsp),%r14 +.cfi_restore %r14 + mov 
24+8*2(%rsp),%r13 +.cfi_restore %r13 + mov 24+8*3(%rsp),%r12 +.cfi_restore %r12 + mov 24+8*4(%rsp),%rbx +.cfi_restore %rbx + mov 24+8*5(%rsp),%rbp +.cfi_restore %rbp + lea 24+8*6(%rsp),%rsp +.cfi_adjust_cfa_offset -24-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size add_mod_384x,.-add_mod_384x + +######################################################################## +.globl lshift_mod_384 +.hidden lshift_mod_384 +.type lshift_mod_384,\@function,4,"unwind" +.align 32 +lshift_mod_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $r_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + +.Loop_lshift_mod_384: + add @acc[0], @acc[0] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + mov @acc[0], @acc[6] + adc @acc[3], @acc[3] + mov @acc[1], @acc[7] + adc @acc[4], @acc[4] + mov @acc[2], @acc[8] + adc @acc[5], @acc[5] + mov @acc[3], @acc[9] + sbb $r_ptr, $r_ptr + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + mov @acc[4], @acc[10] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], @acc[11] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, $r_ptr + + mov (%rsp), $r_ptr + cmovc @acc[6], @acc[0] + cmovc @acc[7], @acc[1] + cmovc @acc[8], @acc[2] + cmovc @acc[9], @acc[3] + cmovc @acc[10], @acc[4] + cmovc @acc[11], @acc[5] + + dec %edx + jnz .Loop_lshift_mod_384 + + mov @acc[0], 8*0($r_ptr) + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size lshift_mod_384,.-lshift_mod_384 + +.type __lshift_mod_384,\@abi-omnipotent +.align 32 +__lshift_mod_384: + add @acc[0], @acc[0] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + mov @acc[0], @acc[6] + adc @acc[3], @acc[3] + mov @acc[1], @acc[7] + adc @acc[4], @acc[4] + mov @acc[2], @acc[8] + adc @acc[5], @acc[5] + mov @acc[3], @acc[9] + sbb $b_org, $b_org + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + mov @acc[4], @acc[10] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], @acc[11] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, $b_org + + cmovc @acc[6], @acc[0] + cmovc @acc[7], @acc[1] + cmovc @acc[8], @acc[2] + cmovc @acc[9], @acc[3] + cmovc @acc[10], @acc[4] + cmovc @acc[11], @acc[5] + + ret +.size __lshift_mod_384,.-__lshift_mod_384 + +######################################################################## +.globl mul_by_3_mod_384 +.hidden mul_by_3_mod_384 +.type mul_by_3_mod_384,\@function,3,"unwind" +.align 32 +mul_by_3_mod_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $a_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + mov $b_org, $n_ptr + + call __lshift_mod_384 + 
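+	# 2*a mod n is now in @acc[0..5]; reload the saved a_ptr from the stack and add a once more to get 3*a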
+ mov (%rsp), $b_org + call __add_mod_384_a_is_loaded + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size mul_by_3_mod_384,.-mul_by_3_mod_384 + +.globl mul_by_8_mod_384 +.hidden mul_by_8_mod_384 +.type mul_by_8_mod_384,\@function,3,"unwind" +.align 32 +mul_by_8_mod_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + mov $b_org, $n_ptr + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + mov @acc[0], 8*0($r_ptr) + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size mul_by_8_mod_384,.-mul_by_8_mod_384 + +.globl mul_by_b_onE1 +.hidden mul_by_b_onE1 +.type mul_by_b_onE1,\@function,2,"unwind" +.align 32 +mul_by_b_onE1: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + lea BLS12_381_P(%rip), $n_ptr + + call __lshift_mod_384 + call __lshift_mod_384 + + mov @acc[0], 8*0($r_ptr) + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size mul_by_b_onE1,.-mul_by_b_onE1 + +.globl mul_by_4b_onE1 +.hidden mul_by_4b_onE1 +.type mul_by_4b_onE1,\@function,2,"unwind" +.align 32 +mul_by_4b_onE1: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + lea BLS12_381_P(%rip), $n_ptr + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + mov @acc[0], 8*0($r_ptr) + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + mov 8(%rsp),%r15 
+.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size mul_by_4b_onE1,.-mul_by_4b_onE1 + +######################################################################## +.globl mul_by_3_mod_384x +.hidden mul_by_3_mod_384x +.type mul_by_3_mod_384x,\@function,3,"unwind" +.align 32 +mul_by_3_mod_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $a_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + mov $b_org, $n_ptr + + call __lshift_mod_384 + + mov (%rsp), $b_org + call __add_mod_384_a_is_loaded + + mov (%rsp), $a_ptr + lea 8*6($r_ptr), $r_ptr + + mov 8*6($a_ptr), @acc[0] + mov 8*7($a_ptr), @acc[1] + mov 8*8($a_ptr), @acc[2] + mov 8*9($a_ptr), @acc[3] + mov 8*10($a_ptr), @acc[4] + mov 8*11($a_ptr), @acc[5] + + call __lshift_mod_384 + + mov \$8*6, $b_org + add (%rsp), $b_org + call __add_mod_384_a_is_loaded + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size mul_by_3_mod_384x,.-mul_by_3_mod_384x + +.globl mul_by_8_mod_384x +.hidden mul_by_8_mod_384x +.type mul_by_8_mod_384x,\@function,3,"unwind" +.align 32 +mul_by_8_mod_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $a_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + mov $b_org, $n_ptr + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + mov (%rsp), $a_ptr + mov @acc[0], 8*0($r_ptr) + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + mov 48+8*0($a_ptr), @acc[0] + mov 48+8*1($a_ptr), @acc[1] + mov 48+8*2($a_ptr), @acc[2] + mov 48+8*3($a_ptr), @acc[3] + mov 48+8*4($a_ptr), @acc[4] + mov 48+8*5($a_ptr), @acc[5] + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + mov @acc[0], 48+8*0($r_ptr) + mov @acc[1], 48+8*1($r_ptr) + mov @acc[2], 48+8*2($r_ptr) + mov @acc[3], 48+8*3($r_ptr) + mov @acc[4], 48+8*4($r_ptr) + mov @acc[5], 48+8*5($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size mul_by_8_mod_384x,.-mul_by_8_mod_384x + +.globl mul_by_b_onE2 +.hidden mul_by_b_onE2 +.type mul_by_b_onE2,\@function,2,"unwind" +.align 32 +mul_by_b_onE2: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 
+.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $a_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + lea BLS12_381_P(%rip), $n_ptr + lea 8*6($a_ptr), $b_org + call __sub_mod_384 # a->re - a->im + call __lshift_mod_384 + call __lshift_mod_384 + + mov 0(%rsp), $a_ptr + mov @acc[0], 8*0($r_ptr) # ret->re + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + lea 8*6($a_ptr), $b_org + lea 8*6($r_ptr), $r_ptr + call __add_mod_384 # a->re + a->im + call __lshift_mod_384 + call __lshift_mod_384 + + mov @acc[0], 8*0($r_ptr) # ret->im + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + mov 8*1(%rsp),%r15 +.cfi_restore %r15 + mov 8*2(%rsp),%r14 +.cfi_restore %r14 + mov 8*3(%rsp),%r13 +.cfi_restore %r13 + mov 8*4(%rsp),%r12 +.cfi_restore %r12 + mov 8*5(%rsp),%rbx +.cfi_restore %rbx + mov 8*6(%rsp),%rbp +.cfi_restore %rbp + lea 8*7(%rsp),%rsp +.cfi_adjust_cfa_offset -8*7 +.cfi_epilogue + ret +.cfi_endproc +.size mul_by_b_onE2,.-mul_by_b_onE2 + +.globl mul_by_4b_onE2 +.hidden mul_by_4b_onE2 +.type mul_by_4b_onE2,\@function,2,"unwind" +.align 32 +mul_by_4b_onE2: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $a_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + lea BLS12_381_P(%rip), $n_ptr + lea 8*6($a_ptr), $b_org + call __sub_mod_384 # a->re - a->im + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + mov 0(%rsp), $a_ptr # restore $a_ptr + mov @acc[0], 8*0($r_ptr) # ret->re + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + lea 8*6($a_ptr), $b_org + lea 8*6($r_ptr), $r_ptr + call __add_mod_384 # a->re + a->im + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + mov @acc[0], 8*0($r_ptr) # ret->im + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + mov 8*1(%rsp),%r15 +.cfi_restore %r15 + mov 8*2(%rsp),%r14 +.cfi_restore %r14 + mov 8*3(%rsp),%r13 +.cfi_restore %r13 + mov 8*4(%rsp),%r12 +.cfi_restore %r12 + mov 8*5(%rsp),%rbx +.cfi_restore %rbx + mov 8*6(%rsp),%rbp +.cfi_restore %rbp + lea 8*7(%rsp),%rsp +.cfi_adjust_cfa_offset -8*7 +.cfi_epilogue + ret +.cfi_endproc +.size mul_by_4b_onE2,.-mul_by_4b_onE2 + +######################################################################## +.globl cneg_mod_384 +.hidden cneg_mod_384 +.type cneg_mod_384,\@function,4,"unwind" +.align 32 +cneg_mod_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $b_org # condition flag +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), $b_org # load a[0:5] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov $b_org, @acc[0] + mov 8*3($a_ptr), @acc[3] + or @acc[1], $b_org + mov 8*4($a_ptr), @acc[4] + or @acc[2], $b_org + mov 8*5($a_ptr), @acc[5] + or @acc[3], $b_org + mov \$-1, @acc[11] + or @acc[4], $b_org + or @acc[5], $b_org + + mov 8*0($n_ptr), @acc[6] # load n[0:5] + cmovnz @acc[11], $b_org # mask = a[0:5] ? 
-1 : 0 + mov 8*1($n_ptr), @acc[7] + mov 8*2($n_ptr), @acc[8] + and $b_org, @acc[6] # n[0:5] &= mask + mov 8*3($n_ptr), @acc[9] + and $b_org, @acc[7] + mov 8*4($n_ptr), @acc[10] + and $b_org, @acc[8] + mov 8*5($n_ptr), @acc[11] + and $b_org, @acc[9] + mov 0(%rsp), $n_ptr # restore condition flag + and $b_org, @acc[10] + and $b_org, @acc[11] + + sub @acc[0], @acc[6] # a[0:5] ? n[0:5]-a[0:5] : 0-0 + sbb @acc[1], @acc[7] + sbb @acc[2], @acc[8] + sbb @acc[3], @acc[9] + sbb @acc[4], @acc[10] + sbb @acc[5], @acc[11] + + or $n_ptr, $n_ptr # check condition flag + + cmovz @acc[0], @acc[6] # flag ? n[0:5]-a[0:5] : a[0:5] + cmovz @acc[1], @acc[7] + cmovz @acc[2], @acc[8] + mov @acc[6], 8*0($r_ptr) + cmovz @acc[3], @acc[9] + mov @acc[7], 8*1($r_ptr) + cmovz @acc[4], @acc[10] + mov @acc[8], 8*2($r_ptr) + cmovz @acc[5], @acc[11] + mov @acc[9], 8*3($r_ptr) + mov @acc[10], 8*4($r_ptr) + mov @acc[11], 8*5($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size cneg_mod_384,.-cneg_mod_384 + +######################################################################## +.globl sub_mod_384 +.hidden sub_mod_384 +.type sub_mod_384,\@function,4,"unwind" +.align 32 +sub_mod_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + call __sub_mod_384 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size sub_mod_384,.-sub_mod_384 + +.type __sub_mod_384,\@abi-omnipotent +.align 32 +__sub_mod_384: + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + + sub 8*0($b_org), @acc[0] + mov 8*0($n_ptr), @acc[6] + sbb 8*1($b_org), @acc[1] + mov 8*1($n_ptr), @acc[7] + sbb 8*2($b_org), @acc[2] + mov 8*2($n_ptr), @acc[8] + sbb 8*3($b_org), @acc[3] + mov 8*3($n_ptr), @acc[9] + sbb 8*4($b_org), @acc[4] + mov 8*4($n_ptr), @acc[10] + sbb 8*5($b_org), @acc[5] + mov 8*5($n_ptr), @acc[11] + sbb $b_org, $b_org + + and $b_org, @acc[6] + and $b_org, @acc[7] + and $b_org, @acc[8] + and $b_org, @acc[9] + and $b_org, @acc[10] + and $b_org, @acc[11] + + add @acc[6], @acc[0] + adc @acc[7], @acc[1] + mov @acc[0], 8*0($r_ptr) + adc @acc[8], @acc[2] + mov @acc[1], 8*1($r_ptr) + adc @acc[9], @acc[3] + mov @acc[2], 8*2($r_ptr) + adc @acc[10], @acc[4] + mov @acc[3], 8*3($r_ptr) + adc @acc[11], @acc[5] + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + ret +.size __sub_mod_384,.-__sub_mod_384 + +.globl sub_mod_384x +.hidden sub_mod_384x +.type sub_mod_384x,\@function,4,"unwind" +.align 32 +sub_mod_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$24, %rsp +.cfi_adjust_cfa_offset 24 +.cfi_end_prologue + + mov $a_ptr, 8*0(%rsp) + mov $b_org, 8*1(%rsp) + lea 
48($a_ptr), $a_ptr # a->im + lea 48($b_org), $b_org # b->im + lea 48($r_ptr), $r_ptr # ret->im + call __sub_mod_384 # sub_mod_384(ret->im, a->im, b->im, mod); + + mov 8*0(%rsp), $a_ptr # a->re + mov 8*1(%rsp), $b_org # b->re + lea -48($r_ptr), $r_ptr # ret->re + call __sub_mod_384 # sub_mod_384(ret->re, a->re, b->re, mod); + + mov 24+8*0(%rsp),%r15 +.cfi_restore %r15 + mov 24+8*1(%rsp),%r14 +.cfi_restore %r14 + mov 24+8*2(%rsp),%r13 +.cfi_restore %r13 + mov 24+8*3(%rsp),%r12 +.cfi_restore %r12 + mov 24+8*4(%rsp),%rbx +.cfi_restore %rbx + mov 24+8*5(%rsp),%rbp +.cfi_restore %rbp + lea 24+8*6(%rsp),%rsp +.cfi_adjust_cfa_offset -24-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size sub_mod_384x,.-sub_mod_384x +___ +} +{ ###################################################### ret = a * (1 + i) +my ($r_ptr,$a_ptr,$n_ptr) = ("%rdi","%rsi","%rdx"); +my @acc=map("%r$_",(8..15, "ax", "bx", "cx", "bp")); + +$code.=<<___; +.globl mul_by_1_plus_i_mod_384x +.hidden mul_by_1_plus_i_mod_384x +.type mul_by_1_plus_i_mod_384x,\@function,3,"unwind" +.align 32 +mul_by_1_plus_i_mod_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$56, %rsp +.cfi_adjust_cfa_offset 56 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + + mov @acc[0], @acc[6] + add 8*6($a_ptr), @acc[0] # a->re + a->im + mov @acc[1], @acc[7] + adc 8*7($a_ptr), @acc[1] + mov @acc[2], @acc[8] + adc 8*8($a_ptr), @acc[2] + mov @acc[3], @acc[9] + adc 8*9($a_ptr), @acc[3] + mov @acc[4], @acc[10] + adc 8*10($a_ptr), @acc[4] + mov @acc[5], @acc[11] + adc 8*11($a_ptr), @acc[5] + mov $r_ptr, 8*6(%rsp) # offload r_ptr + sbb $r_ptr, $r_ptr + + sub 8*6($a_ptr), @acc[6] # a->re - a->im + sbb 8*7($a_ptr), @acc[7] + sbb 8*8($a_ptr), @acc[8] + sbb 8*9($a_ptr), @acc[9] + sbb 8*10($a_ptr), @acc[10] + sbb 8*11($a_ptr), @acc[11] + sbb $a_ptr, $a_ptr + + mov @acc[0], 8*0(%rsp) # offload a->re + a->im [without carry] + mov 8*0($n_ptr), @acc[0] + mov @acc[1], 8*1(%rsp) + mov 8*1($n_ptr), @acc[1] + mov @acc[2], 8*2(%rsp) + mov 8*2($n_ptr), @acc[2] + mov @acc[3], 8*3(%rsp) + mov 8*3($n_ptr), @acc[3] + mov @acc[4], 8*4(%rsp) + and $a_ptr, @acc[0] + mov 8*4($n_ptr), @acc[4] + mov @acc[5], 8*5(%rsp) + and $a_ptr, @acc[1] + mov 8*5($n_ptr), @acc[5] + and $a_ptr, @acc[2] + and $a_ptr, @acc[3] + and $a_ptr, @acc[4] + and $a_ptr, @acc[5] + mov 8*6(%rsp), $a_ptr # restore r_ptr + + add @acc[0], @acc[6] + mov 8*0(%rsp), @acc[0] # restore a->re + a->im + adc @acc[1], @acc[7] + mov 8*1(%rsp), @acc[1] + adc @acc[2], @acc[8] + mov 8*2(%rsp), @acc[2] + adc @acc[3], @acc[9] + mov 8*3(%rsp), @acc[3] + adc @acc[4], @acc[10] + mov 8*4(%rsp), @acc[4] + adc @acc[5], @acc[11] + mov 8*5(%rsp), @acc[5] + + mov @acc[6], 8*0($a_ptr) # ret->re = a->re - a->im + mov @acc[0], @acc[6] + mov @acc[7], 8*1($a_ptr) + mov @acc[8], 8*2($a_ptr) + mov @acc[1], @acc[7] + mov @acc[9], 8*3($a_ptr) + mov @acc[10], 8*4($a_ptr) + mov @acc[2], @acc[8] + mov @acc[11], 8*5($a_ptr) + + sub 8*0($n_ptr), @acc[0] + mov @acc[3], @acc[9] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + mov @acc[4], @acc[10] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], @acc[11] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, $r_ptr + + cmovc @acc[6], @acc[0] + cmovc @acc[7], @acc[1] + cmovc @acc[8], @acc[2] + mov @acc[0], 8*6($a_ptr) # ret->im = a->re 
+ a->im + cmovc @acc[9], @acc[3] + mov @acc[1], 8*7($a_ptr) + cmovc @acc[10], @acc[4] + mov @acc[2], 8*8($a_ptr) + cmovc @acc[11], @acc[5] + mov @acc[3], 8*9($a_ptr) + mov @acc[4], 8*10($a_ptr) + mov @acc[5], 8*11($a_ptr) + + mov 56+8*0(%rsp),%r15 +.cfi_restore %r15 + mov 56+8*1(%rsp),%r14 +.cfi_restore %r14 + mov 56+8*2(%rsp),%r13 +.cfi_restore %r13 + mov 56+8*3(%rsp),%r12 +.cfi_restore %r12 + mov 56+8*4(%rsp),%rbx +.cfi_restore %rbx + mov 56+8*5(%rsp),%rbp +.cfi_restore %rbp + lea 56+8*6(%rsp),%rsp +.cfi_adjust_cfa_offset -56-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size mul_by_1_plus_i_mod_384x,.-mul_by_1_plus_i_mod_384x +___ +} +{ ###################################################### +my ($r_ptr,$n_ptr) = ("%rdi","%rsi"); +my @acc=map("%r$_",(8..11, "cx", "dx", "bx", "bp")); + +$code.=<<___; +.globl sgn0_pty_mod_384 +.hidden sgn0_pty_mod_384 +.type sgn0_pty_mod_384,\@function,2,"unwind" +.align 32 +sgn0_pty_mod_384: +.cfi_startproc +.cfi_end_prologue + mov 8*0($r_ptr), @acc[0] + mov 8*1($r_ptr), @acc[1] + mov 8*2($r_ptr), @acc[2] + mov 8*3($r_ptr), @acc[3] + mov 8*4($r_ptr), @acc[4] + mov 8*5($r_ptr), @acc[5] + + xor %rax, %rax + mov @acc[0], $r_ptr + add @acc[0], @acc[0] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + adc \$0, %rax + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, %rax + + not %rax # 2*x > p, which means "negative" + and \$1, $r_ptr + and \$2, %rax + or $r_ptr, %rax # pack sign and parity + +.cfi_epilogue + ret +.cfi_endproc +.size sgn0_pty_mod_384,.-sgn0_pty_mod_384 + +.globl sgn0_pty_mod_384x +.hidden sgn0_pty_mod_384x +.type sgn0_pty_mod_384x,\@function,2,"unwind" +.align 32 +sgn0_pty_mod_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($r_ptr), @acc[0] + mov 8*1($r_ptr), @acc[1] + mov 8*2($r_ptr), @acc[2] + mov 8*3($r_ptr), @acc[3] + mov 8*4($r_ptr), @acc[4] + mov 8*5($r_ptr), @acc[5] + + mov @acc[0], @acc[6] + or @acc[1], @acc[0] + or @acc[2], @acc[0] + or @acc[3], @acc[0] + or @acc[4], @acc[0] + or @acc[5], @acc[0] + + xor %rax, %rax + mov @acc[6], @acc[7] + add @acc[6], @acc[6] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + adc \$0, %rax + + sub 8*0($n_ptr), @acc[6] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, %rax + + mov @acc[0], 0(%rsp) # a->re is zero or not + not %rax # 2*x > p, which means "negative" + and \$1, @acc[7] + and \$2, %rax + or @acc[7], %rax # pack sign and parity + + mov 8*6($r_ptr), @acc[0] + mov 8*7($r_ptr), @acc[1] + mov 8*8($r_ptr), @acc[2] + mov 8*9($r_ptr), @acc[3] + mov 8*10($r_ptr), @acc[4] + mov 8*11($r_ptr), @acc[5] + + mov @acc[0], @acc[6] + or @acc[1], @acc[0] + or @acc[2], @acc[0] + or @acc[3], @acc[0] + or @acc[4], @acc[0] + or @acc[5], @acc[0] + + xor $r_ptr, $r_ptr + mov @acc[6], @acc[7] + add @acc[6], @acc[6] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + adc \$0, $r_ptr + + sub 8*0($n_ptr), @acc[6] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, $r_ptr + + mov 0(%rsp), @acc[6] + + not $r_ptr # 2*x > 
p, which means "negative" + + test @acc[0], @acc[0] + cmovnz $r_ptr, %rax # a->im!=0? sgn0(a->im) : sgn0(a->re) + + test @acc[6], @acc[6] + cmovz $r_ptr, @acc[7] # a->re==0? prty(a->im) : prty(a->re) + + and \$1, @acc[7] + and \$2, %rax + or @acc[7], %rax # pack sign and parity + + mov 8(%rsp), %rbx +.cfi_restore %rbx + mov 16(%rsp), %rbp +.cfi_restore %rbp + lea 24(%rsp), %rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size sgn0_pty_mod_384x,.-sgn0_pty_mod_384x +___ +} +if (0) { +my $inp = $win64 ? "%rcx" : "%rdi"; +$code.=<<___; +.globl nbits_384 +.hidden nbits_384 +.type nbits_384,\@abi-omnipotent +.align 32 +nbits_384: + mov 8*5($inp), %r8 + mov 8*4($inp), %r9 + mov 8*3($inp), %r10 + mov 8*2($inp), %r11 + mov \$-1, %rdx + mov \$127, %eax + bsr %r8, %r8 + cmovnz %rdx,%r9 + cmovz %rax,%r8 + bsr %r9, %r9 + cmovnz %rdx,%r10 + cmovz %rax,%r9 + xor \$63,%r8 + bsr %r10, %r10 + cmovnz %rdx, %r11 + cmovz %rax, %r10 + xor \$63,%r9 + add %r8, %r9 + mov 8*1($inp), %r8 + bsr %r11, %r11 + cmovnz %rdx, %r8 + cmovz %rax, %r11 + xor \$63, %r10 + add %r9, %r10 + mov 8*0($inp), %r9 + bsr %r8, %r8 + cmovnz %rdx, %r9 + cmovz %rax, %r8 + xor \$63, %r11 + add %r10, %r11 + bsr %r9, %r9 + cmovz %rax, %r9 + xor \$63, %r8 + add %r11, %r8 + xor \$63, %r9 + add %r8, %r9 + mov \$384, %eax + sub %r9, %rax + ret +.size nbits_384,.-nbits_384 +___ +} + +print $code; +close STDOUT; diff --git a/src/asm/add_mod_384x384-x86_64.pl b/src/asm/add_mod_384x384-x86_64.pl new file mode 100755 index 00000000..6ee3cf87 --- /dev/null +++ b/src/asm/add_mod_384x384-x86_64.pl @@ -0,0 +1,260 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +# common argument layout +($r_ptr,$a_ptr,$b_org,$n_ptr,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8"); +$b_ptr = "%rbx"; + +# common accumulator layout +@acc=map("%r$_",(8..15)); + +############################################################ 384x384 add/sub +# Double-width addition/subtraction modulo n<<384, as opposite to +# naively expected modulo n*n. It works because n<<384 is the actual +# input boundary condition for Montgomery reduction, not n*n. +# Just in case, this is duplicated, but only one module is +# supposed to be linked... 
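+# To spell the bound out (a sketch, assuming both inputs are already below
+# n<<384, which is what products of reduced 384-bit operands satisfy):
+#
+#	a, b < n*2^384  =>  a + b < 2*n*2^384,
+#
+# so one conditional subtraction of n aligned at bit 384 (i.e. from the
+# upper six limbs only) brings the sum back under n*2^384, and the low
+# 384 bits never need correcting. Subtraction is the mirror image: on
+# borrow, n is added back into the upper half, as __sub_mod_384x384
+# below does with the masked n[0:5].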
+{ +my @acc=(@acc,"%rax","%rbx","%rbp",$a_ptr); # all registers are affected + # except for $n_ptr and $r_ptr +$code.=<<___; +.text + +.type __add_mod_384x384,\@abi-omnipotent +.align 32 +__add_mod_384x384: + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + mov 8*6($a_ptr), @acc[6] + + add 8*0($b_org), @acc[0] + mov 8*7($a_ptr), @acc[7] + adc 8*1($b_org), @acc[1] + mov 8*8($a_ptr), @acc[8] + adc 8*2($b_org), @acc[2] + mov 8*9($a_ptr), @acc[9] + adc 8*3($b_org), @acc[3] + mov 8*10($a_ptr), @acc[10] + adc 8*4($b_org), @acc[4] + mov 8*11($a_ptr), @acc[11] + adc 8*5($b_org), @acc[5] + mov @acc[0], 8*0($r_ptr) + adc 8*6($b_org), @acc[6] + mov @acc[1], 8*1($r_ptr) + adc 8*7($b_org), @acc[7] + mov @acc[2], 8*2($r_ptr) + adc 8*8($b_org), @acc[8] + mov @acc[4], 8*4($r_ptr) + mov @acc[6], @acc[0] + adc 8*9($b_org), @acc[9] + mov @acc[3], 8*3($r_ptr) + mov @acc[7], @acc[1] + adc 8*10($b_org), @acc[10] + mov @acc[5], 8*5($r_ptr) + mov @acc[8], @acc[2] + adc 8*11($b_org), @acc[11] + mov @acc[9], @acc[3] + sbb $b_org, $b_org + + sub 8*0($n_ptr), @acc[6] + sbb 8*1($n_ptr), @acc[7] + mov @acc[10], @acc[4] + sbb 8*2($n_ptr), @acc[8] + sbb 8*3($n_ptr), @acc[9] + sbb 8*4($n_ptr), @acc[10] + mov @acc[11], @acc[5] + sbb 8*5($n_ptr), @acc[11] + sbb \$0, $b_org + + cmovc @acc[0], @acc[6] + cmovc @acc[1], @acc[7] + cmovc @acc[2], @acc[8] + mov @acc[6], 8*6($r_ptr) + cmovc @acc[3], @acc[9] + mov @acc[7], 8*7($r_ptr) + cmovc @acc[4], @acc[10] + mov @acc[8], 8*8($r_ptr) + cmovc @acc[5], @acc[11] + mov @acc[9], 8*9($r_ptr) + mov @acc[10], 8*10($r_ptr) + mov @acc[11], 8*11($r_ptr) + + ret +.size __add_mod_384x384,.-__add_mod_384x384 + +.type __sub_mod_384x384,\@abi-omnipotent +.align 32 +__sub_mod_384x384: + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + mov 8*6($a_ptr), @acc[6] + + sub 8*0($b_org), @acc[0] + mov 8*7($a_ptr), @acc[7] + sbb 8*1($b_org), @acc[1] + mov 8*8($a_ptr), @acc[8] + sbb 8*2($b_org), @acc[2] + mov 8*9($a_ptr), @acc[9] + sbb 8*3($b_org), @acc[3] + mov 8*10($a_ptr), @acc[10] + sbb 8*4($b_org), @acc[4] + mov 8*11($a_ptr), @acc[11] + sbb 8*5($b_org), @acc[5] + mov @acc[0], 8*0($r_ptr) + sbb 8*6($b_org), @acc[6] + mov 8*0($n_ptr), @acc[0] + mov @acc[1], 8*1($r_ptr) + sbb 8*7($b_org), @acc[7] + mov 8*1($n_ptr), @acc[1] + mov @acc[2], 8*2($r_ptr) + sbb 8*8($b_org), @acc[8] + mov 8*2($n_ptr), @acc[2] + mov @acc[3], 8*3($r_ptr) + sbb 8*9($b_org), @acc[9] + mov 8*3($n_ptr), @acc[3] + mov @acc[4], 8*4($r_ptr) + sbb 8*10($b_org), @acc[10] + mov 8*4($n_ptr), @acc[4] + mov @acc[5], 8*5($r_ptr) + sbb 8*11($b_org), @acc[11] + mov 8*5($n_ptr), @acc[5] + sbb $b_org, $b_org + + and $b_org, @acc[0] + and $b_org, @acc[1] + and $b_org, @acc[2] + and $b_org, @acc[3] + and $b_org, @acc[4] + and $b_org, @acc[5] + + add @acc[0], @acc[6] + adc @acc[1], @acc[7] + mov @acc[6], 8*6($r_ptr) + adc @acc[2], @acc[8] + mov @acc[7], 8*7($r_ptr) + adc @acc[3], @acc[9] + mov @acc[8], 8*8($r_ptr) + adc @acc[4], @acc[10] + mov @acc[9], 8*9($r_ptr) + adc @acc[5], @acc[11] + mov @acc[10], 8*10($r_ptr) + mov @acc[11], 8*11($r_ptr) + + ret +.size __sub_mod_384x384,.-__sub_mod_384x384 + +.globl add_mod_384x384 +.hidden add_mod_384x384 +.type add_mod_384x384,\@function,4,"unwind" +.align 32 +add_mod_384x384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 
+.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + call __add_mod_384x384 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size add_mod_384x384,.-add_mod_384x384 + +.globl sub_mod_384x384 +.hidden sub_mod_384x384 +.type sub_mod_384x384,\@function,4,"unwind" +.align 32 +sub_mod_384x384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + call __sub_mod_384x384 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size sub_mod_384x384,.-sub_mod_384x384 +___ +} + +print $code; +close STDOUT; diff --git a/src/asm/inverse_mod_384-x86_64.pl b/src/asm/inverse_mod_384-x86_64.pl new file mode 100755 index 00000000..25e94067 --- /dev/null +++ b/src/asm/inverse_mod_384-x86_64.pl @@ -0,0 +1,411 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# ~3x(*) faster than dedicated modulus-specific constant-time addition +# chain, but as implied, at the cost of constant-time-ness. +# Caveat lector! Benchmarking results can be deceptive, as timings vary +# wildly from input to input. And that is depending both on set bits' +# relative positions in input and modulus, and ability of branch +# prediction logic to adapt for a specific workflow. Right thing to do +# is to benchmark with series of random inputs. +# +# (*) "up to ~3x" actually, as on adcx/adox-capable processor the +# ratio is lower, less than 2x, because 384-bit multiplications +# are relatively faster; +# +# int eucl_inverse_mod_384(vec384 ret, const vec384 inp, +# const vec384 mod, const vec384 one = NULL); +# +# If |one| is 1, then it works as plain inverse procedure. +# If |one| is (1<<768)%|mod|, then it's inverse in Montgomery +# representation domain. 
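+#
+# A usage sketch (illustrative only: the vec384 operands and the RR
+# constant are the caller's, nothing below is defined in this file):
+#
+#	vec384 ret;
+#
+#	/* plain-domain inverse; |one|==NULL defaults to 1 */
+#	eucl_inverse_mod_384(ret, inp, BLS12_381_P, NULL);
+#
+#	/* Montgomery-domain inverse: with inp == a*2^384 mod p and
+#	 * RR == (1<<768) % p, the result is a^-1 * 2^384 mod p */
+#	eucl_inverse_mod_384(ret, inp_mont, BLS12_381_P, RR);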
+ +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +($r_ptr, $a_ptr, $n_ptr, $one) = ("%rdi","%rsi","%rdx","%rcx"); +@acc=(map("%r$_",(8..15)),"%rax","%rbx","%rbp",$r_ptr); +($ux_ptr, $vy_ptr) = ($a_ptr, $one); + +$frame=8*3+4*384/8; +$U=16; +$X=$U+384/8; +$V=$X+384/8; +$Y=$V+384/8; + +$code.=<<___; +.text + +.align 32 +.Lone: + .quad 1,0,0,0,0,0,0,0 + +.globl eucl_inverse_mod_384 +.hidden eucl_inverse_mod_384 +.type eucl_inverse_mod_384,\@function,4,"unwind" +.align 32 +eucl_inverse_mod_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + mov $r_ptr, 8*0(%rsp) + lea .Lone(%rip), %rbp + cmp \$0, $one + cmove %rbp, $one # default $one to 1 + + mov 8*0($a_ptr), %rax + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + + mov %rax, @acc[0] + or @acc[1], %rax + or @acc[2], %rax + or @acc[3], %rax + or @acc[4], %rax + or @acc[5], %rax + jz .Labort # abort if |inp|==0 + + lea $U(%rsp), $ux_ptr + mov 8*0($one), @acc[6] + mov 8*1($one), @acc[7] + mov 8*2($one), @acc[8] + mov 8*3($one), @acc[9] + mov 8*4($one), @acc[10] + mov 8*5($one), @acc[11] + + mov @acc[0], 8*0($ux_ptr) # copy |inp| to U + mov @acc[1], 8*1($ux_ptr) + mov @acc[2], 8*2($ux_ptr) + mov @acc[3], 8*3($ux_ptr) + mov @acc[4], 8*4($ux_ptr) + mov @acc[5], 8*5($ux_ptr) + + lea $V(%rsp), $vy_ptr + mov 8*0($n_ptr), @acc[0] + mov 8*1($n_ptr), @acc[1] + mov 8*2($n_ptr), @acc[2] + mov 8*3($n_ptr), @acc[3] + mov 8*4($n_ptr), @acc[4] + mov 8*5($n_ptr), @acc[5] + + mov @acc[6], 8*6($ux_ptr) # copy |one| to X + mov @acc[7], 8*7($ux_ptr) + mov @acc[8], 8*8($ux_ptr) + mov @acc[9], 8*9($ux_ptr) + mov @acc[10], 8*10($ux_ptr) + mov @acc[11], 8*11($ux_ptr) + + mov @acc[0], 8*0($vy_ptr) # copy |mod| to V + mov @acc[1], 8*1($vy_ptr) + mov @acc[2], 8*2($vy_ptr) + mov @acc[3], 8*3($vy_ptr) + mov @acc[4], 8*4($vy_ptr) + mov @acc[5], 8*5($vy_ptr) + + xor %eax, %eax + mov %rax, 8*6($vy_ptr) # clear Y + mov %rax, 8*7($vy_ptr) + mov %rax, 8*8($vy_ptr) + mov %rax, 8*9($vy_ptr) + mov %rax, 8*10($vy_ptr) + mov %rax, 8*11($vy_ptr) + jmp .Loop_inv + +.align 32 +.Loop_inv: + lea $V(%rsp), $ux_ptr + call __remove_powers_of_2 + + lea $U(%rsp), $ux_ptr + call __remove_powers_of_2 + + lea $V(%rsp), $vy_ptr + sub $V+8*0(%rsp), @acc[0] # U-V + sbb 8*1($vy_ptr), @acc[1] + sbb 8*2($vy_ptr), @acc[2] + sbb 8*3($vy_ptr), @acc[3] + sbb 8*4($vy_ptr), @acc[4] + sbb 8*5($vy_ptr), @acc[5] + jae .Lu_greater_than_v # conditional pointers' swap + # doesn't help [performance + # with random inputs] + xchg $vy_ptr, $ux_ptr + + not @acc[0] # U-V => V-U + not @acc[1] + not @acc[2] + not @acc[3] + not @acc[4] + not @acc[5] + + add \$1, @acc[0] + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, @acc[5] + +.Lu_greater_than_v: + mov 8*6($ux_ptr), @acc[6] + mov 8*7($ux_ptr), @acc[7] + mov 8*8($ux_ptr), @acc[8] + mov 8*9($ux_ptr), @acc[9] + mov 
8*10($ux_ptr), @acc[10] + mov 8*11($ux_ptr), @acc[11] + + sub 8*6($vy_ptr), @acc[6] # X-Y # [alt. Y-X] + sbb 8*7($vy_ptr), @acc[7] + sbb 8*8($vy_ptr), @acc[8] + sbb 8*9($vy_ptr), @acc[9] + sbb 8*10($vy_ptr), @acc[10] + sbb 8*11($vy_ptr), @acc[11] + + mov @acc[0], 8*0($ux_ptr) + sbb @acc[0], @acc[0] # borrow -> mask + mov @acc[1], 8*1($ux_ptr) + mov @acc[0], @acc[1] + mov @acc[2], 8*2($ux_ptr) + mov @acc[0], @acc[2] + mov @acc[3], 8*3($ux_ptr) + mov @acc[0], @acc[3] + mov @acc[4], 8*4($ux_ptr) + mov @acc[0], @acc[4] + mov @acc[5], 8*5($ux_ptr) + mov @acc[0], @acc[5] + + and 8*0($n_ptr), @acc[0] + and 8*1($n_ptr), @acc[1] + and 8*2($n_ptr), @acc[2] + and 8*3($n_ptr), @acc[3] + and 8*4($n_ptr), @acc[4] + and 8*5($n_ptr), @acc[5] + + add @acc[0], @acc[6] # reduce if Xre, b->re); + #lea 0($b_btr), $b_ptr # b->re + #lea 0($a_ptr), $a_ptr # a->re + lea 40(%rsp), $r_ptr # t0 + call __mulq_384 + + ################################# mul_384(t1, a->im, b->im); + lea 48($b_ptr), $b_ptr # b->im + lea 48($a_ptr), $a_ptr # a->im + lea 40+96(%rsp), $r_ptr # t1 + call __mulq_384 + + ################################# mul_384(t2, a->re+a->im, b->re+b->im); + mov 8*1(%rsp), $n_ptr + lea -48($a_ptr), $b_org + lea 40+192+48(%rsp), $r_ptr + call __add_mod_384 + + mov 8*2(%rsp), $a_ptr + lea 48($a_ptr), $b_org + lea -48($r_ptr), $r_ptr + call __add_mod_384 + + lea ($r_ptr),$b_ptr + lea 48($r_ptr),$a_ptr + call __mulq_384 + + ################################# t2=t2-t0-t1 + lea ($r_ptr), $a_ptr # t2 + lea 40(%rsp), $b_org # t0 + mov 8*1(%rsp), $n_ptr + call __sub_mod_384x384 # t2=t2-t0 + + lea ($r_ptr), $a_ptr # t2 + lea -96($r_ptr), $b_org # t1 + call __sub_mod_384x384 # t2=t2-t1 + + ################################# t0=t0-t1 + lea 40(%rsp), $a_ptr + lea 40+96(%rsp), $b_org + lea 40(%rsp), $r_ptr + call __sub_mod_384x384 # t0-t1 + + mov $n_ptr, $b_ptr # n_ptr for redc_mont_384 + + ################################# redc_mont_384(ret->re, t0, mod, n0); + lea 40(%rsp), $a_ptr # t0 + mov 8*0(%rsp), %rcx # n0 for redc_mont_384 + mov 8*4(%rsp), $r_ptr # ret->re + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + ################################# redc_mont_384(ret->im, t2, mod, n0); + lea 40+192(%rsp), $a_ptr # t2 + mov 8*0(%rsp), %rcx # n0 for redc_mont_384 + lea 48($r_ptr), $r_ptr # ret->im + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size mul_mont_384x,.-mul_mont_384x +___ +} +{ my $frame = 4*8 + # place for argument off-load + + 2*384/8 + # place for 2 384-bit temporary vectors + 8; # align +$code.=<<___; +.globl sqr_mont_384x +.hidden sqr_mont_384x +.type sqr_mont_384x,\@function,4,"unwind" +.align 32 +sqr_mont_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + mov $n_ptr, 8*0(%rsp) # n0 + mov $b_org, $n_ptr # n_ptr + mov $a_ptr, 8*2(%rsp) + movq $r_ptr, %xmm0 + + ################################# add_mod_384(t0, a->re, a->im); + lea 48($a_ptr), $b_org # a->im + lea 32(%rsp), $r_ptr # t0 + call __add_mod_384 + + 
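The step comments immediately above and below outline the complex squaring strategy of sqr_mont_384x: because i^2 = -1, (re + im*i)^2 = (re+im)*(re-im) + 2*re*im*i, which needs only two Montgomery multiplications plus cheap additions. Below is a hedged Python reference model of that flow, with mont_mul standing in for mul_mont_384 (x*y*R^-1 mod p); all names are illustrative, not part of the patch.

```python
def sqr_fp2_ref(a_re, a_im, p, mont_mul):
    t0 = (a_re + a_im) % p                 # t0 = re + im
    t1 = (a_re - a_im) % p                 # t1 = re - im
    r_im = (2 * mont_mul(a_re, a_im)) % p  # ret->im = 2*re*im
    r_re = mont_mul(t0, t1)                # ret->re = (re+im)*(re-im) = re^2 - im^2
    return r_re, r_im
```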
################################# sub_mod_384(t1, a->re, a->im); + mov 8*2(%rsp), $a_ptr # a->re + lea 48($a_ptr), $b_org # a->im + lea 32+48(%rsp), $r_ptr # t1 + call __sub_mod_384 + + ################################# mul_mont_384(ret->im, a->re, a->im, mod, n0); + mov 8*2(%rsp), $a_ptr # a->re + lea 48($a_ptr), $b_ptr # a->im + + mov 48($a_ptr), %rax # a->im + mov 8*0($a_ptr), @acc[6] # a->re + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[4] + mov 8*3($a_ptr), @acc[5] + + call __mulq_mont_384 +___ +{ +my @acc = map("%r$_",14,15,8..11, # output from __mulq_mont_384 + 12,13,"ax","bx","bp","si"); +$code.=<<___; + add @acc[0], @acc[0] # add with itself + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + mov @acc[0], @acc[6] + adc @acc[3], @acc[3] + mov @acc[1], @acc[7] + adc @acc[4], @acc[4] + mov @acc[2], @acc[8] + adc @acc[5], @acc[5] + mov @acc[3], @acc[9] + sbb $b_org, $b_org + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + mov @acc[4], @acc[10] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], @acc[11] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, $b_org + + cmovc @acc[6], @acc[0] + cmovc @acc[7], @acc[1] + cmovc @acc[8], @acc[2] + mov @acc[0], 8*6($r_ptr) # ret->im + cmovc @acc[9], @acc[3] + mov @acc[1], 8*7($r_ptr) + cmovc @acc[10], @acc[4] + mov @acc[2], 8*8($r_ptr) + cmovc @acc[11], @acc[5] + mov @acc[3], 8*9($r_ptr) + mov @acc[4], 8*10($r_ptr) + mov @acc[5], 8*11($r_ptr) +___ +} +$code.=<<___; + ################################# mul_mont_384(ret->re, t0, t1, mod, n0); + lea 32(%rsp), $a_ptr # t0 + lea 32+48(%rsp), $b_ptr # t1 + + mov 32+48(%rsp), %rax # t1[0] + mov 32+8*0(%rsp), @acc[6] # t0[0..3] + mov 32+8*1(%rsp), @acc[7] + mov 32+8*2(%rsp), @acc[4] + mov 32+8*3(%rsp), @acc[5] + + call __mulq_mont_384 + + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size sqr_mont_384x,.-sqr_mont_384x + +.globl mul_382x +.hidden mul_382x +.type mul_382x,\@function,4,"unwind" +.align 32 +mul_382x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + lea 96($r_ptr), $r_ptr # ret->im + mov $a_ptr, 8*0(%rsp) + mov $b_org, 8*1(%rsp) + mov $r_ptr, 8*2(%rsp) # offload ret->im + mov $n_ptr, 8*3(%rsp) + + ################################# t0 = a->re + a->im + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + + add 8*6($a_ptr), @acc[0] + adc 8*7($a_ptr), @acc[1] + adc 8*8($a_ptr), @acc[2] + adc 8*9($a_ptr), @acc[3] + adc 8*10($a_ptr), @acc[4] + adc 8*11($a_ptr), @acc[5] + + mov @acc[0], 32+8*0(%rsp) + mov @acc[1], 32+8*1(%rsp) + mov @acc[2], 32+8*2(%rsp) + mov @acc[3], 32+8*3(%rsp) + mov @acc[4], 32+8*4(%rsp) + mov @acc[5], 32+8*5(%rsp) + + ################################# t1 = b->re + b->im + mov 8*0($b_org), @acc[0] + mov 8*1($b_org), @acc[1] + mov 8*2($b_org), @acc[2] + mov 8*3($b_org), @acc[3] + mov 8*4($b_org), @acc[4] + mov 8*5($b_org), @acc[5] + + add 8*6($b_org), @acc[0] + adc 8*7($b_org), @acc[1] + adc 8*8($b_org), 
@acc[2] + adc 8*9($b_org), @acc[3] + adc 8*10($b_org), @acc[4] + adc 8*11($b_org), @acc[5] + + mov @acc[0], 32+8*6(%rsp) + mov @acc[1], 32+8*7(%rsp) + mov @acc[2], 32+8*8(%rsp) + mov @acc[3], 32+8*9(%rsp) + mov @acc[4], 32+8*10(%rsp) + mov @acc[5], 32+8*11(%rsp) + + ################################# mul_384(ret->im, t0, t1); + lea 32+8*0(%rsp), $a_ptr # t0 + lea 32+8*6(%rsp), $b_ptr # t1 + call __mulq_384 + + ################################# mul_384(ret->re, a->re, b->re); + mov 8*0(%rsp), $a_ptr + mov 8*1(%rsp), $b_ptr + lea -96($r_ptr), $r_ptr # ret->re + call __mulq_384 + + ################################# mul_384(tx, a->im, b->im); + lea 48($a_ptr), $a_ptr + lea 48($b_ptr), $b_ptr + lea 32(%rsp), $r_ptr + call __mulq_384 + + ################################# ret->im -= tx + mov 8*2(%rsp), $a_ptr # restore ret->im + lea 32(%rsp), $b_org + mov 8*3(%rsp), $n_ptr + mov $a_ptr, $r_ptr + call __sub_mod_384x384 + + ################################# ret->im -= ret->re + lea 0($r_ptr), $a_ptr + lea -96($r_ptr), $b_org + call __sub_mod_384x384 + + ################################# ret->re -= tx + lea -96($r_ptr), $a_ptr + lea 32(%rsp), $b_org + lea -96($r_ptr), $r_ptr + call __sub_mod_384x384 + + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size mul_382x,.-mul_382x +___ +} +{ my @acc=(@acc,"%rax","%rbx","%rbp",$b_org); # all registers are affected + # except for $n_ptr and $r_ptr +$code.=<<___; +.globl sqr_382x +.hidden sqr_382x +.type sqr_382x,\@function,3,"unwind" +.align 32 +sqr_382x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $a_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $b_org, $n_ptr + + ################################# t0 = a->re + a->im + mov 8*0($a_ptr), @acc[6] + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[8] + mov 8*3($a_ptr), @acc[9] + mov 8*4($a_ptr), @acc[10] + mov 8*5($a_ptr), @acc[11] + + mov @acc[6], @acc[0] + add 8*6($a_ptr), @acc[6] + mov @acc[7], @acc[1] + adc 8*7($a_ptr), @acc[7] + mov @acc[8], @acc[2] + adc 8*8($a_ptr), @acc[8] + mov @acc[9], @acc[3] + adc 8*9($a_ptr), @acc[9] + mov @acc[10], @acc[4] + adc 8*10($a_ptr), @acc[10] + mov @acc[11], @acc[5] + adc 8*11($a_ptr), @acc[11] + + mov @acc[6], 8*0($r_ptr) + mov @acc[7], 8*1($r_ptr) + mov @acc[8], 8*2($r_ptr) + mov @acc[9], 8*3($r_ptr) + mov @acc[10], 8*4($r_ptr) + mov @acc[11], 8*5($r_ptr) + + ################################# t1 = a->re - a->im + lea 48($a_ptr), $b_org + lea 48($r_ptr), $r_ptr + call __sub_mod_384_a_is_loaded + + ################################# mul_384(ret->re, t0, t1); + lea ($r_ptr), $a_ptr + lea -48($r_ptr), $b_ptr + lea -48($r_ptr), $r_ptr + call __mulq_384 + + ################################# mul_384(ret->im, a->re, a->im); + mov (%rsp), $a_ptr + lea 48($a_ptr), $b_ptr + lea 96($r_ptr), $r_ptr + call __mulq_384 + + mov 8*0($r_ptr), @acc[0] # double ret->im + mov 8*1($r_ptr), @acc[1] + mov 8*2($r_ptr), @acc[2] + mov 8*3($r_ptr), @acc[3] + mov 8*4($r_ptr), @acc[4] + mov 8*5($r_ptr), @acc[5] + mov 8*6($r_ptr), @acc[6] + mov 8*7($r_ptr), @acc[7] + mov 8*8($r_ptr), @acc[8] + mov 8*9($r_ptr), @acc[9] + mov 
8*10($r_ptr), @acc[10] + add @acc[0], @acc[0] + mov 8*11($r_ptr), @acc[11] + adc @acc[1], @acc[1] + mov @acc[0], 8*0($r_ptr) + adc @acc[2], @acc[2] + mov @acc[1], 8*1($r_ptr) + adc @acc[3], @acc[3] + mov @acc[2], 8*2($r_ptr) + adc @acc[4], @acc[4] + mov @acc[3], 8*3($r_ptr) + adc @acc[5], @acc[5] + mov @acc[4], 8*4($r_ptr) + adc @acc[6], @acc[6] + mov @acc[5], 8*5($r_ptr) + adc @acc[7], @acc[7] + mov @acc[6], 8*6($r_ptr) + adc @acc[8], @acc[8] + mov @acc[7], 8*7($r_ptr) + adc @acc[9], @acc[9] + mov @acc[8], 8*8($r_ptr) + adc @acc[10], @acc[10] + mov @acc[9], 8*9($r_ptr) + adc @acc[11], @acc[11] + mov @acc[10], 8*10($r_ptr) + mov @acc[11], 8*11($r_ptr) + + mov 8*1(%rsp),%r15 +.cfi_restore %r15 + mov 8*2(%rsp),%r14 +.cfi_restore %r14 + mov 8*3(%rsp),%r13 +.cfi_restore %r13 + mov 8*4(%rsp),%r12 +.cfi_restore %r12 + mov 8*5(%rsp),%rbx +.cfi_restore %rbx + mov 8*6(%rsp),%rbp +.cfi_restore %rbp + lea 8*7(%rsp),%rsp +.cfi_adjust_cfa_offset -8*7 +.cfi_epilogue + ret +.cfi_endproc +.size sqr_382x,.-sqr_382x +___ +} +{ ########################################################## 384-bit mul +my @acc=map("%r$_",("cx",8..12)); +my $bi = "%rbp"; + +$code.=<<___; +.globl mul_384 +.hidden mul_384 +.type mul_384,\@function,3,"unwind" +.align 32 +mul_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 +.cfi_end_prologue + + mov $b_org, $b_ptr + call __mulq_384 + + mov 0(%rsp),%r12 +.cfi_restore %r12 + mov 8(%rsp),%rbx +.cfi_restore %rbx + mov 16(%rsp),%rbp +.cfi_restore %rbp + lea 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size mul_384,.-mul_384 + +.type __mulq_384,\@abi-omnipotent +.align 32 +__mulq_384: + mov 8*0($b_ptr), %rax + + mov %rax, $bi + mulq 8*0($a_ptr) + mov %rax, 8*0($r_ptr) + mov $bi, %rax + mov %rdx, @acc[0] + + mulq 8*1($a_ptr) + add %rax, @acc[0] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[1] + + mulq 8*2($a_ptr) + add %rax, @acc[1] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[2] + + mulq 8*3($a_ptr) + add %rax, @acc[2] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[3] + + mulq 8*4($a_ptr) + add %rax, @acc[3] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[4] + + mulq 8*5($a_ptr) + add %rax, @acc[4] + mov 8*1($b_ptr), %rax + adc \$0, %rdx + mov %rdx, @acc[5] +___ +for(my $i=1; $i<6; $i++) { +my $b_next = $i<5 ? 
8*($i+1)."($b_ptr)" : "%rax"; +$code.=<<___; + mov %rax, $bi + mulq 8*0($a_ptr) + add %rax, @acc[0] + mov $bi, %rax + adc \$0, %rdx + mov @acc[0], 8*$i($r_ptr) + mov %rdx, @acc[0] + + mulq 8*1($a_ptr) + add %rax, @acc[1] + mov $bi, %rax + adc \$0, %rdx + add @acc[1], @acc[0] + adc \$0, %rdx + mov %rdx, @acc[1] + + mulq 8*2($a_ptr) + add %rax, @acc[2] + mov $bi, %rax + adc \$0, %rdx + add @acc[2], @acc[1] + adc \$0, %rdx + mov %rdx, @acc[2] + + mulq 8*3($a_ptr) + add %rax, @acc[3] + mov $bi, %rax + adc \$0, %rdx + add @acc[3], @acc[2] + adc \$0, %rdx + mov %rdx, @acc[3] + + mulq 8*4($a_ptr) + add %rax, @acc[4] + mov $bi, %rax + adc \$0, %rdx + add @acc[4], @acc[3] + adc \$0, %rdx + mov %rdx, @acc[4] + + mulq 8*5($a_ptr) + add %rax, @acc[5] + mov $b_next, %rax + adc \$0, %rdx + add @acc[5], @acc[4] + adc \$0, %rdx + mov %rdx, @acc[5] +___ +} +$code.=<<___; + mov @acc[0], 8*6($r_ptr) + mov @acc[1], 8*7($r_ptr) + mov @acc[2], 8*8($r_ptr) + mov @acc[3], 8*9($r_ptr) + mov @acc[4], 8*10($r_ptr) + mov @acc[5], 8*11($r_ptr) + + ret +.size __mulq_384,.-__mulq_384 +___ +} +if (0) { ############################################################## +my @b=map("%r$_",(10..15)); +my @a=reverse(@b); + @b[5]=$b_ptr; +my $bi = "%rbp"; +my @comba=map("%r$_",("cx",8,9)); +# a[0]*b[0] +# a[1]*b[0] +# a[0]*b[1] +# a[2]*b[0] +# a[1]*b[1] +# a[0]*b[2] +# a[3]*b[0] +# a[2]*b[1] +# a[1]*b[2] +# a[0]*b[3] +# a[4]*b[0] +# a[3]*b[1] +# a[2]*b[2] +# a[1]*b[3] +# a[0]*b[4] +# a[5]*b[0] +# a[4]*b[1] +# a[3]*b[2] +# a[2]*b[3] +# a[1]*b[4] +# a[0]*b[5] +# a[5]*b[1] +# a[4]*b[2] +# a[3]*b[3] +# a[2]*b[4] +# a[1]*b[5] +# a[5]*b[2] +# a[4]*b[3] +# a[3]*b[4] +# a[2]*b[5] +# a[5]*b[3] +# a[4]*b[4] +# a[3]*b[5] +# a[5]*b[4] +# a[4]*b[5] +# a[5]*b[5] +# +# 13% less instructions give +15% on Core2, +10% on Goldmont, +# -0% on Sandy Bridge, but -16% on Haswell:-( +# [for reference +5% on Skylake, +11% on Ryzen] + +$code.=<<___; +.type __mulq_comba_384,\@abi-omnipotent +.align 32 +__mulq_comba_384: + mov 8*0($b_ptr), %rax + mov 8*0($a_ptr), @a[0] + mov 8*1($a_ptr), @a[1] + mov 8*1($b_ptr), @b[1] + + mov %rax, @b[0] + mulq @a[0] # a[0]*b[0] + mov %rax, 8*0($r_ptr) + mov @b[0], %rax + mov %rdx, @comba[0] + + ################################# + mov 8*2($a_ptr), @a[2] + xor @comba[2], @comba[2] + mulq @a[1] # a[1]*b[0] + add %rax, @comba[0] + mov @b[1], %rax + adc \$0, %rdx + mov 8*2($b_ptr), @b[2] + mov %rdx, @comba[1] + + mulq @a[0] # a[0]*b[1] + add %rax, @comba[0] + mov @b[0], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + mov @comba[0], 8*1($r_ptr) +___ + push(@comba,shift(@comba)); +$code.=<<___; + xor @comba[2], @comba[2] + mulq @a[2] # a[2]*b[0] + add %rax, @comba[0] + mov @b[1], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[1] # a[1]*b[1] + add %rax, @comba[0] + mov @b[2], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[0] # a[0]*b[2] + add %rax, @comba[0] + mov @b[0], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + mov @comba[0], 8*2($r_ptr) +___ + push(@comba,shift(@comba)); +$code.=<<___; + xor @comba[2], @comba[2] + mulq 8*3($a_ptr) # a[3]*b[0] + add %rax, @comba[0] + mov @b[1], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[2] # a[2]*b[1] + add %rax, @comba[0] + mov @b[2], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[1] # a[1]*b[2] + add %rax, @comba[0] + mov 8*3($b_ptr), %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mov %rax, @b[3] + mulq @a[0] # a[0]*b[3] + add %rax, @comba[0] + mov @b[0], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + mov @comba[0], 
8*3($r_ptr) +___ + push(@comba,shift(@comba)); +$code.=<<___; + xor @comba[2], @comba[2] + mulq 8*4($a_ptr) # a[4]*b[0] + add %rax, @comba[0] + mov @b[1], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*3($a_ptr) # a[3]*b[1] + add %rax, @comba[0] + mov @b[2], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*2($a_ptr) # a[2]*b[2] + add %rax, @comba[0] + mov @b[3], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[1] # a[1]*b[3] + add %rax, @comba[0] + mov 8*4($b_ptr), %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mov %rax, @b[4] + mulq @a[0] # a[0]*b[4] + add %rax, @comba[0] + mov @b[0], %rax + adc %rdx, @comba[1] + mov 8*5($a_ptr), @a[5] + adc \$0, @comba[2] + mov @comba[0], 8*4($r_ptr) +___ + push(@comba,shift(@comba)); +$code.=<<___; + xor @comba[2], @comba[2] + mulq @a[5] # a[5]*b[0] + add %rax, @comba[0] + mov @b[1], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*4($a_ptr) # a[4]*b[1] + add %rax, @comba[0] + mov @b[2], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*3($a_ptr) # a[3]*b[2] + add %rax, @comba[0] + mov @b[3], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*2($a_ptr) # a[2]*b[3] + add %rax, @comba[0] + mov @b[4], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*1($a_ptr) # a[1]*b[4] + add %rax, @comba[0] + mov 8*5($b_ptr), %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mov %rax, @b[5] + mulq @a[0] # a[0]*b[5] + add %rax, @comba[0] + mov @b[1], %rax + adc %rdx, @comba[1] + mov 8*4($a_ptr), @a[4] + adc \$0, @comba[2] + mov @comba[0], 8*5($r_ptr) +___ + push(@comba,shift(@comba)); +$code.=<<___; + xor @comba[2], @comba[2] + mulq @a[5] # a[5]*b[1] + add %rax, @comba[0] + mov @b[2], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[4] # a[4]*b[2] + add %rax, @comba[0] + mov @b[3], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*3($a_ptr) # a[3]*b[3] + add %rax, @comba[0] + mov @b[4], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*2($a_ptr) # a[2]*b[4] + add %rax, @comba[0] + mov @b[5], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*1($a_ptr) # a[1]*b[5] + add %rax, @comba[0] + mov $b[2], %rax + adc %rdx, @comba[1] + mov 8*3($a_ptr), @a[3] + adc \$0, @comba[2] + mov @comba[0], 8*6($r_ptr) +___ + push(@comba,shift(@comba)); +$code.=<<___; + xor @comba[2], @comba[2] + mulq @a[5] # a[5]*b[2] + add %rax, @comba[0] + mov @b[3], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[4] # a[4]*b[3] + add %rax, @comba[0] + mov @b[4], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[3] # a[3]*b[4] + add %rax, @comba[0] + mov @b[5], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*2($a_ptr) # a[2]*b[5] + add %rax, @comba[0] + mov @b[3], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + mov @comba[0], 8*7($r_ptr) +___ + push(@comba,shift(@comba)); +$code.=<<___; + xor @comba[2], @comba[2] + mulq @a[5] # a[5]*b[3] + add %rax, @comba[0] + mov @b[4], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[4] # a[4]*b[4] + add %rax, @comba[0] + mov @b[5], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[3] # a[3]*b[5] + add %rax, @comba[0] + mov @b[4], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + mov @comba[0], 8*8($r_ptr) +___ + push(@comba,shift(@comba)); +$code.=<<___; + xor @comba[2], @comba[2] + mulq @a[5] # a[5]*b[4] + add %rax, @comba[0] + mov @b[5], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[4] # a[4]*b[5] + add %rax, @comba[0] + mov @b[5], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + mov 
@comba[0], 8*9($r_ptr) +___ + push(@comba,shift(@comba)); +$code.=<<___; + mulq @a[5] # a[5]*b[4] + add %rax, @comba[0] + adc %rdx, @comba[1] + + mov @comba[0], 8*10($r_ptr) + mov @comba[1], 8*11($r_ptr) + + ret +.size __mulq_comba_384,.-__mulq_comba_384 +___ +} +{ ########################################################## 384-bit sqr +my @acc=(@acc,"%rcx","%rbx","%rbp",$a_ptr); +my $hi; + +$code.=<<___; +.globl sqr_384 +.hidden sqr_384 +.type sqr_384,\@function,2,"unwind" +.align 32 +sqr_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + call __sqrq_384 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size sqr_384,.-sqr_384 + +.type __sqrq_384,\@abi-omnipotent +.align 32 +__sqrq_384: + mov 8*0($a_ptr), %rax + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[8] + mov 8*3($a_ptr), @acc[9] + + ######################################### + mov %rax, @acc[6] + mulq @acc[7] # a[1]*a[0] + mov %rax, @acc[1] + mov @acc[6], %rax + mov 8*4($a_ptr), @acc[10] + mov %rdx, @acc[2] + + mulq @acc[8] # a[2]*a[0] + add %rax, @acc[2] + mov @acc[6], %rax + adc \$0, %rdx + mov 8*5($a_ptr), @acc[11] + mov %rdx, @acc[3] + + mulq @acc[9] # a[3]*a[0] + add %rax, @acc[3] + mov @acc[6], %rax + adc \$0, %rdx + mov %rdx, @acc[4] + + mulq @acc[10] # a[4]*a[0] + add %rax, @acc[4] + mov @acc[6], %rax + adc \$0, %rdx + mov %rdx, @acc[5] + + mulq @acc[11] # a[5]*a[0] + add %rax, @acc[5] + mov @acc[6], %rax + adc \$0, %rdx + mov %rdx, @acc[6] + + mulq %rax # a[0]*a[0] + xor @acc[0], @acc[0] + mov %rax, 8*0($r_ptr) + mov @acc[7], %rax + add @acc[1], @acc[1] # double acc[1] + adc \$0, @acc[0] + add %rdx, @acc[1] # accumulate a[0]*a[0] + adc \$0, @acc[0] # carries to a[1]*a[1] + mov @acc[1], 8*1($r_ptr) +___ +$hi=@acc[1]; +$code.=<<___; + ######################################### + mulq @acc[8] # a[2]*a[1] + add %rax, @acc[3] + mov @acc[7], %rax + adc \$0, %rdx + mov %rdx, $hi + + mulq @acc[9] # a[3]*a[1] + add %rax, @acc[4] + mov @acc[7], %rax + adc \$0, %rdx + add $hi, @acc[4] + adc \$0, %rdx + mov %rdx, $hi + + mulq @acc[10] # a[4]*a[1] + add %rax, @acc[5] + mov @acc[7], %rax + adc \$0, %rdx + add $hi, @acc[5] + adc \$0, %rdx + mov %rdx, $hi + + mulq @acc[11] # a[5]*a[1] + add %rax, @acc[6] + mov @acc[7], %rax + adc \$0, %rdx + add $hi, @acc[6] + adc \$0, %rdx + mov %rdx, @acc[7] + + mulq %rax # a[1]*a[1] + xor @acc[1], @acc[1] + add %rax, @acc[0] # can't carry + mov @acc[8], %rax + add @acc[2], @acc[2] # double acc[2:3] + adc @acc[3], @acc[3] + adc \$0, @acc[1] + add @acc[0], @acc[2] # accumulate a[1]*a[1] + adc %rdx, @acc[3] + adc \$0, @acc[1] # carries to a[2]*a[2] + mov @acc[2], 8*2($r_ptr) +___ +$hi=@acc[0]; +$code.=<<___; + ######################################### + mulq @acc[9] # a[3]*a[2] + add %rax, @acc[5] + mov @acc[8], %rax + adc \$0, %rdx + mov @acc[3], 8*3($r_ptr) + mov %rdx, $hi + + mulq @acc[10] # a[4]*a[2] + add %rax, @acc[6] + mov @acc[8], %rax + adc \$0, %rdx + add $hi, @acc[6] + adc \$0, %rdx + mov %rdx, $hi + + mulq @acc[11] # a[5]*a[2] + add %rax, @acc[7] + mov @acc[8], %rax + adc \$0, %rdx + add $hi, @acc[7] + adc \$0, %rdx + mov %rdx, 
@acc[8] + + mulq %rax # a[2]*a[2] + xor @acc[3], @acc[3] + add %rax, @acc[1] # can't carry + mov @acc[9], %rax + add @acc[4], @acc[4] # double acc[4:5] + adc @acc[5], @acc[5] + adc \$0, @acc[3] + add @acc[1], @acc[4] # accumulate a[2]*a[2] + adc %rdx, @acc[5] + adc \$0, @acc[3] # carries to a[3]*a[3] + mov @acc[4], 8*4($r_ptr) + + ######################################### + mulq @acc[10] # a[4]*a[3] + add %rax, @acc[7] + mov @acc[9], %rax + adc \$0, %rdx + mov @acc[5], 8*5($r_ptr) + mov %rdx, $hi + + mulq @acc[11] # a[5]*a[3] + add %rax, @acc[8] + mov @acc[9], %rax + adc \$0, %rdx + add $hi, @acc[8] + adc \$0, %rdx + mov %rdx, @acc[9] + + mulq %rax # a[3]*a[3] + xor @acc[4], @acc[4] + add %rax, @acc[3] # can't carry + mov @acc[10], %rax + add @acc[6], @acc[6] # double acc[6:7] + adc @acc[7], @acc[7] + adc \$0, @acc[4] + add @acc[3], @acc[6] # accumulate a[3]*a[3] + adc %rdx, @acc[7] + mov @acc[6], 8*6($r_ptr) + adc \$0, @acc[4] # carries to a[4]*a[4] + mov @acc[7], 8*7($r_ptr) + + ######################################### + mulq @acc[11] # a[5]*a[4] + add %rax, @acc[9] + mov @acc[10], %rax + adc \$0, %rdx + mov %rdx, @acc[10] + + mulq %rax # a[4]*a[4] + xor @acc[5], @acc[5] + add %rax, @acc[4] # can't carry + mov @acc[11], %rax + add @acc[8], @acc[8] # double acc[8:9] + adc @acc[9], @acc[9] + adc \$0, @acc[5] + add @acc[4], @acc[8] # accumulate a[4]*a[4] + adc %rdx, @acc[9] + mov @acc[8], 8*8($r_ptr) + adc \$0, @acc[5] # carries to a[5]*a[5] + mov @acc[9], 8*9($r_ptr) + + ######################################### + mulq %rax # a[5]*a[5] + add @acc[5], %rax # can't carry + add @acc[10], @acc[10] # double acc[10] + adc \$0, %rdx + add @acc[10], %rax # accumulate a[5]*a[5] + adc \$0, %rdx + mov %rax, 8*10($r_ptr) + mov %rdx, 8*11($r_ptr) + + ret +.size __sqrq_384,.-__sqrq_384 + +.globl sqr_mont_384 +.hidden sqr_mont_384 +.type sqr_mont_384,\@function,4,"unwind" +.align 32 +sqr_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8*15, %rsp +.cfi_adjust_cfa_offset 8*15 +.cfi_end_prologue + + mov $n_ptr, 8*12(%rsp) # n0 + mov $b_org, 8*13(%rsp) # n_ptr + mov $r_ptr, 8*14(%rsp) + + mov %rsp, $r_ptr + call __sqrq_384 + + lea 0(%rsp), $a_ptr + mov 8*12(%rsp), %rcx # n0 for mul_by_1 + mov 8*13(%rsp), $b_ptr # n_ptr for mul_by_1 + mov 8*14(%rsp), $r_ptr + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + lea 8*15(%rsp), %r8 # size optimization + mov 8*15(%rsp), %r15 +.cfi_restore %r15 + mov 8*1(%r8), %r14 +.cfi_restore %r14 + mov 8*2(%r8), %r13 +.cfi_restore %r13 + mov 8*3(%r8), %r12 +.cfi_restore %r12 + mov 8*4(%r8), %rbx +.cfi_restore %rbx + mov 8*5(%r8), %rbp +.cfi_restore %rbp + lea 8*6(%r8), %rsp +.cfi_adjust_cfa_offset -8*21 +.cfi_epilogue + ret +.cfi_endproc +.size sqr_mont_384,.-sqr_mont_384 +___ +} +{ ########################################################## 384-bit redc_mont +my ($n_ptr, $n0)=($b_ptr, $n_ptr); # arguments are "shifted" + +$code.=<<___; +######################################################################## +# void redc_mont_384(uint64_t ret[6], const uint64_t a[12], +# uint64_t m[6], uint64_t n0); +.globl redc_mont_384 +.hidden redc_mont_384 +.type redc_mont_384,\@function,4,"unwind" +.align 32 +redc_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp 
+.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $b_org, $n_ptr + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size redc_mont_384,.-redc_mont_384 + +######################################################################## +# void from_mont_384(uint64_t ret[6], const uint64_t a[6], +# uint64_t m[6], uint64_t n0); +.globl from_mont_384 +.hidden from_mont_384 +.type from_mont_384,\@function,4,"unwind" +.align 32 +from_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $b_org, $n_ptr + call __mulq_by_1_mont_384 + + ################################# + # Branch-less conditional acc[0:6] - modulus + + #mov @acc[6], %rax # __mulq_by_1_mont_384 does it + mov @acc[7], %rcx + mov @acc[0], %rdx + mov @acc[1], %rbp + + sub 8*0($n_ptr), @acc[6] + sbb 8*1($n_ptr), @acc[7] + mov @acc[2], @acc[5] + sbb 8*2($n_ptr), @acc[0] + sbb 8*3($n_ptr), @acc[1] + sbb 8*4($n_ptr), @acc[2] + mov @acc[3], $a_ptr + sbb 8*5($n_ptr), @acc[3] + + cmovc %rax, @acc[6] + cmovc %rcx, @acc[7] + cmovc %rdx, @acc[0] + mov @acc[6], 8*0($r_ptr) + cmovc %rbp, @acc[1] + mov @acc[7], 8*1($r_ptr) + cmovc @acc[5], @acc[2] + mov @acc[0], 8*2($r_ptr) + cmovc $a_ptr, @acc[3] + mov @acc[1], 8*3($r_ptr) + mov @acc[2], 8*4($r_ptr) + mov @acc[3], 8*5($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size from_mont_384,.-from_mont_384 +___ +{ my @acc=@acc; # will be rotated locally + +$code.=<<___; +.type __mulq_by_1_mont_384,\@abi-omnipotent +.align 32 +__mulq_by_1_mont_384: + mov 8*0($a_ptr), %rax + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + + mov %rax, @acc[6] + imulq $n0, %rax + mov %rax, @acc[0] +___ +for (my $i=0; $i<6; $i++) { +my $hi = @acc[6]; +$code.=<<___; + ################################# reduction $i + mulq 8*0($n_ptr) + add %rax, @acc[6] # guaranteed to be zero + mov @acc[0], %rax + adc %rdx, @acc[6] + + mulq 8*1($n_ptr) + add %rax, @acc[1] + mov @acc[0], %rax + adc \$0, %rdx + add @acc[6], @acc[1] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*2($n_ptr) + add %rax, @acc[2] + mov @acc[0], %rax + adc \$0, %rdx + add $hi, @acc[2] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*3($n_ptr) + add %rax, @acc[3] + mov @acc[0], %rax + adc \$0, %rdx +___ +$code.=<<___ if ($i<5); + mov @acc[1], @acc[7] + imulq $n0, @acc[1] +___ +$code.=<<___; + add $hi, @acc[3] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*4($n_ptr) + add %rax, @acc[4] + mov @acc[0], %rax + adc \$0, %rdx + add $hi, @acc[4] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*5($n_ptr) + add %rax, @acc[5] + mov @acc[1], %rax + adc \$0, %rdx + add $hi, @acc[5] + adc \$0, %rdx + mov %rdx, @acc[6] +___ + push(@acc,shift(@acc)); +} +$code.=<<___; + ret +.size __mulq_by_1_mont_384,.-__mulq_by_1_mont_384 + 
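For orientation, __mulq_by_1_mont_384 above performs six word-by-word Montgomery reduction rounds, and __redc_tail_mont_384 below folds in the upper half of the input and applies one conditional subtraction; together they realize the redc_mont_384 prototype quoted earlier. A hedged Python reference model of the combined effect (limb bookkeeping simplified, name illustrative):

```python
def redc_384_ref(a, m, n0):
    """Return a * R^-1 mod m for R = 2^384, with n0 = -m^-1 mod 2^64 and a < m*R."""
    W = 64
    for _ in range(6):                       # one round per 64-bit limb of R
        k = ((a & (2**W - 1)) * n0) % 2**W   # chosen so the low limb of a + k*m is zero
        a = (a + k * m) >> W                 # exact shift: the low 64 bits cancel
    return a - m if a >= m else a            # a < 2*m at this point, so one subtraction suffices
```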
+.type __redc_tail_mont_384,\@abi-omnipotent +.align 32 +__redc_tail_mont_384: + add 8*6($a_ptr), @acc[0] # accumulate upper half + mov @acc[0], %rax + adc 8*7($a_ptr), @acc[1] + adc 8*8($a_ptr), @acc[2] + adc 8*9($a_ptr), @acc[3] + mov @acc[1], %rcx + adc 8*10($a_ptr), @acc[4] + adc 8*11($a_ptr), @acc[5] + sbb @acc[6], @acc[6] + + ################################# + # Branch-less conditional acc[0:6] - modulus + + mov @acc[2], %rdx + mov @acc[3], %rbp + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + mov @acc[4], @acc[7] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], $a_ptr + sbb 8*5($n_ptr), @acc[5] + sbb \$0, @acc[6] + + cmovc %rax, @acc[0] + cmovc %rcx, @acc[1] + cmovc %rdx, @acc[2] + mov @acc[0], 8*0($r_ptr) + cmovc %rbp, @acc[3] + mov @acc[1], 8*1($r_ptr) + cmovc @acc[7], @acc[4] + mov @acc[2], 8*2($r_ptr) + cmovc $a_ptr, @acc[5] + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + ret +.size __redc_tail_mont_384,.-__redc_tail_mont_384 + +.globl sgn0_pty_mont_384 +.hidden sgn0_pty_mont_384 +.type sgn0_pty_mont_384,\@function,3,"unwind" +.align 32 +sgn0_pty_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $a_ptr, $n_ptr + lea 0($r_ptr), $a_ptr + mov $b_org, $n0 + call __mulq_by_1_mont_384 + + xor %rax, %rax + mov @acc[0], @acc[7] + add @acc[0], @acc[0] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + adc \$0, %rax + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, %rax + + not %rax # 2*x > p, which means "negative" + and \$1, @acc[7] + and \$2, %rax + or @acc[7], %rax # pack sign and parity + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size sgn0_pty_mont_384,.-sgn0_pty_mont_384 + +.globl sgn0_pty_mont_384x +.hidden sgn0_pty_mont_384x +.type sgn0_pty_mont_384x,\@function,3,"unwind" +.align 32 +sgn0_pty_mont_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $a_ptr, $n_ptr + lea 48($r_ptr), $a_ptr # sgn0(a->im) + mov $b_org, $n0 + call __mulq_by_1_mont_384 + + mov @acc[0], @acc[6] + or @acc[1], @acc[0] + or @acc[2], @acc[0] + or @acc[3], @acc[0] + or @acc[4], @acc[0] + or @acc[5], @acc[0] + + lea 0($r_ptr), $a_ptr # sgn0(a->re) + xor $r_ptr, $r_ptr + mov @acc[6], @acc[7] + add @acc[6], @acc[6] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + adc \$0, $r_ptr + + sub 8*0($n_ptr), @acc[6] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, $r_ptr + + mov @acc[0], 0(%rsp) # a->im is zero or not + not $r_ptr # 2*x > p, which means "negative" + and \$1, @acc[7] + and \$2, 
$r_ptr + or @acc[7], $r_ptr # pack sign and parity + + call __mulq_by_1_mont_384 + + mov @acc[0], @acc[6] + or @acc[1], @acc[0] + or @acc[2], @acc[0] + or @acc[3], @acc[0] + or @acc[4], @acc[0] + or @acc[5], @acc[0] + + xor %rax, %rax + mov @acc[6], @acc[7] + add @acc[6], @acc[6] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + adc \$0, %rax + + sub 8*0($n_ptr), @acc[6] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, %rax + + mov 0(%rsp), @acc[6] + + not %rax # 2*x > p, which means "negative" + + test @acc[0], @acc[0] + cmovz $r_ptr, @acc[7] # a->re==0? prty(a->im) : prty(a->re) + + test @acc[6], @acc[6] + cmovnz $r_ptr, %rax # a->im!=0? sgn0(a->im) : sgn0(a->re) + + and \$1, @acc[7] + and \$2, %rax + or @acc[7], %rax # pack sign and parity + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size sgn0_pty_mont_384x,.-sgn0_pty_mont_384x +___ +} } + +{ ########################################################## mulq_mont +my ($bi, $hi) = ("%rdi", "%rbp"); + +$code.=<<___; +.globl mul_mont_384 +.hidden mul_mont_384 +.type mul_mont_384,\@function,5,"unwind" +.align 32 +mul_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $n0 +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($b_org), %rax + mov 8*0($a_ptr), @acc[6] + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[4] + mov 8*3($a_ptr), @acc[5] + mov $b_org, $b_ptr # evacuate from %rdx + movq $r_ptr, %xmm0 + + call __mulq_mont_384 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size mul_mont_384,.-mul_mont_384 +___ +{ my @acc=@acc; # will be rotated locally + +$code.=<<___; +.type __mulq_mont_384,\@abi-omnipotent +.align 32 +__mulq_mont_384: + mov %rax, $bi + mulq @acc[6] # a[0]*b[0] + mov %rax, @acc[0] + mov $bi, %rax + mov %rdx, @acc[1] + + mulq @acc[7] # a[1]*b[0] + add %rax, @acc[1] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[2] + + mulq @acc[4] # a[2]*b[0] + add %rax, @acc[2] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[3] + + mov @acc[0], $hi + imulq 8(%rsp), @acc[0] + + mulq @acc[5] # a[3]*b[0] + add %rax, @acc[3] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[4] + + mulq 8*4($a_ptr) + add %rax, @acc[4] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[5] + + mulq 8*5($a_ptr) + add %rax, @acc[5] + mov @acc[0], %rax + adc \$0, %rdx + xor @acc[7], @acc[7] + mov %rdx, @acc[6] +___ +for (my $i=0; $i<6;) { +my $b_next = $i<5 ? 
8*($i+1)."($b_ptr)" : @acc[1]; +$code.=<<___; + ################################# reduction $i + mulq 8*0($n_ptr) + add %rax, $hi # guaranteed to be zero + mov @acc[0], %rax + adc %rdx, $hi + + mulq 8*1($n_ptr) + add %rax, @acc[1] + mov @acc[0], %rax + adc \$0, %rdx + add $hi, @acc[1] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*2($n_ptr) + add %rax, @acc[2] + mov @acc[0], %rax + adc \$0, %rdx + add $hi, @acc[2] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*3($n_ptr) + add $hi, @acc[3] + adc \$0, %rdx + add %rax, @acc[3] + mov @acc[0], %rax + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*4($n_ptr) + add %rax, @acc[4] + mov @acc[0], %rax + adc \$0, %rdx + add $hi, @acc[4] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*5($n_ptr) + add %rax, @acc[5] + mov $b_next, %rax + adc \$0, %rdx + add $hi, @acc[5] + adc %rdx, @acc[6] + adc \$0, @acc[7] +___ + push(@acc,shift(@acc)); +$code.=<<___ if ($i++<5); + ################################# Multiply by b[$i] + mov %rax, $bi + mulq 8*0($a_ptr) + add %rax, @acc[0] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[7] + + mulq 8*1($a_ptr) + add %rax, @acc[1] + mov $bi, %rax + adc \$0, %rdx + add @acc[7], @acc[1] + adc \$0, %rdx + mov %rdx, @acc[7] + + mulq 8*2($a_ptr) + add %rax, @acc[2] + mov $bi, %rax + adc \$0, %rdx + add @acc[7], @acc[2] + adc \$0, %rdx + mov %rdx, @acc[7] + + mov @acc[0], $hi + imulq 8(%rsp), @acc[0] + + mulq 8*3($a_ptr) + add %rax, @acc[3] + mov $bi, %rax + adc \$0, %rdx + add @acc[7], @acc[3] + adc \$0, %rdx + mov %rdx, @acc[7] + + mulq 8*4($a_ptr) + add %rax, @acc[4] + mov $bi, %rax + adc \$0, %rdx + add @acc[7], @acc[4] + adc \$0, %rdx + mov %rdx, @acc[7] + + mulq 8*5($a_ptr) + add @acc[7], @acc[5] + adc \$0, %rdx + xor @acc[7], @acc[7] + add %rax, @acc[5] + mov @acc[0], %rax + adc %rdx, @acc[6] + adc \$0, @acc[7] +___ +} +$code.=<<___; + ################################# + # Branch-less conditional acc[0:6] - modulus + + #mov @acc[0], %rax + movq %xmm0, $r_ptr # restore $r_ptr + sub 8*0($n_ptr), @acc[0] + mov @acc[1], %rdx + sbb 8*1($n_ptr), @acc[1] + mov @acc[2], $b_ptr + sbb 8*2($n_ptr), @acc[2] + mov @acc[3], $a_ptr + sbb 8*3($n_ptr), @acc[3] + mov @acc[4], $hi + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], @acc[7] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, @acc[6] + + cmovc %rax, @acc[0] + cmovc %rdx, @acc[1] + cmovc $b_ptr, @acc[2] + mov @acc[0], 8*0($r_ptr) + cmovc $a_ptr, @acc[3] + mov @acc[1], 8*1($r_ptr) + cmovc $hi, @acc[4] + mov @acc[2], 8*2($r_ptr) + cmovc @acc[7], @acc[5] + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + ret +.size __mulq_mont_384,.-__mulq_mont_384 +___ +} } +$code.=<<___; +.globl sqr_n_mul_mont_384 +.hidden sqr_n_mul_mont_384 +.type sqr_n_mul_mont_384,\@function,6,"unwind" +.align 32 +sqr_n_mul_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8*17, %rsp +.cfi_adjust_cfa_offset 8*17 +.cfi_end_prologue + + mov $n0, 8*0(%rsp) + mov $n_ptr, 8*1(%rsp) + movq $r_ptr, %xmm0 # to __mulq_mont_384 + lea 8*4(%rsp), $r_ptr + mov %r9, 8*3(%rsp) # 6th, multiplicand argument + movq (%r9), %xmm2 # prefetch b[0] + +.Loop_sqr_384: + movd %edx, %xmm1 # loop counter + + call __sqrq_384 + + lea 0($r_ptr), $a_ptr + mov 8*0(%rsp), %rcx # n0 for mul_by_1 + mov 8*1(%rsp), $b_ptr # n_ptr for mul_by_1 + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + movd %xmm1, %edx + lea 0($r_ptr), $a_ptr + dec %edx + jnz .Loop_sqr_384 + + movq %xmm2, %rax # b[0] + 
mov $b_ptr, $n_ptr + mov 8*3(%rsp), $b_ptr # 6th, multiplicand argument + + #mov 8*0($b_ptr), %rax + #mov 8*0($a_ptr), @acc[6] + #mov 8*1($a_ptr), @acc[7] + #mov 8*2($a_ptr), @acc[4] + #mov 8*3($a_ptr), @acc[5] + mov @acc[0], @acc[4] + mov @acc[1], @acc[5] + + call __mulq_mont_384 + + lea 8*17(%rsp), %r8 # size optimization + mov 8*17(%rsp), %r15 +.cfi_restore %r15 + mov 8*1(%r8), %r14 +.cfi_restore %r14 + mov 8*2(%r8), %r13 +.cfi_restore %r13 + mov 8*3(%r8), %r12 +.cfi_restore %r12 + mov 8*4(%r8), %rbx +.cfi_restore %rbx + mov 8*5(%r8), %rbp +.cfi_restore %rbp + lea 8*6(%r8), %rsp +.cfi_adjust_cfa_offset -8*23 +.cfi_epilogue + ret +.cfi_endproc +.size sqr_n_mul_mont_384,.-sqr_n_mul_mont_384 + +.globl sqr_n_mul_mont_383 +.hidden sqr_n_mul_mont_383 +.type sqr_n_mul_mont_383,\@function,6,"unwind" +.align 32 +sqr_n_mul_mont_383: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8*17, %rsp +.cfi_adjust_cfa_offset 8*17 +.cfi_end_prologue + + mov $n0, 8*0(%rsp) + mov $n_ptr, 8*1(%rsp) + movq $r_ptr, %xmm0 # to __mulq_mont_384 + lea 8*4(%rsp), $r_ptr + mov %r9, 8*3(%rsp) # 6th, multiplicand argument + movq (%r9), %xmm2 # prefetch b[0] + +.Loop_sqr_383: + movd %edx, %xmm1 # loop counter + + call __sqrq_384 + + lea 0($r_ptr), $a_ptr + mov 8*0(%rsp), %rcx # n0 for mul_by_1 + mov 8*1(%rsp), $b_ptr # n_ptr for mul_by_1 + call __mulq_by_1_mont_384 + + movd %xmm1, %edx # loop counter + add 8*6($a_ptr), @acc[6] # just accumulate upper half + adc 8*7($a_ptr), @acc[7] + adc 8*8($a_ptr), @acc[0] + adc 8*9($a_ptr), @acc[1] + adc 8*10($a_ptr), @acc[2] + adc 8*11($a_ptr), @acc[3] + lea 0($r_ptr), $a_ptr + + mov @acc[6], 8*0($r_ptr) # omitting full reduction gives ~5% + mov @acc[7], 8*1($r_ptr) # in addition-chains + mov @acc[0], 8*2($r_ptr) + mov @acc[1], 8*3($r_ptr) + mov @acc[2], 8*4($r_ptr) + mov @acc[3], 8*5($r_ptr) + + dec %edx + jnz .Loop_sqr_383 + + movq %xmm2, %rax # b[0] + mov $b_ptr, $n_ptr + mov 8*3(%rsp), $b_ptr # 6th, multiplicand argument + + #movq 8*0($b_ptr), %rax + #mov 8*0($a_ptr), @acc[6] + #mov 8*1($a_ptr), @acc[7] + #mov 8*2($a_ptr), @acc[4] + #mov 8*3($a_ptr), @acc[5] + mov @acc[0], @acc[4] + mov @acc[1], @acc[5] + + call __mulq_mont_384 # formally one can omit full reduction + # even after multiplication... 
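One way to read the "omitting full reduction" comments in sqr_n_mul_mont_383 above: with R = 2^384 and a modulus m satisfying 4*m <= R (which holds with room to spare for the 381-bit BLS12-381 base-field prime), a Montgomery product of operands below 2*m stays below 2*m, so partially reduced six-limb values can be fed straight into the next squaring, and only the end of the addition chain needs a canonical, fully reduced result. A hedged Python sketch of that bound follows (names illustrative; single-shot REDC instead of word-by-word):

```python
def lazy_mont_mul(a, b, m, R=1 << 384):
    """Return a*b*R^-1 mod m, deliberately not reduced below m."""
    m_neg_inv = (-pow(m, -1, R)) % R   # full-width analogue of the per-word n0
    t = a * b
    k = (t * m_neg_inv) % R
    out = (t + k * m) >> 384           # exact: t + k*m is divisible by R
    # with a, b < 2*m and 4*m <= R:  out < (4*m*m)/R + m <= 2*m
    return out
```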
+ lea 8*17(%rsp), %r8 # size optimization + mov 8*17(%rsp), %r15 +.cfi_restore %r15 + mov 8*1(%r8), %r14 +.cfi_restore %r14 + mov 8*2(%r8), %r13 +.cfi_restore %r13 + mov 8*3(%r8), %r12 +.cfi_restore %r12 + mov 8*4(%r8), %rbx +.cfi_restore %rbx + mov 8*5(%r8), %rbp +.cfi_restore %rbp + lea 8*6(%r8), %rsp +.cfi_adjust_cfa_offset -8*23 +.cfi_epilogue + ret +.cfi_endproc +.size sqr_n_mul_mont_383,.-sqr_n_mul_mont_383 +___ +{ my @acc=@acc; # will be rotated locally + my $bi = "%rbp"; + +$code.=<<___; +.type __mulq_mont_383_nonred,\@abi-omnipotent +.align 32 +__mulq_mont_383_nonred: + mov %rax, $bi + mulq @acc[6] # a[0]*b[0] + mov %rax, @acc[0] + mov $bi, %rax + mov %rdx, @acc[1] + + mulq @acc[7] # a[1]*b[0] + add %rax, @acc[1] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[2] + + mulq @acc[4] # a[2]*b[0] + add %rax, @acc[2] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[3] + + mov @acc[0], @acc[7] + imulq 8(%rsp), @acc[0] + + mulq @acc[5] # a[3]*b[0] + add %rax, @acc[3] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[4] + + mulq 8*4($a_ptr) + add %rax, @acc[4] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[5] + + mulq 8*5($a_ptr) + add %rax, @acc[5] + mov @acc[0], %rax + adc \$0, %rdx + mov %rdx, @acc[6] +___ +for (my $i=0; $i<6;) { +my $b_next = $i<5 ? 8*($i+1)."($b_ptr)" : @acc[1]; +$code.=<<___; + ################################# reduction $i + mulq 8*0($n_ptr) + add %rax, @acc[7] # guaranteed to be zero + mov @acc[0], %rax + adc %rdx, @acc[7] + + mulq 8*1($n_ptr) + add %rax, @acc[1] + mov @acc[0], %rax + adc \$0, %rdx + add @acc[7], @acc[1] + adc \$0, %rdx + mov %rdx, @acc[7] + + mulq 8*2($n_ptr) + add %rax, @acc[2] + mov @acc[0], %rax + adc \$0, %rdx + add @acc[7], @acc[2] + adc \$0, %rdx + mov %rdx, @acc[7] + + mulq 8*3($n_ptr) + add @acc[7], @acc[3] + adc \$0, %rdx + add %rax, @acc[3] + mov @acc[0], %rax + adc \$0, %rdx + mov %rdx, @acc[7] + + mulq 8*4($n_ptr) + add %rax, @acc[4] + mov @acc[0], %rax + adc \$0, %rdx + add @acc[7], @acc[4] + adc \$0, %rdx + mov %rdx, @acc[7] + + mulq 8*5($n_ptr) + add %rax, @acc[5] + mov $b_next, %rax + adc \$0, %rdx + add @acc[7], @acc[5] + adc %rdx, @acc[6] +___ + push(@acc,shift(@acc)); +$code.=<<___ if ($i++<5); + ################################# Multiply by b[$i] + mov %rax, $bi + mulq 8*0($a_ptr) + add %rax, @acc[0] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[6] + + mulq 8*1($a_ptr) + add %rax, @acc[1] + mov $bi, %rax + adc \$0, %rdx + add @acc[6], @acc[1] + adc \$0, %rdx + mov %rdx, @acc[6] + + mulq 8*2($a_ptr) + add %rax, @acc[2] + mov $bi, %rax + adc \$0, %rdx + add @acc[6], @acc[2] + adc \$0, %rdx + mov %rdx, @acc[6] + + mov @acc[0], @acc[7] + imulq 8(%rsp), @acc[0] + + mulq 8*3($a_ptr) + add %rax, @acc[3] + mov $bi, %rax + adc \$0, %rdx + add @acc[6], @acc[3] + adc \$0, %rdx + mov %rdx, @acc[6] + + mulq 8*4($a_ptr) + add %rax, @acc[4] + mov $bi, %rax + adc \$0, %rdx + add @acc[6], @acc[4] + adc \$0, %rdx + mov %rdx, @acc[6] + + mulq 8*5($a_ptr) + add @acc[6], @acc[5] + adc \$0, %rdx + add %rax, @acc[5] + mov @acc[0], %rax + adc \$0, %rdx + mov %rdx, @acc[6] +___ +} +$code.=<<___; + ret +.size __mulq_mont_383_nonred,.-__mulq_mont_383_nonred +___ +} +{ my $frame = 4*8 + # place for argument off-load + + 2*384/8 + # place for 2 384-bit temporary vectors + 8; # align +my @acc = (@acc,"%rax","%rdx","%rbx","%rbp"); + +# omitting 3 reductions gives 8-11% better performance in add-chains +$code.=<<___; +.globl sqr_mont_382x +.hidden sqr_mont_382x +.type sqr_mont_382x,\@function,4,"unwind" +.align 32 +sqr_mont_382x: +.cfi_startproc + push 
%rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + mov $n_ptr, 8*0(%rsp) # n0 + mov $b_org, $n_ptr # n_ptr + mov $a_ptr, 8*2(%rsp) + mov $r_ptr, 8*3(%rsp) + + ################################# + mov 8*0($a_ptr), @acc[0] # a->re + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + + mov @acc[0], @acc[6] + add 8*6($a_ptr), @acc[0] # a->re + a->im + mov @acc[1], @acc[7] + adc 8*7($a_ptr), @acc[1] + mov @acc[2], @acc[8] + adc 8*8($a_ptr), @acc[2] + mov @acc[3], @acc[9] + adc 8*9($a_ptr), @acc[3] + mov @acc[4], @acc[10] + adc 8*10($a_ptr), @acc[4] + mov @acc[5], @acc[11] + adc 8*11($a_ptr), @acc[5] + + sub 8*6($a_ptr), @acc[6] # a->re - a->im + sbb 8*7($a_ptr), @acc[7] + sbb 8*8($a_ptr), @acc[8] + sbb 8*9($a_ptr), @acc[9] + sbb 8*10($a_ptr), @acc[10] + sbb 8*11($a_ptr), @acc[11] + sbb $r_ptr, $r_ptr # borrow flag as mask + + mov @acc[0], 32+8*0(%rsp) # t0 + mov @acc[1], 32+8*1(%rsp) + mov @acc[2], 32+8*2(%rsp) + mov @acc[3], 32+8*3(%rsp) + mov @acc[4], 32+8*4(%rsp) + mov @acc[5], 32+8*5(%rsp) + + mov @acc[6], 32+8*6(%rsp) # t1 + mov @acc[7], 32+8*7(%rsp) + mov @acc[8], 32+8*8(%rsp) + mov @acc[9], 32+8*9(%rsp) + mov @acc[10], 32+8*10(%rsp) + mov @acc[11], 32+8*11(%rsp) + mov $r_ptr, 32+8*12(%rsp) + + ################################# mul_mont_384(ret->im, a->re, a->im, mod, n0); + #mov 8*2(%rsp), $a_ptr # a->re + lea 48($a_ptr), $b_ptr # a->im + + mov 48($a_ptr), %rax # a->im + mov 8*0($a_ptr), @acc[6] # a->re + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[4] + mov 8*3($a_ptr), @acc[5] + + mov 8*3(%rsp), $r_ptr + call __mulq_mont_383_nonred +___ +{ +my @acc = map("%r$_",14,15,8..11, # output from __mulq_mont_384 + 12,13,"ax","bx","bp","si"); +$code.=<<___; + add @acc[0], @acc[0] # add with itself + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + + mov @acc[0], 8*6($r_ptr) # ret->im + mov @acc[1], 8*7($r_ptr) + mov @acc[2], 8*8($r_ptr) + mov @acc[3], 8*9($r_ptr) + mov @acc[4], 8*10($r_ptr) + mov @acc[5], 8*11($r_ptr) +___ +} +$code.=<<___; + ################################# mul_mont_384(ret->re, t0, t1, mod, n0); + lea 32(%rsp), $a_ptr # t0 + lea 32+8*6(%rsp), $b_ptr # t1 + + mov 32+8*6(%rsp), %rax # t1[0] + mov 32+8*0(%rsp), @acc[6] # t0[0..3] + mov 32+8*1(%rsp), @acc[7] + mov 32+8*2(%rsp), @acc[4] + mov 32+8*3(%rsp), @acc[5] + + call __mulq_mont_383_nonred +___ +{ +my @acc = map("%r$_",14,15,8..11, # output from __mulq_mont_384 + 12,13,"ax","bx","bp","si"); +$code.=<<___; + mov 32+8*12(%rsp), @acc[11] # account for sign from a->re - a->im + mov 32+8*0(%rsp), @acc[6] + mov 32+8*1(%rsp), @acc[7] + and @acc[11], @acc[6] + mov 32+8*2(%rsp), @acc[8] + and @acc[11], @acc[7] + mov 32+8*3(%rsp), @acc[9] + and @acc[11], @acc[8] + mov 32+8*4(%rsp), @acc[10] + and @acc[11], @acc[9] + and @acc[11], @acc[10] + and 32+8*5(%rsp), @acc[11] + + sub @acc[6], @acc[0] + mov 8*0($n_ptr), @acc[6] + sbb @acc[7], @acc[1] + mov 8*1($n_ptr), @acc[7] + sbb @acc[8], @acc[2] + mov 8*2($n_ptr), @acc[8] + sbb @acc[9], @acc[3] + mov 8*3($n_ptr), @acc[9] + sbb @acc[10], @acc[4] + mov 8*4($n_ptr), @acc[10] + sbb @acc[11], @acc[5] + sbb @acc[11], @acc[11] + + and @acc[11], @acc[6] + and @acc[11], @acc[7] + and @acc[11], @acc[8] + and @acc[11], @acc[9] + and @acc[11], @acc[10] + and 8*5($n_ptr), 
@acc[11] + + add @acc[6], @acc[0] + adc @acc[7], @acc[1] + adc @acc[8], @acc[2] + adc @acc[9], @acc[3] + adc @acc[10], @acc[4] + adc @acc[11], @acc[5] + + mov @acc[0], 8*0($r_ptr) # ret->re + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) +___ +} +$code.=<<___; + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size sqr_mont_382x,.-sqr_mont_382x +___ +} + +print $code; +close STDOUT; diff --git a/src/asm/mulx_mont_256-x86_64.pl b/src/asm/mulx_mont_256-x86_64.pl new file mode 100755 index 00000000..8156402d --- /dev/null +++ b/src/asm/mulx_mont_256-x86_64.pl @@ -0,0 +1,472 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# "Sparse" in subroutine names refers to most significant limb of the +# modulus. Though "sparse" is a bit of misnomer, because limitation is +# just not-all-ones. Just in case, why limitation at all and not a +# general-purpose 256-bit subroutines? Unlike 384-bit case, accounting +# for additional carry has disproportionate impact on performance, +# especially in adcx/adox implementation. + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +# common argument layout +($r_ptr,$a_ptr,$b_org,$n_ptr,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8"); +$b_ptr = "%rbx"; + +{ ############################################################## 255 bits +my @acc=map("%r$_",(10..15)); + +{ ############################################################## mulq +my ($lo,$hi)=("%rbp","%r9"); + +$code.=<<___; +.text + +.globl mulx_mont_sparse_256 +.hidden mulx_mont_sparse_256 +.type mulx_mont_sparse_256,\@function,5,"unwind" +.align 32 +mulx_mont_sparse_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8,%rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $b_org, $b_ptr # evacuate from %rdx + mov 8*0($b_org), %rdx + mov 8*0($a_ptr), @acc[4] + mov 8*1($a_ptr), @acc[5] + mov 8*2($a_ptr), $lo + mov 8*3($a_ptr), $hi + lea -128($a_ptr), $a_ptr # control u-op density + lea -128($n_ptr), $n_ptr # control u-op density + + mulx @acc[4], %rax, @acc[1] # a[0]*b[0] + call __mulx_mont_sparse_256 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size mulx_mont_sparse_256,.-mulx_mont_sparse_256 + +.globl sqrx_mont_sparse_256 +.hidden sqrx_mont_sparse_256 +.type 
sqrx_mont_sparse_256,\@function,4,"unwind" +.align 32 +sqrx_mont_sparse_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8,%rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $a_ptr, $b_ptr + mov $n_ptr, $n0 + mov $b_org, $n_ptr + mov 8*0($a_ptr), %rdx + mov 8*1($a_ptr), @acc[5] + mov 8*2($a_ptr), $lo + mov 8*3($a_ptr), $hi + lea -128($b_ptr), $a_ptr # control u-op density + lea -128($n_ptr), $n_ptr # control u-op density + + mulx %rdx, %rax, @acc[1] # a[0]*a[0] + call __mulx_mont_sparse_256 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size sqrx_mont_sparse_256,.-sqrx_mont_sparse_256 +___ +{ +my @acc=@acc; +$code.=<<___; +.type __mulx_mont_sparse_256,\@abi-omnipotent +.align 32 +__mulx_mont_sparse_256: + mulx @acc[5], @acc[5], @acc[2] + mulx $lo, $lo, @acc[3] + add @acc[5], @acc[1] + mulx $hi, $hi, @acc[4] + mov 8($b_ptr), %rdx + adc $lo, @acc[2] + adc $hi, @acc[3] + adc \$0, @acc[4] + +___ +for (my $i=1; $i<4; $i++) { +my $b_next = $i<3 ? 8*($i+1)."($b_ptr)" : "%rax"; +my $a5 = $i==1 ? @acc[5] : $lo; +$code.=<<___; + mov %rax, @acc[0] + imulq $n0, %rax + + ################################# Multiply by b[$i] + xor $a5, $a5 # [@acc[5]=0,] cf=0, of=0 + mulx 8*0+128($a_ptr), $lo, $hi + adox $lo, @acc[1] + adcx $hi, @acc[2] + + mulx 8*1+128($a_ptr), $lo, $hi + adox $lo, @acc[2] + adcx $hi, @acc[3] + + mulx 8*2+128($a_ptr), $lo, $hi + adox $lo, @acc[3] + adcx $hi, @acc[4] + + mulx 8*3+128($a_ptr), $lo, $hi + mov %rax, %rdx + adox $lo, @acc[4] + adcx @acc[5], $hi # cf=0 + adox $hi, @acc[5] # of=0 + + ################################# reduction + mulx 8*0+128($n_ptr), $lo, %rax + adcx $lo, @acc[0] # guaranteed to be zero + adox @acc[1], %rax + + mulx 8*1+128($n_ptr), $lo, $hi + adcx $lo, %rax # @acc[1] + adox $hi, @acc[2] + + mulx 8*2+128($n_ptr), $lo, $hi + adcx $lo, @acc[2] + adox $hi, @acc[3] + + mulx 8*3+128($n_ptr), $lo, $hi + mov $b_next, %rdx + adcx $lo, @acc[3] + adox $hi, @acc[4] + adcx @acc[0], @acc[4] + adox @acc[0], @acc[5] + adcx @acc[0], @acc[5] + adox @acc[0], @acc[0] # acc[5] in next iteration + adc \$0, @acc[0] # cf=0, of=0 +___ + push(@acc,shift(@acc)); +} +$code.=<<___; + imulq $n0, %rdx + + ################################# last reduction + xor $lo, $lo # cf=0, of=0 + mulx 8*0+128($n_ptr), @acc[0], $hi + adcx %rax, @acc[0] # guaranteed to be zero + adox $hi, @acc[1] + + mulx 8*1+128($n_ptr), $lo, $hi + adcx $lo, @acc[1] + adox $hi, @acc[2] + + mulx 8*2+128($n_ptr), $lo, $hi + adcx $lo, @acc[2] + adox $hi, @acc[3] + + mulx 8*3+128($n_ptr), $lo, $hi + mov @acc[1], %rdx + lea 128($n_ptr), $n_ptr + adcx $lo, @acc[3] + adox $hi, @acc[4] + mov @acc[2], %rax + adcx @acc[0], @acc[4] + adox @acc[0], @acc[5] + adc \$0, @acc[5] + + ################################# + # Branch-less conditional acc[1:5] - modulus + + mov @acc[3], $lo + sub 8*0($n_ptr), @acc[1] + sbb 8*1($n_ptr), @acc[2] + sbb 8*2($n_ptr), @acc[3] + mov @acc[4], $hi + sbb 8*3($n_ptr), @acc[4] + sbb \$0, @acc[5] + + cmovc %rdx, @acc[1] + cmovc %rax, @acc[2] + cmovc $lo, @acc[3] + mov @acc[1], 8*0($r_ptr) + cmovc $hi, @acc[4] + mov @acc[2], 8*1($r_ptr) + mov @acc[3], 8*2($r_ptr) + mov @acc[4], 8*3($r_ptr) 
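+	# Editorial note, not part of the original submission: the sub/sbb
+	# chain with cmovc just above is a constant-time final reduction.
+	# A minimal C-style sketch of the idea, assuming a 4-limb
+	# accumulator acc[] and modulus mod[]:
+	#
+	#	t = acc - mod;             /* multi-limb subtract, note borrow */
+	#	ret = borrow ? acc : t;    /* selected via cmovc, no branch    */
+	#
+	# so the value stored through $r_ptr is acc fully reduced modulo
+	# the modulus, chosen without a data-dependent branch.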
+ + ret +.size __mulx_mont_sparse_256,.-__mulx_mont_sparse_256 +___ +} } +{ my ($n_ptr, $n0)=($b_ptr, $n_ptr); # arguments are "shifted" + +$code.=<<___; +.globl fromx_mont_256 +.hidden fromx_mont_256 +.type fromx_mont_256,\@function,4,"unwind" +.align 32 +fromx_mont_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $b_org, $n_ptr + call __mulx_by_1_mont_256 + + ################################# + # Branch-less conditional acc[0:3] - modulus + + #mov @acc[4], %rax # __mulq_by_1_mont_256 does it + mov @acc[5], %rdx + mov @acc[0], @acc[2] + mov @acc[1], @acc[3] + + sub 8*0($n_ptr), @acc[4] + sbb 8*1($n_ptr), @acc[5] + sbb 8*2($n_ptr), @acc[0] + sbb 8*3($n_ptr), @acc[1] + + cmovnc @acc[4], %rax + cmovnc @acc[5], %rdx + cmovnc @acc[0], @acc[2] + mov %rax, 8*0($r_ptr) + cmovnc @acc[1], @acc[3] + mov %rdx, 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size fromx_mont_256,.-fromx_mont_256 + +.globl redcx_mont_256 +.hidden redcx_mont_256 +.type redcx_mont_256,\@function,4,"unwind" +.align 32 +redcx_mont_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $b_org, $n_ptr + call __mulx_by_1_mont_256 + + add 8*4($a_ptr), @acc[4] # accumulate upper half + adc 8*5($a_ptr), @acc[5] + mov @acc[4], %rax + adc 8*6($a_ptr), @acc[0] + mov @acc[5], %rdx + adc 8*7($a_ptr), @acc[1] + sbb $a_ptr, $a_ptr + + ################################# + # Branch-less conditional acc[0:4] - modulus + + mov @acc[0], @acc[2] + sub 8*0($n_ptr), @acc[4] + sbb 8*1($n_ptr), @acc[5] + sbb 8*2($n_ptr), @acc[0] + mov @acc[1], @acc[3] + sbb 8*3($n_ptr), @acc[1] + sbb \$0, $a_ptr + + cmovnc @acc[4], %rax + cmovnc @acc[5], %rdx + cmovnc @acc[0], @acc[2] + mov %rax, 8*0($r_ptr) + cmovnc @acc[1], @acc[3] + mov %rdx, 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size redcx_mont_256,.-redcx_mont_256 +___ +{ +my @acc=@acc; + +$code.=<<___; +.type __mulx_by_1_mont_256,\@abi-omnipotent +.align 32 +__mulx_by_1_mont_256: + mov 8*0($a_ptr), %rax + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + + mov %rax, @acc[4] + imulq $n0, %rax + mov %rax, @acc[0] +___ +for (my $i=0; $i<4; $i++) { +my $hi = @acc[4]; +$code.=<<___; + ################################# reduction $i + mulq 8*0($n_ptr) + add %rax, @acc[4] # guaranteed to be zero + mov @acc[0], %rax + adc %rdx, @acc[4] + + mulq 8*1($n_ptr) + add %rax, @acc[1] + mov @acc[0], %rax + adc \$0, %rdx + add @acc[4], @acc[1] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*2($n_ptr) +___ +$code.=<<___ if 
($i<3); + mov @acc[1], @acc[5] + imulq $n0, @acc[1] +___ +$code.=<<___; + add %rax, @acc[2] + mov @acc[0], %rax + adc \$0, %rdx + add $hi, @acc[2] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*3($n_ptr) + add %rax, @acc[3] + mov @acc[1], %rax + adc \$0, %rdx + add $hi, @acc[3] + adc \$0, %rdx + mov %rdx, @acc[4] +___ + push(@acc,shift(@acc)); +} +$code.=<<___; + ret +.size __mulx_by_1_mont_256,.-__mulx_by_1_mont_256 +___ +} } } + +print $code; +close STDOUT; diff --git a/src/asm/mulx_mont_384-x86_64.pl b/src/asm/mulx_mont_384-x86_64.pl new file mode 100755 index 00000000..083c2509 --- /dev/null +++ b/src/asm/mulx_mont_384-x86_64.pl @@ -0,0 +1,2385 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +# common argument layout +($r_ptr,$a_ptr,$b_org,$n_ptr,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8"); +$b_ptr = "%rbx"; + +# common accumulator layout +@acc=map("%r$_",(8..15)); + +######################################################################## +{ my @acc=(@acc,"%rax","%rbx","%rbp",$a_ptr); # all registers are affected + # except for $n_ptr and $r_ptr +$code.=<<___; +.text + +######################################################################## +# Double-width subtraction modulo n<<384, as opposite to naively +# expected modulo n*n. It works because n<<384 is the actual +# input boundary condition for Montgomery reduction, not n*n. +# Just in case, this is duplicated, but only one module is +# supposed to be linked... 
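+#
+# Editorial sketch, not part of the original submission: a C-style
+# reference model of the routine below, assuming 64-bit little-endian
+# limbs and a, b < n<<384 on input:
+#
+#	void sub_mod_384x384_ref(uint64_t r[12], const uint64_t a[12],
+#	                         const uint64_t b[12], const uint64_t n[6])
+#	{
+#	    unsigned __int128 t;
+#	    uint64_t borrow = 0, carry = 0, mask;
+#
+#	    for (int i = 0; i < 12; i++) {      /* plain 768-bit subtract */
+#	        t = (unsigned __int128)a[i] - b[i] - borrow;
+#	        r[i] = (uint64_t)t;
+#	        borrow = (uint64_t)(t >> 64) & 1;
+#	    }
+#	    mask = 0 - borrow;                  /* all-ones if a < b      */
+#	    for (int i = 0; i < 6; i++) {       /* add back n<<384        */
+#	        t = (unsigned __int128)r[6+i] + (n[i] & mask) + carry;
+#	        r[6+i] = (uint64_t)t;
+#	        carry = (uint64_t)(t >> 64);
+#	    }
+#	}
+#
+# i.e. on borrow the result is corrected by n<<384 rather than by n*n,
+# which is all that Montgomery reduction of the double-width value
+# requires.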
+.type __sub_mod_384x384,\@abi-omnipotent +.align 32 +__sub_mod_384x384: + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + mov 8*6($a_ptr), @acc[6] + + sub 8*0($b_org), @acc[0] + mov 8*7($a_ptr), @acc[7] + sbb 8*1($b_org), @acc[1] + mov 8*8($a_ptr), @acc[8] + sbb 8*2($b_org), @acc[2] + mov 8*9($a_ptr), @acc[9] + sbb 8*3($b_org), @acc[3] + mov 8*10($a_ptr), @acc[10] + sbb 8*4($b_org), @acc[4] + mov 8*11($a_ptr), @acc[11] + sbb 8*5($b_org), @acc[5] + mov @acc[0], 8*0($r_ptr) + sbb 8*6($b_org), @acc[6] + mov 8*0($n_ptr), @acc[0] + mov @acc[1], 8*1($r_ptr) + sbb 8*7($b_org), @acc[7] + mov 8*1($n_ptr), @acc[1] + mov @acc[2], 8*2($r_ptr) + sbb 8*8($b_org), @acc[8] + mov 8*2($n_ptr), @acc[2] + mov @acc[3], 8*3($r_ptr) + sbb 8*9($b_org), @acc[9] + mov 8*3($n_ptr), @acc[3] + mov @acc[4], 8*4($r_ptr) + sbb 8*10($b_org), @acc[10] + mov 8*4($n_ptr), @acc[4] + mov @acc[5], 8*5($r_ptr) + sbb 8*11($b_org), @acc[11] + mov 8*5($n_ptr), @acc[5] + sbb $b_org, $b_org + + and $b_org, @acc[0] + and $b_org, @acc[1] + and $b_org, @acc[2] + and $b_org, @acc[3] + and $b_org, @acc[4] + and $b_org, @acc[5] + + add @acc[0], @acc[6] + adc @acc[1], @acc[7] + mov @acc[6], 8*6($r_ptr) + adc @acc[2], @acc[8] + mov @acc[7], 8*7($r_ptr) + adc @acc[3], @acc[9] + mov @acc[8], 8*8($r_ptr) + adc @acc[4], @acc[10] + mov @acc[9], 8*9($r_ptr) + adc @acc[5], @acc[11] + mov @acc[10], 8*10($r_ptr) + mov @acc[11], 8*11($r_ptr) + + ret +.size __sub_mod_384x384,.-__sub_mod_384x384 + +.type __add_mod_384,\@abi-omnipotent +.align 32 +__add_mod_384: + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + + add 8*0($b_org), @acc[0] + adc 8*1($b_org), @acc[1] + adc 8*2($b_org), @acc[2] + mov @acc[0], @acc[6] + adc 8*3($b_org), @acc[3] + mov @acc[1], @acc[7] + adc 8*4($b_org), @acc[4] + mov @acc[2], @acc[8] + adc 8*5($b_org), @acc[5] + mov @acc[3], @acc[9] + sbb $b_org, $b_org + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + mov @acc[4], @acc[10] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], @acc[11] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, $b_org + + cmovc @acc[6], @acc[0] + cmovc @acc[7], @acc[1] + cmovc @acc[8], @acc[2] + mov @acc[0], 8*0($r_ptr) + cmovc @acc[9], @acc[3] + mov @acc[1], 8*1($r_ptr) + cmovc @acc[10], @acc[4] + mov @acc[2], 8*2($r_ptr) + cmovc @acc[11], @acc[5] + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + ret +.size __add_mod_384,.-__add_mod_384 + +.type __sub_mod_384,\@abi-omnipotent +.align 32 +__sub_mod_384: + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + +__sub_mod_384_a_is_loaded: + sub 8*0($b_org), @acc[0] + mov 8*0($n_ptr), @acc[6] + sbb 8*1($b_org), @acc[1] + mov 8*1($n_ptr), @acc[7] + sbb 8*2($b_org), @acc[2] + mov 8*2($n_ptr), @acc[8] + sbb 8*3($b_org), @acc[3] + mov 8*3($n_ptr), @acc[9] + sbb 8*4($b_org), @acc[4] + mov 8*4($n_ptr), @acc[10] + sbb 8*5($b_org), @acc[5] + mov 8*5($n_ptr), @acc[11] + sbb $b_org, $b_org + + and $b_org, @acc[6] + and $b_org, @acc[7] + and $b_org, @acc[8] + and $b_org, @acc[9] + and $b_org, @acc[10] + and $b_org, @acc[11] + + add @acc[6], @acc[0] + adc @acc[7], @acc[1] + mov @acc[0], 8*0($r_ptr) + adc @acc[8], @acc[2] + mov @acc[1], 8*1($r_ptr) + adc @acc[9], 
@acc[3] + mov @acc[2], 8*2($r_ptr) + adc @acc[10], @acc[4] + mov @acc[3], 8*3($r_ptr) + adc @acc[11], @acc[5] + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + ret +.size __sub_mod_384,.-__sub_mod_384 +___ +} + +######################################################################## +# "Complex" multiplication and squaring. Use vanilla multiplication when +# possible to fold reductions. I.e. instead of mul_mont, mul_mont +# followed by add/sub_mod, it calls mul, mul, double-width add/sub_mod +# followed by *common* reduction... For single multiplication disjoint +# reduction is bad for performance for given vector length, yet overall +# it's a win here, because it's one reduction less. +{ my $frame = 5*8 + # place for argument off-load + + 3*768/8; # place for 3 768-bit temporary vectors +$code.=<<___; +.globl mulx_mont_384x +.hidden mulx_mont_384x +.type mulx_mont_384x,\@function,5,"unwind" +.align 32 +mulx_mont_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + mov $b_org, $b_ptr + mov $r_ptr, 8*4(%rsp) # offload arguments + mov $a_ptr, 8*3(%rsp) + mov $b_org, 8*2(%rsp) + mov $n_ptr, 8*1(%rsp) + mov $n0, 8*0(%rsp) + + ################################# mul_384(t0, a->re, b->re); + #lea 0($b_btr), $b_ptr # b->re + #lea 0($a_ptr), $a_ptr # a->re + lea 40(%rsp), $r_ptr # t0 + call __mulx_384 + + ################################# mul_384(t1, a->im, b->im); + lea 48($b_ptr), $b_ptr # b->im + lea 128+48($a_ptr), $a_ptr # a->im + lea 96($r_ptr), $r_ptr # t1 + call __mulx_384 + + ################################# mul_384(t2, a->re+a->im, b->re+b->im); + mov 8*1(%rsp), $n_ptr + lea ($b_ptr), $a_ptr # b->re + lea -48($b_ptr), $b_org # b->im + lea 40+192+48(%rsp), $r_ptr + call __add_mod_384 + + mov 8*3(%rsp), $a_ptr # a->re + lea 48($a_ptr), $b_org # a->im + lea -48($r_ptr), $r_ptr + call __add_mod_384 + + lea ($r_ptr),$b_ptr + lea 48($r_ptr),$a_ptr + call __mulx_384 + + ################################# t2=t2-t0-t1 + lea ($r_ptr), $a_ptr # t2 + lea 40(%rsp), $b_org # t0 + mov 8*1(%rsp), $n_ptr + call __sub_mod_384x384 # t2-t0 + + lea ($r_ptr), $a_ptr # t2 + lea -96($r_ptr), $b_org # t1 + call __sub_mod_384x384 # t2-t0-t1 + + ################################# t0=t0-t1 + lea 40(%rsp), $a_ptr + lea 40+96(%rsp), $b_org + lea 40(%rsp), $r_ptr + call __sub_mod_384x384 # t0-t1 + + lea ($n_ptr), $b_ptr # n_ptr for redc_mont_384 + + ################################# redc_mont_384(ret->re, t0, mod, n0); + lea 40(%rsp), $a_ptr # t0 + mov 8*0(%rsp), %rcx # n0 for redc_mont_384 + mov 8*4(%rsp), $r_ptr # ret->re + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + ################################# redc_mont_384(ret->im, t2, mod, n0); + lea 40+192(%rsp), $a_ptr # t2 + mov 8*0(%rsp), %rcx # n0 for redc_mont_384 + lea 48($r_ptr), $r_ptr # ret->im + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size mulx_mont_384x,.-mulx_mont_384x +___ +} +{ my $frame = 4*8 + # place for argument off-load + + 2*384/8 + # place for 2 
384-bit temporary vectors + 8; # alignment +$code.=<<___; +.globl sqrx_mont_384x +.hidden sqrx_mont_384x +.type sqrx_mont_384x,\@function,4,"unwind" +.align 32 +sqrx_mont_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + mov $n_ptr, 8*0(%rsp) # n0 + mov $b_org, $n_ptr # n_ptr + # gap for __mulx_mont_384 + mov $a_ptr, 8*2(%rsp) + movq $r_ptr, %xmm0 + + ################################# add_mod_384(t0, a->re, a->im); + lea 48($a_ptr), $b_org # a->im + lea 32(%rsp), $r_ptr # t0 + call __add_mod_384 + + ################################# sub_mod_384(t1, a->re, a->im); + mov 8*2(%rsp), $a_ptr # a->re + lea 48($a_ptr), $b_org # a->im + lea 32+48(%rsp), $r_ptr # t1 + call __sub_mod_384 + + ################################# mul_mont_384(ret->im, a->re, a->im, mod, n0); + mov 8*2(%rsp), $a_ptr # a->re + lea 48($a_ptr), $b_ptr # a->im + + mov 48($a_ptr), %rdx + mov 8*0($a_ptr), %r14 # @acc[6] + mov 8*1($a_ptr), %r15 # @acc[7] + mov 8*2($a_ptr), %rax # @acc[8] + mov 8*3($a_ptr), %r12 # @acc[4] + mov 8*4($a_ptr), %rdi # $lo + mov 8*5($a_ptr), %rbp # $hi + lea -128($a_ptr), $a_ptr # control u-op density + lea -128($n_ptr), $n_ptr # control u-op density + + mulx %r14, %r8, %r9 + call __mulx_mont_384 +___ +{ +my @acc = map("%r$_","dx",15,"ax",12,"di","bp", # output from __mulx_mont_384 + 8..11,13,14); +$code.=<<___; + add @acc[0], @acc[0] # add with itself + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + mov @acc[0], @acc[6] + adc @acc[3], @acc[3] + mov @acc[1], @acc[7] + adc @acc[4], @acc[4] + mov @acc[2], @acc[8] + adc @acc[5], @acc[5] + mov @acc[3], @acc[9] + sbb $a_ptr, $a_ptr + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + mov @acc[4], @acc[10] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], @acc[11] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, $a_ptr + + cmovc @acc[6], @acc[0] + cmovc @acc[7], @acc[1] + cmovc @acc[8], @acc[2] + mov @acc[0], 8*6($b_ptr) # ret->im + cmovc @acc[9], @acc[3] + mov @acc[1], 8*7($b_ptr) + cmovc @acc[10], @acc[4] + mov @acc[2], 8*8($b_ptr) + cmovc @acc[11], @acc[5] + mov @acc[3], 8*9($b_ptr) + mov @acc[4], 8*10($b_ptr) + mov @acc[5], 8*11($b_ptr) +___ +} +$code.=<<___; + ################################# mul_mont_384(ret->re, t0, t1, mod, n0); + lea 32(%rsp), $a_ptr # t0 + lea 32+48(%rsp), $b_ptr # t1 + + mov 32+48(%rsp), %rdx # t1[0] + mov 32+8*0(%rsp), %r14 # @acc[6] + mov 32+8*1(%rsp), %r15 # @acc[7] + mov 32+8*2(%rsp), %rax # @acc[8] + mov 32+8*3(%rsp), %r12 # @acc[4] + mov 32+8*4(%rsp), %rdi # $lo + mov 32+8*5(%rsp), %rbp # $hi + lea -128($a_ptr), $a_ptr # control u-op density + lea -128($n_ptr), $n_ptr # control u-op density + + mulx %r14, %r8, %r9 + call __mulx_mont_384 + + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size sqrx_mont_384x,.-sqrx_mont_384x + +.globl mulx_382x +.hidden mulx_382x +.type mulx_382x,\@function,4,"unwind" +.align 32 +mulx_382x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push 
%r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + lea 96($r_ptr), $r_ptr # ret->im + mov $a_ptr, 8*0(%rsp) + mov $b_org, 8*1(%rsp) + mov $r_ptr, 8*2(%rsp) # offload ret->im + mov $n_ptr, 8*3(%rsp) + + ################################# t0 = a->re + a->im + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + + add 8*6($a_ptr), @acc[0] + adc 8*7($a_ptr), @acc[1] + adc 8*8($a_ptr), @acc[2] + adc 8*9($a_ptr), @acc[3] + adc 8*10($a_ptr), @acc[4] + adc 8*11($a_ptr), @acc[5] + + mov @acc[0], 32+8*0(%rsp) + mov @acc[1], 32+8*1(%rsp) + mov @acc[2], 32+8*2(%rsp) + mov @acc[3], 32+8*3(%rsp) + mov @acc[4], 32+8*4(%rsp) + mov @acc[5], 32+8*5(%rsp) + + ################################# t1 = b->re + b->im + mov 8*0($b_org), @acc[0] + mov 8*1($b_org), @acc[1] + mov 8*2($b_org), @acc[2] + mov 8*3($b_org), @acc[3] + mov 8*4($b_org), @acc[4] + mov 8*5($b_org), @acc[5] + + add 8*6($b_org), @acc[0] + adc 8*7($b_org), @acc[1] + adc 8*8($b_org), @acc[2] + adc 8*9($b_org), @acc[3] + adc 8*10($b_org), @acc[4] + adc 8*11($b_org), @acc[5] + + mov @acc[0], 32+8*6(%rsp) + mov @acc[1], 32+8*7(%rsp) + mov @acc[2], 32+8*8(%rsp) + mov @acc[3], 32+8*9(%rsp) + mov @acc[4], 32+8*10(%rsp) + mov @acc[5], 32+8*11(%rsp) + + ################################# mul_384(ret->im, t0, t1); + lea 32+8*0(%rsp), $a_ptr # t0 + lea 32+8*6(%rsp), $b_ptr # t1 + call __mulx_384 + + ################################# mul_384(ret->re, a->re, b->re); + mov 8*0(%rsp), $a_ptr + mov 8*1(%rsp), $b_ptr + lea -96($r_ptr), $r_ptr # ret->re + call __mulx_384 + + ################################# mul_384(tx, a->im, b->im); + lea 48+128($a_ptr), $a_ptr + lea 48($b_ptr), $b_ptr + lea 32(%rsp), $r_ptr + call __mulx_384 + + ################################# ret->im -= tx + mov 8*2(%rsp), $a_ptr # restore ret->im + lea 32(%rsp), $b_org + mov 8*3(%rsp), $n_ptr + mov $a_ptr, $r_ptr + call __sub_mod_384x384 + + ################################# ret->im -= ret->re + lea 0($r_ptr), $a_ptr + lea -96($r_ptr), $b_org + call __sub_mod_384x384 + + ################################# ret->re -= tx + lea -96($r_ptr), $a_ptr + lea 32(%rsp), $b_org + lea -96($r_ptr), $r_ptr + call __sub_mod_384x384 + + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size mulx_382x,.-mulx_382x +___ +} +{ my @acc=(@acc,"%rax","%rbx","%rbp",$b_org); # all registers are affected + # except for $n_ptr and $r_ptr +$code.=<<___; +.globl sqrx_382x +.hidden sqrx_382x +.type sqrx_382x,\@function,3,"unwind" +.align 32 +sqrx_382x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $a_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $b_org, $n_ptr + + ################################# t0 = a->re + a->im + mov 8*0($a_ptr), @acc[6] + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[8] + mov 8*3($a_ptr), @acc[9] + mov 8*4($a_ptr), @acc[10] + mov 8*5($a_ptr), @acc[11] + + mov @acc[6], @acc[0] + add 8*6($a_ptr), @acc[6] + mov @acc[7], @acc[1] + adc 8*7($a_ptr), @acc[7] + 
mov @acc[8], @acc[2] + adc 8*8($a_ptr), @acc[8] + mov @acc[9], @acc[3] + adc 8*9($a_ptr), @acc[9] + mov @acc[10], @acc[4] + adc 8*10($a_ptr), @acc[10] + mov @acc[11], @acc[5] + adc 8*11($a_ptr), @acc[11] + + mov @acc[6], 8*0($r_ptr) + mov @acc[7], 8*1($r_ptr) + mov @acc[8], 8*2($r_ptr) + mov @acc[9], 8*3($r_ptr) + mov @acc[10], 8*4($r_ptr) + mov @acc[11], 8*5($r_ptr) + + ################################# t1 = a->re - a->im + lea 48($a_ptr), $b_org + lea 48($r_ptr), $r_ptr + call __sub_mod_384_a_is_loaded + + ################################# mul_384(ret->re, t0, t1); + lea ($r_ptr), $a_ptr + lea -48($r_ptr), $b_ptr + lea -48($r_ptr), $r_ptr + call __mulx_384 + + ################################# mul_384(ret->im, a->re, a->im); + mov (%rsp), $a_ptr + lea 48($a_ptr), $b_ptr + lea 96($r_ptr), $r_ptr + call __mulx_384 + + mov 8*0($r_ptr), @acc[0] # double ret->im + mov 8*1($r_ptr), @acc[1] + mov 8*2($r_ptr), @acc[2] + mov 8*3($r_ptr), @acc[3] + mov 8*4($r_ptr), @acc[4] + mov 8*5($r_ptr), @acc[5] + mov 8*6($r_ptr), @acc[6] + mov 8*7($r_ptr), @acc[7] + mov 8*8($r_ptr), @acc[8] + mov 8*9($r_ptr), @acc[9] + mov 8*10($r_ptr), @acc[10] + add @acc[0], @acc[0] + mov 8*11($r_ptr), @acc[11] + adc @acc[1], @acc[1] + mov @acc[0], 8*0($r_ptr) + adc @acc[2], @acc[2] + mov @acc[1], 8*1($r_ptr) + adc @acc[3], @acc[3] + mov @acc[2], 8*2($r_ptr) + adc @acc[4], @acc[4] + mov @acc[3], 8*3($r_ptr) + adc @acc[5], @acc[5] + mov @acc[4], 8*4($r_ptr) + adc @acc[6], @acc[6] + mov @acc[5], 8*5($r_ptr) + adc @acc[7], @acc[7] + mov @acc[6], 8*6($r_ptr) + adc @acc[8], @acc[8] + mov @acc[7], 8*7($r_ptr) + adc @acc[9], @acc[9] + mov @acc[8], 8*8($r_ptr) + adc @acc[10], @acc[10] + mov @acc[9], 8*9($r_ptr) + adc @acc[11], @acc[11] + mov @acc[10], 8*10($r_ptr) + mov @acc[11], 8*11($r_ptr) + + mov 8*1(%rsp),%r15 +.cfi_restore %r15 + mov 8*2(%rsp),%r14 +.cfi_restore %r14 + mov 8*3(%rsp),%r13 +.cfi_restore %r13 + mov 8*4(%rsp),%r12 +.cfi_restore %r12 + mov 8*5(%rsp),%rbx +.cfi_restore %rbx + mov 8*6(%rsp),%rbp +.cfi_restore %rbp + lea 8*7(%rsp),%rsp +.cfi_adjust_cfa_offset -8*7 +.cfi_epilogue + ret +.cfi_endproc +.size sqrx_382x,.-sqrx_382x +___ +} +{ ########################################################## 384-bit mulx +my ($a0, $a1) = @acc[6..7]; +my @acc = @acc[0..5]; +my ($lo, $hi, $zr) = ("%rax", "%rcx", "%rbp"); + +$code.=<<___; +.globl mulx_384 +.hidden mulx_384 +.type mulx_384,\@function,3,"unwind" +.align 32 +mulx_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.cfi_end_prologue + + mov $b_org, $b_ptr # evacuate from %rdx + call __mulx_384 + + mov 0(%rsp),%r15 +.cfi_restore %r15 + mov 8(%rsp),%r14 +.cfi_restore %r14 + mov 16(%rsp),%r13 +.cfi_restore %r13 + mov 24(%rsp),%r12 +.cfi_restore %r12 + mov 32(%rsp),%rbx +.cfi_restore %rbx + mov 40(%rsp),%rbp +.cfi_restore %rbp + lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.cfi_epilogue + ret +.cfi_endproc +.size mulx_384,.-mulx_384 + +.type __mulx_384,\@abi-omnipotent +.align 32 +__mulx_384: + mov 8*0($b_ptr), %rdx + mov 8*0($a_ptr), $a0 + mov 8*1($a_ptr), $a1 + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + lea -128($a_ptr), $a_ptr + + mulx $a0, @acc[1], $hi + xor $zr, $zr + + mulx $a1, @acc[0], $lo + adcx $hi, @acc[0] + mov @acc[1], 8*0($r_ptr) + + mulx @acc[2], @acc[1], $hi + adcx $lo, @acc[1] + + mulx @acc[3], @acc[2], $lo + adcx $hi, @acc[2] + + mulx @acc[4], @acc[3], 
$hi + adcx $lo, @acc[3] + + mulx @acc[5], @acc[4], @acc[5] + mov 8*1($b_ptr), %rdx + adcx $hi, @acc[4] + adcx $zr, @acc[5] +___ +for(my $i=1; $i<6; $i++) { +my $b_next = $i<5 ? 8*($i+1)."($b_ptr)" : "%rax"; +$code.=<<___; + mulx $a0, $lo, $hi + adcx @acc[0], $lo + adox $hi, @acc[1] + mov $lo, 8*$i($r_ptr) + + mulx $a1, @acc[0], $hi + adcx @acc[1], $acc[0] + adox $hi, @acc[2] + + mulx 128+8*2($a_ptr), @acc[1], $lo + adcx @acc[2], @acc[1] + adox $lo, @acc[3] + + mulx 128+8*3($a_ptr), @acc[2], $hi + adcx @acc[3], @acc[2] + adox $hi, @acc[4] + + mulx 128+8*4($a_ptr), @acc[3], $lo + adcx @acc[4], @acc[3] + adox @acc[5], $lo + + mulx 128+8*5($a_ptr), @acc[4], @acc[5] + mov $b_next, %rdx + adcx $lo, @acc[4] + adox $zr, @acc[5] + adcx $zr, @acc[5] +___ +} +$code.=<<___; + mov @acc[0], 8*6($r_ptr) + mov @acc[1], 8*7($r_ptr) + mov @acc[2], 8*8($r_ptr) + mov @acc[3], 8*9($r_ptr) + mov @acc[4], 8*10($r_ptr) + mov @acc[5], 8*11($r_ptr) + + ret +.size __mulx_384,.-__mulx_384 +___ +} +{ ########################################################## 384-bit sqrx +$code.=<<___; +.globl sqrx_384 +.hidden sqrx_384 +.type sqrx_384,\@function,2,"unwind" +.align 32 +sqrx_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $r_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + call __sqrx_384 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size sqrx_384,.-sqrx_384 +___ +if (0) { +# up to 5% slower than below variant +my @acc=map("%r$_",("no",8..15,"cx","bx")); + push(@acc, $a_ptr); +my ($lo, $hi, $carry)=("%rax", "%rbp", "%rno"); + +$code.=<<___; +.type __sqrx_384,\@abi-omnipotent +.align 32 +__sqrx_384: + mov 8*0($a_ptr), %rdx + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[8] + mov 8*3($a_ptr), @acc[9] + mov 8*4($a_ptr), @acc[10] + + ######################################### + mulx @acc[7], @acc[1], $lo # a[1]*a[0] + mov 8*5($a_ptr), @acc[11] + mulx @acc[8], @acc[2], $hi # a[2]*a[0] + add $lo, @acc[2] + mulx @acc[9], @acc[3], $lo # a[3]*a[0] + adc $hi, @acc[3] + mulx @acc[10], @acc[4], $hi # a[4]*a[0] + adc $lo, @acc[4] + mulx @acc[11], @acc[5], @acc[6] # a[5]*a[0] + adc $hi, @acc[5] + adc \$0, @acc[6] + + mulx %rdx, $lo, $hi # a[0]*a[0] + mov @acc[7], %rdx + xor @acc[7], @acc[7] + add @acc[1], @acc[1] # double acc[1] + adc \$0, @acc[7] + add $hi, @acc[1] + adc \$0, @acc[7] + mov $lo, 8*0($r_ptr) + mov @acc[1], 8*1($r_ptr) +___ +($carry, @acc[7]) = (@acc[7], @acc[1]); +$code.=<<___; + ######################################### + xor @acc[7], @acc[7] + mulx @acc[8], $lo, $hi # a[2]*a[1] + adcx $lo, @acc[3] + adox $hi, @acc[4] + + mulx @acc[9], $lo, $hi # a[3]*a[1] + adcx $lo, @acc[4] + adox $hi, @acc[5] + + mulx @acc[10], $lo, $hi # a[4]*a[1] + adcx $lo, @acc[5] + adox $hi, @acc[6] + + mulx @acc[11], $lo, $hi # a[5]*a[1] + adcx $lo, @acc[6] + adox @acc[7], $hi + adcx $hi, @acc[7] + + mulx %rdx, $lo, $hi # a[1]*a[1] + mov @acc[8], %rdx + xor @acc[8], @acc[8] + adox @acc[2], @acc[2] # double acc[2:3] + adcx $carry, $lo # can't carry + adox @acc[3], @acc[3] + adcx $lo, @acc[2] + adox @acc[8], @acc[8] + adcx $hi, @acc[3] + adc \$0, @acc[8] + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) +___ 
+($carry,@acc[8])=(@acc[8],$carry); +$code.=<<___; + ######################################### + xor @acc[8], @acc[8] + mulx @acc[9], $lo, $hi # a[3]*a[2] + adcx $lo, @acc[5] + adox $hi, @acc[6] + + mulx @acc[10], $lo, $hi # a[4]*a[2] + adcx $lo, @acc[6] + adox $hi, @acc[7] + + mulx @acc[11], $lo, $hi # a[5]*a[2] + adcx $lo, @acc[7] + adox @acc[8], $hi + adcx $hi, @acc[8] + + mulx %rdx, $lo, $hi # a[2]*a[2] + mov @acc[9], %rdx + xor @acc[9], @acc[9] + adox @acc[4], @acc[4] # double acc[4:5] + adcx $carry, $lo # can't carry + adox @acc[5], @acc[5] + adcx $lo, @acc[4] + adox @acc[9], @acc[9] + adcx $hi, @acc[5] + adc \$0, $acc[9] + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) +___ +($carry,@acc[9])=(@acc[9],$carry); +$code.=<<___; + ######################################### + xor @acc[9], @acc[9] + mulx @acc[10], $lo, $hi # a[4]*a[3] + adcx $lo, @acc[7] + adox $hi, @acc[8] + + mulx @acc[11], $lo, $hi # a[5]*a[3] + adcx $lo, @acc[8] + adox @acc[9], $hi + adcx $hi, @acc[9] + + mulx %rdx, $lo, $hi + mov @acc[10], %rdx + xor @acc[10], @acc[10] + adox @acc[6], @acc[6] # double acc[6:7] + adcx $carry, $lo # can't carry + adox @acc[7], @acc[7] + adcx $lo, @acc[6] + adox @acc[10], @acc[10] + adcx $hi, @acc[7] + adc \$0, $acc[10] + mov @acc[6], 8*6($r_ptr) + mov @acc[7], 8*7($r_ptr) +___ +($carry,@acc[10])=(@acc[10],$carry); +$code.=<<___; + ######################################### + mulx @acc[11], $lo, @acc[10] # a[5]*a[4] + add $lo, @acc[9] + adc \$0, @acc[10] + + mulx %rdx, $lo, $hi # a[4]*a[4] + mov @acc[11], %rdx + xor @acc[11], @acc[11] + adox @acc[8], @acc[8] # double acc[8:10] + adcx $carry, $lo # can't carry + adox @acc[9], @acc[9] + adcx $lo, @acc[8] + adox @acc[10], @acc[10] + adcx $hi, @acc[9] + adox @acc[11], @acc[11] + mov @acc[8], 8*8($r_ptr) + mov @acc[9], 8*9($r_ptr) + + ######################################### + mulx %rdx, $lo, $hi # a[5]*a[5] + adcx $lo, @acc[10] + adcx $hi, @acc[11] + + mov @acc[10], 8*10($r_ptr) + mov @acc[11], 8*11($r_ptr) + + ret +.size __sqrx_384,.-__sqrx_384 +___ +} else { +my @acc=map("%r$_",("no",8..15,"cx","bx","bp")); +my ($lo, $hi)=($r_ptr, "%rax"); + +$code.=<<___; +.type __sqrx_384,\@abi-omnipotent +.align 32 +__sqrx_384: + mov 8*0($a_ptr), %rdx + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[8] + mov 8*3($a_ptr), @acc[9] + mov 8*4($a_ptr), @acc[10] + movq $r_ptr, %xmm0 + + ######################################### + mulx @acc[7], @acc[1], $lo # a[1]*a[0] + mov 8*5($a_ptr), @acc[11] + mulx @acc[8], @acc[2], $hi # a[2]*a[0] + add $lo, @acc[2] + mulx @acc[9], @acc[3], $lo # a[3]*a[0] + adc $hi, @acc[3] + mulx @acc[10], @acc[4], $hi # a[4]*a[0] + adc $lo, @acc[4] + mulx @acc[11], @acc[5], @acc[6] # a[5]*a[0] + mov @acc[7], %rdx + adc $hi, @acc[5] + adc \$0, @acc[6] + + ######################################### + xor @acc[7], @acc[7] + mulx @acc[8], $lo, $hi # a[2]*a[1] + adcx $lo, @acc[3] + adox $hi, @acc[4] + + mulx @acc[9], $lo, $hi # a[3]*a[1] + adcx $lo, @acc[4] + adox $hi, @acc[5] + + mulx @acc[10], $lo, $hi # a[4]*a[1] + adcx $lo, @acc[5] + adox $hi, @acc[6] + + mulx @acc[11], $lo, $hi # a[5]*a[1] + mov @acc[8], %rdx + adcx $lo, @acc[6] + adox @acc[7], $hi + adcx $hi, @acc[7] + + ######################################### + xor @acc[8], @acc[8] + mulx @acc[9], $lo, $hi # a[3]*a[2] + adcx $lo, @acc[5] + adox $hi, @acc[6] + + mulx @acc[10], $lo, $hi # a[4]*a[2] + adcx $lo, @acc[6] + adox $hi, @acc[7] + + mulx @acc[11], $lo, $hi # a[5]*a[2] + mov @acc[9], %rdx + adcx $lo, @acc[7] + adox @acc[8], $hi + adcx $hi, @acc[8] + + 
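+
+	# Editorial note, not part of the original submission: these blocks
+	# accumulate the off-diagonal products a[i]*a[j], i < j.  They are
+	# doubled afterwards and the diagonal terms are folded in, per
+	#
+	#	a^2 = sum_i a[i]^2 * 2^(128*i)
+	#	    + 2 * sum_{i<j} a[i]*a[j] * 2^(64*(i+j))
+	#
+	# which is why only the upper triangle of partial products is formed
+	# here.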
######################################### + xor @acc[9], @acc[9] + mulx @acc[10], $lo, $hi # a[4]*a[3] + adcx $lo, @acc[7] + adox $hi, @acc[8] + + mulx @acc[11], $lo, $hi # a[5]*a[3] + mov @acc[10], %rdx + adcx $lo, @acc[8] + adox @acc[9], $hi + adcx $hi, @acc[9] + + ######################################### + mulx @acc[11], $lo, @acc[10] # a[5]*a[4] + mov 8*0($a_ptr), %rdx + add $lo, @acc[9] + movq %xmm0, $r_ptr # restore $r_ptr + adc \$0, @acc[10] + + ######################################### double acc[1:10] + xor @acc[11], @acc[11] + adcx @acc[1], @acc[1] + adcx @acc[2], @acc[2] + adcx @acc[3], @acc[3] + adcx @acc[4], @acc[4] + adcx @acc[5], @acc[5] + + ######################################### accumulate a[i]*a[i] + mulx %rdx, %rdx, $hi # a[0]*a[0] + mov %rdx, 8*0($r_ptr) + mov 8*1($a_ptr), %rdx + adox $hi, @acc[1] + mov @acc[1], 8*1($r_ptr) + + mulx %rdx, @acc[1], $hi # a[1]*a[1] + mov 8*2($a_ptr), %rdx + adox @acc[1], @acc[2] + adox $hi, @acc[3] + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + + mulx %rdx, @acc[1], @acc[2] # a[2]*a[2] + mov 8*3($a_ptr), %rdx + adox @acc[1], @acc[4] + adox @acc[2], @acc[5] + adcx @acc[6], @acc[6] + adcx @acc[7], @acc[7] + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + mulx %rdx, @acc[1], @acc[2] # a[3]*a[3] + mov 8*4($a_ptr), %rdx + adox @acc[1], @acc[6] + adox @acc[2], @acc[7] + adcx @acc[8], @acc[8] + adcx @acc[9], @acc[9] + mov @acc[6], 8*6($r_ptr) + mov @acc[7], 8*7($r_ptr) + + mulx %rdx, @acc[1], @acc[2] # a[4]*a[4] + mov 8*5($a_ptr), %rdx + adox @acc[1], @acc[8] + adox @acc[2], @acc[9] + adcx @acc[10], @acc[10] + adcx @acc[11], @acc[11] + mov @acc[8], 8*8($r_ptr) + mov @acc[9], 8*9($r_ptr) + + mulx %rdx, @acc[1], @acc[2] # a[5]*a[5] + adox @acc[1], @acc[10] + adox @acc[2], @acc[11] + + mov @acc[10], 8*10($r_ptr) + mov @acc[11], 8*11($r_ptr) + + ret +.size __sqrx_384,.-__sqrx_384 +___ +} + +{ ########################################################## 384-bit redcx_mont +my ($n_ptr, $n0)=($b_ptr, $n_ptr); # arguments are "shifted" +my ($lo, $hi) = ("%rax", "%rbp"); + +$code.=<<___; +######################################################################## +# void redcx_mont_384(uint64_t ret[6], const uint64_t a[12], +# uint64_t m[6], uint64_t n0); +.globl redcx_mont_384 +.hidden redcx_mont_384 +.type redcx_mont_384,\@function,4,"unwind" +.align 32 +redcx_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $b_org, $n_ptr + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size redcx_mont_384,.-redcx_mont_384 + +######################################################################## +# void fromx_mont_384(uint64_t ret[6], const uint64_t a[6], +# uint64_t m[6], uint64_t n0); +.globl fromx_mont_384 +.hidden fromx_mont_384 +.type fromx_mont_384,\@function,4,"unwind" +.align 32 +fromx_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp 
+.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $b_org, $n_ptr + call __mulx_by_1_mont_384 + + ################################# + # Branch-less conditional acc[0:6] - modulus + + mov @acc[6], %rax + mov @acc[7], %rcx + mov @acc[0], %rdx + mov @acc[1], %rbp + + sub 8*0($n_ptr), @acc[6] + sbb 8*1($n_ptr), @acc[7] + mov @acc[2], @acc[5] + sbb 8*2($n_ptr), @acc[0] + sbb 8*3($n_ptr), @acc[1] + sbb 8*4($n_ptr), @acc[2] + mov @acc[3], $a_ptr + sbb 8*5($n_ptr), @acc[3] + + cmovc %rax, @acc[6] + cmovc %rcx, @acc[7] + cmovc %rdx, @acc[0] + mov @acc[6], 8*0($r_ptr) + cmovc %rbp, @acc[1] + mov @acc[7], 8*1($r_ptr) + cmovc @acc[5], @acc[2] + mov @acc[0], 8*2($r_ptr) + cmovc $a_ptr, @acc[3] + mov @acc[1], 8*3($r_ptr) + mov @acc[2], 8*4($r_ptr) + mov @acc[3], 8*5($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size fromx_mont_384,.-fromx_mont_384 +___ +{ my @acc=@acc; # will be rotated locally + +$code.=<<___; +.type __mulx_by_1_mont_384,\@abi-omnipotent +.align 32 +__mulx_by_1_mont_384: + mov 8*0($a_ptr), @acc[0] + mov $n0, %rdx + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] +___ +for (my $i=0; $i<6; $i++) { +$code.=<<___; + imulq @acc[0], %rdx + + ################################# reduction $i + xor @acc[6], @acc[6] # @acc[6]=0, cf=0, of=0 + mulx 8*0($n_ptr), $lo, $hi + adcx $lo, @acc[0] # guaranteed to be zero + adox $hi, @acc[1] + + mulx 8*1($n_ptr), $lo, $hi + adcx $lo, @acc[1] + adox $hi, @acc[2] + + mulx 8*2($n_ptr), $lo, $hi + adcx $lo, @acc[2] + adox $hi, @acc[3] + + mulx 8*3($n_ptr), $lo, $hi + adcx $lo, @acc[3] + adox $hi, @acc[4] + + mulx 8*4($n_ptr), $lo, $hi + adcx $lo, @acc[4] + adox $hi, @acc[5] + + mulx 8*5($n_ptr), $lo, $hi + mov $n0, %rdx + adcx $lo, @acc[5] + adox @acc[6], $hi + adcx $hi, @acc[6] +___ + push(@acc,shift(@acc)); +} +$code.=<<___; + ret +.size __mulx_by_1_mont_384,.-__mulx_by_1_mont_384 + +.type __redc_tail_mont_384,\@abi-omnipotent +.align 32 +__redc_tail_mont_384: + add 8*6($a_ptr), @acc[0] # accumulate upper half + mov @acc[0], %rax + adc 8*7($a_ptr), @acc[1] + adc 8*8($a_ptr), @acc[2] + adc 8*9($a_ptr), @acc[3] + mov @acc[1], %rcx + adc 8*10($a_ptr), @acc[4] + adc 8*11($a_ptr), @acc[5] + sbb @acc[6], @acc[6] + + ################################# + # Branch-less conditional acc[0:6] - modulus + + mov @acc[2], %rdx + mov @acc[3], %rbp + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + mov @acc[4], @acc[7] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], $a_ptr + sbb 8*5($n_ptr), @acc[5] + sbb \$0, @acc[6] + + cmovc %rax, @acc[0] + cmovc %rcx, @acc[1] + cmovc %rdx, @acc[2] + mov @acc[0], 8*0($r_ptr) + cmovc %rbp, @acc[3] + mov @acc[1], 8*1($r_ptr) + cmovc @acc[7], @acc[4] + mov @acc[2], 8*2($r_ptr) + cmovc $a_ptr, @acc[5] + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + ret +.size __redc_tail_mont_384,.-__redc_tail_mont_384 + +.globl sgn0x_pty_mont_384 +.hidden sgn0x_pty_mont_384 +.type sgn0x_pty_mont_384,\@function,3,"unwind" +.align 32 +sgn0x_pty_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + 
push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $a_ptr, $n_ptr + lea 0($r_ptr), $a_ptr + mov $b_org, $n0 + call __mulx_by_1_mont_384 + + xor %rax, %rax + mov @acc[0], @acc[7] + add @acc[0], @acc[0] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + adc \$0, %rax + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, %rax + + not %rax # 2*x > p, which means "negative" + and \$1, @acc[7] + and \$2, %rax + or @acc[7], %rax # pack sign and parity + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size sgn0x_pty_mont_384,.-sgn0x_pty_mont_384 + +.globl sgn0x_pty_mont_384x +.hidden sgn0x_pty_mont_384x +.type sgn0x_pty_mont_384x,\@function,3,"unwind" +.align 32 +sgn0x_pty_mont_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $a_ptr, $n_ptr + lea 48($r_ptr), $a_ptr # sgn0(a->im) + mov $b_org, $n0 + call __mulx_by_1_mont_384 + + mov @acc[0], @acc[6] + or @acc[1], @acc[0] + or @acc[2], @acc[0] + or @acc[3], @acc[0] + or @acc[4], @acc[0] + or @acc[5], @acc[0] + + lea 0($r_ptr), $a_ptr # sgn0(a->re) + xor $r_ptr, $r_ptr + mov @acc[6], @acc[7] + add @acc[6], @acc[6] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + adc \$0, $r_ptr + + sub 8*0($n_ptr), @acc[6] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, $r_ptr + + mov @acc[0], 0(%rsp) # a->im is zero or not + not $r_ptr # 2*x > p, which means "negative" + and \$1, @acc[7] + and \$2, $r_ptr + or @acc[7], $r_ptr # pack sign and parity + + call __mulx_by_1_mont_384 + + mov @acc[0], @acc[6] + or @acc[1], @acc[0] + or @acc[2], @acc[0] + or @acc[3], @acc[0] + or @acc[4], @acc[0] + or @acc[5], @acc[0] + + xor %rax, %rax + mov @acc[6], @acc[7] + add @acc[6], @acc[6] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + adc \$0, %rax + + sub 8*0($n_ptr), @acc[6] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, %rax + + mov 0(%rsp), @acc[6] + + not %rax # 2*x > p, which means "negative" + + test @acc[0], @acc[0] + cmovz $r_ptr, @acc[7] # a->re==0? prty(a->im) : prty(a->re) + + test @acc[6], @acc[6] + cmovnz $r_ptr, %rax # a->im!=0? 
sgn0(a->im) : sgn0(a->re) + + and \$1, @acc[7] + and \$2, %rax + or @acc[7], %rax # pack sign and parity + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size sgn0x_pty_mont_384x,.-sgn0x_pty_mont_384x +___ +} } + +{ ########################################################## mulx/sqrx_mont +my @acc = (@acc, "%rax"); +my ($lo,$hi)=("%rdi","%rbp"); + +$code.=<<___; +.globl mulx_mont_384 +.hidden mulx_mont_384 +.type mulx_mont_384,\@function,5,"unwind" +.align 32 +mulx_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + lea -8*3(%rsp), %rsp +.cfi_adjust_cfa_offset 8*3 +.cfi_end_prologue + + mov $b_org, $b_ptr # evacuate from %rdx + mov 8*0($b_org), %rdx + mov 8*0($a_ptr), @acc[6] + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[8] + mov 8*3($a_ptr), @acc[4] + movq $r_ptr, %xmm0 + mov 8*4($a_ptr), $lo + mov 8*5($a_ptr), $hi + lea -128($a_ptr), $a_ptr # control u-op density + lea -128($n_ptr), $n_ptr # control u-op density + mov $n0, (%rsp) + + mulx @acc[6],@acc[0],@acc[1] # a[0]*b[0] + call __mulx_mont_384 + + mov 8*3(%rsp),%r15 +.cfi_restore %r15 + mov 8*4(%rsp),%r14 +.cfi_restore %r14 + mov 8*5(%rsp),%r13 +.cfi_restore %r13 + mov 8*6(%rsp),%r12 +.cfi_restore %r12 + mov 8*7(%rsp),%rbx +.cfi_restore %rbx + mov 8*8(%rsp),%rbp +.cfi_restore %rbp + lea 8*9(%rsp),%rsp +.cfi_adjust_cfa_offset -8*9 +.cfi_epilogue + ret +.cfi_endproc +.size mulx_mont_384,.-mulx_mont_384 +___ +{ my @acc=@acc; # will be rotated locally + +$code.=<<___; +.type __mulx_mont_384,\@abi-omnipotent +.align 32 +__mulx_mont_384: +.cfi_startproc + mulx @acc[7], @acc[6], @acc[2] + mulx @acc[8], @acc[7], @acc[3] + add @acc[6], @acc[1] + mulx @acc[4], @acc[8], @acc[4] + adc @acc[7], @acc[2] + mulx $lo, $lo, @acc[5] + adc @acc[8], @acc[3] + mulx $hi, $hi, @acc[6] + mov 8($b_ptr), %rdx + adc $lo, @acc[4] + adc $hi, @acc[5] + adc \$0, @acc[6] + xor @acc[7], @acc[7] + +___ +for (my $i=1; $i<6; $i++) { +my $tt = $i==1 ? @acc[7] : $hi; +my $b_next = $i<5 ? 
8*($i+1)."($b_ptr)" : @acc[1]; +$code.=<<___; + mov @acc[0], 16(%rsp) + imulq 8(%rsp), @acc[0] + + ################################# Multiply by b[$i] + xor @acc[8], @acc[8] # @acc[8]=0, cf=0, of=0 + mulx 8*0+128($a_ptr), $lo, $hi + adox $lo, @acc[1] + adcx $hi, @acc[2] + + mulx 8*1+128($a_ptr), $lo, $hi + adox $lo, @acc[2] + adcx $hi, @acc[3] + + mulx 8*2+128($a_ptr), $lo, $hi + adox $lo, @acc[3] + adcx $hi, @acc[4] + + mulx 8*3+128($a_ptr), $lo, $hi + adox $lo, @acc[4] + adcx $hi, @acc[5] + + mulx 8*4+128($a_ptr), $lo, $hi + adox $lo, @acc[5] + adcx $hi, @acc[6] + + mulx 8*5+128($a_ptr), $lo, $hi + mov @acc[0], %rdx + adox $lo, @acc[6] + adcx $hi, @acc[7] # cf=0 + adox @acc[8], @acc[7] + adox @acc[8], @acc[8] + + ################################# reduction + xor @acc[0], @acc[0] # acc[0]=0, cf=0, of=0 + mulx 8*0+128($n_ptr), $lo, $hi + adcx 16(%rsp), $lo # guaranteed to be zero + adox $hi, @acc[1] + + mulx 8*1+128($n_ptr), $lo, $hi + adcx $lo, @acc[1] + adox $hi, @acc[2] + + mulx 8*2+128($n_ptr), $lo, $hi + adcx $lo, @acc[2] + adox $hi, @acc[3] + + mulx 8*3+128($n_ptr), $lo, $hi + adcx $lo, @acc[3] + adox $hi, @acc[4] + + mulx 8*4+128($n_ptr), $lo, $hi + adcx $lo, @acc[4] + adox $hi, @acc[5] + + mulx 8*5+128($n_ptr), $lo, $hi + mov $b_next, %rdx + adcx $lo, @acc[5] + adox $hi, @acc[6] + adcx @acc[0], @acc[6] + adox @acc[0], @acc[7] + adcx @acc[0], @acc[7] + adox @acc[0], @acc[8] + adcx @acc[0], @acc[8] +___ + push(@acc,shift(@acc)); +} +$code.=<<___; + imulq 8(%rsp), %rdx + movq %xmm0, $b_ptr # restore $r_ptr + + ################################# last reduction + xor @acc[8], @acc[8] # @acc[8]=0, cf=0, of=0 + mulx 8*0+128($n_ptr), $lo, $hi + adcx $lo, @acc[0] # guaranteed to be zero + adox $hi, @acc[1] + + mulx 8*1+128($n_ptr), $lo, $hi + adcx $lo, @acc[1] + adox $hi, @acc[2] + + mulx 8*2+128($n_ptr), $lo, $hi + adcx $lo, @acc[2] + adox $hi, @acc[3] + + mulx 8*3+128($n_ptr), $lo, $hi + adcx $lo, @acc[3] + adox $hi, @acc[4] + mov @acc[2], @acc[0] + + mulx 8*4+128($n_ptr), $lo, $hi + adcx $lo, @acc[4] + adox $hi, @acc[5] + mov @acc[3], $a_ptr + + mulx 8*5+128($n_ptr), $lo, $hi + adcx $lo, @acc[5] + adox $hi, @acc[6] + mov @acc[1], %rdx + adcx @acc[8], @acc[6] + adox @acc[8], @acc[7] + lea 128($n_ptr), $n_ptr + mov @acc[4], @acc[8] + adc \$0, @acc[7] + + ################################# + # Branch-less conditional acc[1:7] - modulus + + sub 8*0($n_ptr), @acc[1] + sbb 8*1($n_ptr), @acc[2] + mov @acc[5], $lo + sbb 8*2($n_ptr), @acc[3] + sbb 8*3($n_ptr), @acc[4] + sbb 8*4($n_ptr), @acc[5] + mov @acc[6], $hi + sbb 8*5($n_ptr), @acc[6] + sbb \$0, @acc[7] + + cmovnc @acc[1], %rdx + cmovc @acc[0], @acc[2] + cmovc $a_ptr, @acc[3] + cmovnc @acc[4], @acc[8] + mov %rdx, 8*0($b_ptr) + cmovnc @acc[5], $lo + mov @acc[2], 8*1($b_ptr) + cmovnc @acc[6], $hi + mov @acc[3], 8*2($b_ptr) + mov @acc[8], 8*3($b_ptr) + mov $lo, 8*4($b_ptr) + mov $hi, 8*5($b_ptr) + + ret +.cfi_endproc +.size __mulx_mont_384,.-__mulx_mont_384 +___ +} +$code.=<<___; +.globl sqrx_mont_384 +.hidden sqrx_mont_384 +.type sqrx_mont_384,\@function,4,"unwind" +.align 32 +sqrx_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + lea -8*3(%rsp), %rsp +.cfi_adjust_cfa_offset 8*3 +.cfi_end_prologue + + mov $n_ptr, $n0 # n0 + lea -128($b_org), $n_ptr # control u-op density + mov 8*0($a_ptr), %rdx + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[8] + mov 8*3($a_ptr), @acc[4] + movq $r_ptr, %xmm0 + mov 
8*4($a_ptr), $lo + mov 8*5($a_ptr), $hi + + lea ($a_ptr), $b_ptr + mov $n0, (%rsp) # n0 + lea -128($a_ptr), $a_ptr # control u-op density + + mulx %rdx, @acc[0], @acc[1] # a[0]*a[0] + call __mulx_mont_384 # as fast as dedicated squaring + + mov 8*3(%rsp),%r15 +.cfi_restore %r15 + mov 8*4(%rsp),%r14 +.cfi_restore %r14 + mov 8*5(%rsp),%r13 +.cfi_restore %r13 + mov 8*6(%rsp),%r12 +.cfi_restore %r12 + mov 8*7(%rsp),%rbx +.cfi_restore %rbx + mov 8*8(%rsp),%rbp +.cfi_restore %rbp + lea 8*9(%rsp),%rsp +.cfi_adjust_cfa_offset -8*9 +.cfi_epilogue + ret +.cfi_endproc +.size sqrx_mont_384,.-sqrx_mont_384 + +.globl sqrx_n_mul_mont_384 +.hidden sqrx_n_mul_mont_384 +.type sqrx_n_mul_mont_384,\@function,6,"unwind" +.align 32 +sqrx_n_mul_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + lea -8*3(%rsp), %rsp +.cfi_adjust_cfa_offset 8*3 +.cfi_end_prologue + + mov $b_org, @acc[2] # loop counter + mov 8*0($a_ptr), %rdx + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[8] + mov $a_ptr, $b_ptr + mov 8*3($a_ptr), @acc[4] + movq $r_ptr, %xmm0 # to __mulx_mont_384 + mov 8*4($a_ptr), $lo + mov 8*5($a_ptr), $hi + + mov $n0, (%rsp) + mov %r9, 16(%rsp) # 6th, multiplicand argument + movq 8*0(%r9), %xmm2 # prefetch b[0] + +.Loop_sqrx_384: + movd @acc[2]d, %xmm1 + lea -128($b_ptr), $a_ptr # control u-op density + lea -128($n_ptr), $n_ptr # control u-op density + + mulx %rdx, @acc[0], @acc[1] # a[0]*a[0] + call __mulx_mont_384 + + movd %xmm1, @acc[2]d + dec @acc[2]d + jnz .Loop_sqrx_384 + + mov %rdx, @acc[6] + movq %xmm2, %rdx # b[0] + lea -128($b_ptr), $a_ptr # control u-op density + mov 16(%rsp), $b_ptr # 6th, multiplicand argument + lea -128($n_ptr), $n_ptr # control u-op density + + mulx @acc[6],@acc[0],@acc[1] # a[0]*b[0] + call __mulx_mont_384 + + mov 8*3(%rsp),%r15 +.cfi_restore %r15 + mov 8*4(%rsp),%r14 +.cfi_restore %r14 + mov 8*5(%rsp),%r13 +.cfi_restore %r13 + mov 8*6(%rsp),%r12 +.cfi_restore %r12 + mov 8*7(%rsp),%rbx +.cfi_restore %rbx + mov 8*8(%rsp),%rbp +.cfi_restore %rbp + lea 8*9(%rsp),%rsp +.cfi_adjust_cfa_offset -8*9 +.cfi_epilogue + ret +.cfi_endproc +.size sqrx_n_mul_mont_384,.-sqrx_n_mul_mont_384 + +.globl sqrx_n_mul_mont_383 +.hidden sqrx_n_mul_mont_383 +.type sqrx_n_mul_mont_383,\@function,6,"unwind" +.align 32 +sqrx_n_mul_mont_383: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + lea -8*3(%rsp), %rsp +.cfi_adjust_cfa_offset 8*3 +.cfi_end_prologue + + mov $b_org, @acc[2] # loop counter + mov 8*0($a_ptr), %rdx + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[8] + mov $a_ptr, $b_ptr + mov 8*3($a_ptr), @acc[4] + movq $r_ptr, %xmm0 # to __mulx_mont_383_nonred + mov 8*4($a_ptr), $lo + mov 8*5($a_ptr), $hi + + mov $n0, (%rsp) + mov %r9, 16(%rsp) # 6th, multiplicand argument + movq 8*0(%r9), %xmm2 # prefetch b[0] + lea -128($n_ptr), $n_ptr # control u-op density + +.Loop_sqrx_383: + movd @acc[2]d, %xmm1 + lea -128($b_ptr), $a_ptr # control u-op density + + mulx %rdx, @acc[0], @acc[1] # a[0]*a[0] + call __mulx_mont_383_nonred # omitting full reduction gives ~15% + # in addition-chains + movd %xmm1, @acc[2]d + dec @acc[2]d + jnz .Loop_sqrx_383 + + mov %rdx, @acc[6] + movq %xmm2, %rdx # b[0] + lea -128($b_ptr), $a_ptr # control u-op density + mov 16(%rsp), $b_ptr # 6th, multiplicand argument + + mulx @acc[6], @acc[0], @acc[1] 
# a[0]*b[0] + call __mulx_mont_384 + + mov 8*3(%rsp),%r15 +.cfi_restore %r15 + mov 8*4(%rsp),%r14 +.cfi_restore %r14 + mov 8*5(%rsp),%r13 +.cfi_restore %r13 + mov 8*6(%rsp),%r12 +.cfi_restore %r12 + mov 8*7(%rsp),%rbx +.cfi_restore %rbx + mov 8*8(%rsp),%rbp +.cfi_restore %rbp + lea 8*9(%rsp),%rsp +.cfi_adjust_cfa_offset -8*9 +.cfi_epilogue + ret +.cfi_endproc +.size sqrx_n_mul_mont_383,.-sqrx_n_mul_mont_383 +___ +{ my @acc=@acc; # will be rotated locally + +$code.=<<___; +.type __mulx_mont_383_nonred,\@abi-omnipotent +.align 32 +__mulx_mont_383_nonred: +.cfi_startproc + mulx @acc[7], @acc[6], @acc[2] + mulx @acc[8], @acc[7], @acc[3] + add @acc[6], @acc[1] + mulx @acc[4], @acc[8], @acc[4] + adc @acc[7], @acc[2] + mulx $lo, $lo, @acc[5] + adc @acc[8], @acc[3] + mulx $hi, $hi, @acc[6] + mov 8($b_ptr), %rdx + adc $lo, @acc[4] + adc $hi, @acc[5] + adc \$0, @acc[6] +___ +for (my $i=1; $i<6; $i++) { +my $tt = $i==1 ? @acc[7] : $hi; +my $b_next = $i<5 ? 8*($i+1)."($b_ptr)" : @acc[1]; +$code.=<<___; + mov @acc[0], @acc[8] + imulq 8(%rsp), @acc[0] + + ################################# Multiply by b[$i] + xor @acc[7], @acc[7] # @acc[8]=0, cf=0, of=0 + mulx 8*0+128($a_ptr), $lo, $hi + adox $lo, @acc[1] + adcx $hi, @acc[2] + + mulx 8*1+128($a_ptr), $lo, $hi + adox $lo, @acc[2] + adcx $hi, @acc[3] + + mulx 8*2+128($a_ptr), $lo, $hi + adox $lo, @acc[3] + adcx $hi, @acc[4] + + mulx 8*3+128($a_ptr), $lo, $hi + adox $lo, @acc[4] + adcx $hi, @acc[5] + + mulx 8*4+128($a_ptr), $lo, $hi + adox $lo, @acc[5] + adcx $hi, @acc[6] + + mulx 8*5+128($a_ptr), $lo, $hi + mov @acc[0], %rdx + adox $lo, @acc[6] + adcx @acc[7], $hi + adox $hi, @acc[7] + + ################################# reduction + xor @acc[0], @acc[0] # acc[0]=0, cf=0, of=0 + mulx 8*0+128($n_ptr), $lo, $hi + adcx $lo, @acc[8] # guaranteed to be zero + adox $hi, @acc[1] + + mulx 8*1+128($n_ptr), $lo, $hi + adcx $lo, @acc[1] + adox $hi, @acc[2] + + mulx 8*2+128($n_ptr), $lo, $hi + adcx $lo, @acc[2] + adox $hi, @acc[3] + + mulx 8*3+128($n_ptr), $lo, $hi + adcx $lo, @acc[3] + adox $hi, @acc[4] + + mulx 8*4+128($n_ptr), $lo, $hi + adcx $lo, @acc[4] + adox $hi, @acc[5] + + mulx 8*5+128($n_ptr), $lo, $hi + mov $b_next, %rdx + adcx $lo, @acc[5] + adox $hi, @acc[6] + adcx @acc[8], @acc[6] + adox @acc[8], @acc[7] + adcx @acc[8], @acc[7] +___ + push(@acc,shift(@acc)); +} +$code.=<<___; + imulq 8(%rsp), %rdx + movq %xmm0, $b_ptr # restore $r_ptr + + ################################# last reduction + xor @acc[8], @acc[8] # @acc[8]=0, cf=0, of=0 + mulx 8*0+128($n_ptr), $lo, $hi + adcx $lo, @acc[0] # guaranteed to be zero + adox $hi, @acc[1] + + mulx 8*1+128($n_ptr), $lo, $hi + adcx $lo, @acc[1] + adox $hi, @acc[2] + + mulx 8*2+128($n_ptr), $lo, $hi + adcx $lo, @acc[2] + adox $hi, @acc[3] + + mulx 8*3+128($n_ptr), $lo, $hi + adcx $lo, @acc[3] + adox $hi, @acc[4] + + mulx 8*4+128($n_ptr), $lo, $hi + adcx $lo, @acc[4] + adox $hi, @acc[5] + + mulx 8*5+128($n_ptr), $lo, $hi + mov @acc[1], %rdx + adcx $lo, @acc[5] + adox $hi, @acc[6] + adc \$0, @acc[6] + mov @acc[4], @acc[8] + + mov @acc[1], 8*0($b_ptr) + mov @acc[2], 8*1($b_ptr) + mov @acc[3], 8*2($b_ptr) + mov @acc[5], $lo + mov @acc[4], 8*3($b_ptr) + mov @acc[5], 8*4($b_ptr) + mov @acc[6], 8*5($b_ptr) + mov @acc[6], $hi + + ret +.cfi_endproc +.size __mulx_mont_383_nonred,.-__mulx_mont_383_nonred +___ +} } } +{ my $frame = 4*8 + # place for argument off-load + + 2*384/8 + # place for 2 384-bit temporary vectors + 8; # align +my @acc = (@acc,"%rax","%rdx","%rbx","%rbp"); + +# omitting 3 reductions gives ~10% better 
performance in add-chains +$code.=<<___; +.globl sqrx_mont_382x +.hidden sqrx_mont_382x +.type sqrx_mont_382x,\@function,4,"unwind" +.align 32 +sqrx_mont_382x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + mov $n_ptr, 8*0(%rsp) # n0 + mov $b_org, $n_ptr # n_ptr + mov $a_ptr, 8*2(%rsp) + mov $r_ptr, %xmm0 + + ################################# + mov 8*0($a_ptr), @acc[0] # a->re + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + + mov @acc[0], @acc[6] + add 8*6($a_ptr), @acc[0] # a->re + a->im + mov @acc[1], @acc[7] + adc 8*7($a_ptr), @acc[1] + mov @acc[2], @acc[8] + adc 8*8($a_ptr), @acc[2] + mov @acc[3], @acc[9] + adc 8*9($a_ptr), @acc[3] + mov @acc[4], @acc[10] + adc 8*10($a_ptr), @acc[4] + mov @acc[5], @acc[11] + adc 8*11($a_ptr), @acc[5] + + sub 8*6($a_ptr), @acc[6] # a->re - a->im + sbb 8*7($a_ptr), @acc[7] + sbb 8*8($a_ptr), @acc[8] + sbb 8*9($a_ptr), @acc[9] + sbb 8*10($a_ptr), @acc[10] + sbb 8*11($a_ptr), @acc[11] + sbb $r_ptr, $r_ptr # borrow flag as mask + + mov @acc[0], 32+8*0(%rsp) # t0 + mov @acc[1], 32+8*1(%rsp) + mov @acc[2], 32+8*2(%rsp) + mov @acc[3], 32+8*3(%rsp) + mov @acc[4], 32+8*4(%rsp) + mov @acc[5], 32+8*5(%rsp) + + mov @acc[6], 32+8*6(%rsp) # t1 + mov @acc[7], 32+8*7(%rsp) + mov @acc[8], 32+8*8(%rsp) + mov @acc[9], 32+8*9(%rsp) + mov @acc[10], 32+8*10(%rsp) + mov @acc[11], 32+8*11(%rsp) + mov $r_ptr, 32+8*12(%rsp) + + ################################# mul_mont_384(ret->im, a->re, a->im, mod, n0); + #mov 8*2(%rsp), $a_ptr # a->re + lea 48($a_ptr), $b_ptr # a->im + + mov 48($a_ptr), %rdx + mov 8*0($a_ptr), %r14 # @acc[6] + mov 8*1($a_ptr), %r15 # @acc[7] + mov 8*2($a_ptr), %rax # @acc[8] + mov 8*3($a_ptr), %r12 # @acc[4] + mov 8*4($a_ptr), %rdi # $lo + mov 8*5($a_ptr), %rbp # $hi + lea -128($a_ptr), $a_ptr # control u-op density + lea -128($n_ptr), $n_ptr # control u-op density + + mulx %r14, %r8, %r9 + call __mulx_mont_383_nonred +___ +{ +my @acc = map("%r$_","dx",15,"ax",12,"di","bp", # output from __mulx_mont_384 + 8..11,13,14); +$code.=<<___; + add @acc[0], @acc[0] # add with itself + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + + mov @acc[0], 8*6($b_ptr) # ret->im + mov @acc[1], 8*7($b_ptr) + mov @acc[2], 8*8($b_ptr) + mov @acc[3], 8*9($b_ptr) + mov @acc[4], 8*10($b_ptr) + mov @acc[5], 8*11($b_ptr) +___ +} +$code.=<<___; + ################################# mul_mont_384(ret->re, t0, t1, mod, n0); + lea 32-128(%rsp), $a_ptr # t0 [+u-op density] + lea 32+8*6(%rsp), $b_ptr # t1 + + mov 32+8*6(%rsp), %rdx # t1[0] + mov 32+8*0(%rsp), %r14 # @acc[6] + mov 32+8*1(%rsp), %r15 # @acc[7] + mov 32+8*2(%rsp), %rax # @acc[8] + mov 32+8*3(%rsp), %r12 # @acc[4] + mov 32+8*4(%rsp), %rdi # $lo + mov 32+8*5(%rsp), %rbp # $hi + #lea -128($a_ptr), $a_ptr # control u-op density + #lea -128($n_ptr), $n_ptr # control u-op density + + mulx %r14, %r8, %r9 + call __mulx_mont_383_nonred +___ +{ +my @acc = map("%r$_","dx",15,"ax",12,"di","bp", # output from __mulx_mont_384 + 8..11,13,14); +$code.=<<___; + mov 32+8*12(%rsp), @acc[11] # account for sign from a->re - a->im + lea 128($n_ptr), $n_ptr + mov 32+8*0(%rsp), @acc[6] + and @acc[11], @acc[6] + mov 32+8*1(%rsp), @acc[7] + and @acc[11], @acc[7] + mov 32+8*2(%rsp), @acc[8] + and 
@acc[11], @acc[8] + mov 32+8*3(%rsp), @acc[9] + and @acc[11], @acc[9] + mov 32+8*4(%rsp), @acc[10] + and @acc[11], @acc[10] + and 32+8*5(%rsp), @acc[11] + + sub @acc[6], @acc[0] + mov 8*0($n_ptr), @acc[6] + sbb @acc[7], @acc[1] + mov 8*1($n_ptr), @acc[7] + sbb @acc[8], @acc[2] + mov 8*2($n_ptr), @acc[8] + sbb @acc[9], @acc[3] + mov 8*3($n_ptr), @acc[9] + sbb @acc[10], @acc[4] + mov 8*4($n_ptr), @acc[10] + sbb @acc[11], @acc[5] + sbb @acc[11], @acc[11] + + and @acc[11], @acc[6] + and @acc[11], @acc[7] + and @acc[11], @acc[8] + and @acc[11], @acc[9] + and @acc[11], @acc[10] + and 8*5($n_ptr), @acc[11] + + add @acc[6], @acc[0] + adc @acc[7], @acc[1] + adc @acc[8], @acc[2] + adc @acc[9], @acc[3] + adc @acc[10], @acc[4] + adc @acc[11], @acc[5] + + mov @acc[0], 8*0($b_ptr) # ret->re + mov @acc[1], 8*1($b_ptr) + mov @acc[2], 8*2($b_ptr) + mov @acc[3], 8*3($b_ptr) + mov @acc[4], 8*4($b_ptr) + mov @acc[5], 8*5($b_ptr) +___ +} +$code.=<<___; + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size sqrx_mont_382x,.-sqrx_mont_382x +___ +} + +print $code; +close STDOUT; diff --git a/src/asm/sha256-x86_64.pl b/src/asm/sha256-x86_64.pl new file mode 100755 index 00000000..4d855660 --- /dev/null +++ b/src/asm/sha256-x86_64.pl @@ -0,0 +1,788 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# ==================================================================== +# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL +# project. +# ==================================================================== +# +# sha256_block procedure for x86_64. +# +# This module is stripped of AVX and even scalar code paths, with +# raionale that +# +# a) AVX1 is [justifiably] faster than SSSE3 code path only on *one* +# processor, venerable Sandy Bridge; +# b) AVX2 incurs costly power transitions, which would be justifiable +# if AVX2 code was executing most of the time, which is not the +# case in the context; +# c) all comtemporary processors support SSSE3, so that nobody would +# actually use scalar code path anyway; +# +# See original module at CRYPTOGAMS for further details. 
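For orientation, here is a plain-Perl sketch of the one-block SHA-256 compression that both remaining code paths (SSSE3 and SHA Extensions) implement. It is a reading aid only, not part of the generated module, and the helper names (rotr32, sha256_block_ref) are illustrative; the round constants match the K256 table below and the rotation amounts match @Sigma0/@Sigma1/@sigma0/@sigma1.

use strict;
use warnings;

# Plain-Perl one-block SHA-256 compression. $h is a reference to the eight
# 32-bit state words, $block is a 64-byte string. 64-bit perl assumed.
sub rotr32 { my ($x,$n) = @_; (($x >> $n) | ($x << (32-$n))) & 0xffffffff }

sub sha256_block_ref {
    my ($h, $block) = @_;
    my @K = map { hex } qw(
        428a2f98 71374491 b5c0fbcf e9b5dba5 3956c25b 59f111f1 923f82a4 ab1c5ed5
        d807aa98 12835b01 243185be 550c7dc3 72be5d74 80deb1fe 9bdc06a7 c19bf174
        e49b69c1 efbe4786 0fc19dc6 240ca1cc 2de92c6f 4a7484aa 5cb0a9dc 76f988da
        983e5152 a831c66d b00327c8 bf597fc7 c6e00bf3 d5a79147 06ca6351 14292967
        27b70a85 2e1b2138 4d2c6dfc 53380d13 650a7354 766a0abb 81c2c92e 92722c85
        a2bfe8a1 a81a664b c24b8b70 c76c51a3 d192e819 d6990624 f40e3585 106aa070
        19a4c116 1e376c08 2748774c 34b0bcb5 391c0cb3 4ed8aa4a 5b9cca4f 682e6ff3
        748f82ee 78a5636f 84c87814 8cc70208 90befffa a4506ceb bef9a3f7 c67178f2);
    my @W = unpack("N16", $block);              # 16 big-endian message words
    for my $i (16..63) {                        # message schedule expansion
        my $s0 = rotr32($W[$i-15],7) ^ rotr32($W[$i-15],18) ^ ($W[$i-15]>>3);
        my $s1 = rotr32($W[$i-2],17) ^ rotr32($W[$i-2],19)  ^ ($W[$i-2]>>10);
        $W[$i] = ($W[$i-16] + $s0 + $W[$i-7] + $s1) & 0xffffffff;
    }
    my ($a,$b,$c,$d,$e,$f,$g,$hh) = @$h;
    for my $i (0..63) {                         # 64 rounds
        my $S1  = rotr32($e,6) ^ rotr32($e,11) ^ rotr32($e,25);   # Sigma1(e)
        my $ch  = ($e & $f) ^ (~$e & $g);                         # Ch(e,f,g)
        my $t1  = ($hh + $S1 + $ch + $K[$i] + $W[$i]) & 0xffffffff;
        my $S0  = rotr32($a,2) ^ rotr32($a,13) ^ rotr32($a,22);   # Sigma0(a)
        my $maj = ($a & $b) ^ ($a & $c) ^ ($b & $c);              # Maj(a,b,c)
        ($hh,$g,$f,$e,$d,$c,$b,$a) =
            ($g,$f,$e,($d+$t1)&0xffffffff,$c,$b,$a,($t1+$S0+$maj)&0xffffffff);
    }
    my @v = ($a,$b,$c,$d,$e,$f,$g,$hh);
    $h->[$_] = ($h->[$_] + $v[$_]) & 0xffffffff for 0..7;
}

The SHA Extensions path below retires two such rounds per sha256rnds2 instruction, while the SSSE3 path expands four schedule words per iteration, carrying out the sigma0/sigma1 shifts and rotates in XMM registers.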
+ +$flavour = shift; +$output = pop; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +$func="sha256_block_data_order"; +$TABLE="K256"; +$SZ=4; +@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx", + "%r8d","%r9d","%r10d","%r11d"); +($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi"); +@Sigma0=( 2,13,22); +@Sigma1=( 6,11,25); +@sigma0=( 7,18, 3); +@sigma1=(17,19,10); +$rounds=64; + +$ctx="%rdi"; # 1st arg, zapped by $a3 +$inp="%rsi"; # 2nd arg +$Tbl="%rbp"; + +$_ctx="16*$SZ+0*8(%rsp)"; +$_inp="16*$SZ+1*8(%rsp)"; +$_end="16*$SZ+2*8(%rsp)"; +$framesz="16*$SZ+3*8"; + +$code=<<___; +.text + +.align 64 +.type $TABLE,\@object +$TABLE: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + + .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f + .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff + .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 + .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by \@dot-asm" +___ + +###################################################################### +# SIMD code paths +# +{{{ +###################################################################### +# Intel SHA Extensions implementation of SHA256 update function. 
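Judging from the pointer arithmetic in this module (the SSSE3 entry turns its third argument into an end pointer via shl \$4,%rdx followed by lea ($inp,%rdx,$SZ), and the shaext loop decrements $num once per 64-byte block), both entry points appear to take a pointer to the eight 32-bit state words, an input pointer, and a count of whole 64-byte blocks. A hypothetical usage sketch of the plain-Perl reference above under that same contract, hashing the pre-padded one-block message "abc"; sha256_blocks_ref and the variable names here are illustrative only.

# Hypothetical driver mirroring the (ctx, inp, num-of-64-byte-blocks) contract,
# built on sha256_block_ref from the sketch above.
sub sha256_blocks_ref {
    my ($h, $inp, $num) = @_;
    sha256_block_ref($h, substr($inp, 64*$_, 64)) for 0 .. $num-1;
}

my @ctx = map { hex } qw(6a09e667 bb67ae85 3c6ef372 a54ff53a
                         510e527f 9b05688c 1f83d9ab 5be0cd19);   # SHA-256 IV
my $block = "abc" . "\x80" . "\0" x 52 . pack("N2", 0, 24);      # padded "abc"
sha256_blocks_ref(\@ctx, $block, 1);
print join("", map { sprintf "%08x", $_ } @ctx), "\n";
# expected: ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad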
+# +my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx"); + +my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10)); +my @MSG=map("%xmm$_",(3..6)); + +$code.=<<___; +.globl sha256_block_data_order_shaext +.hidden sha256_block_data_order_shaext +.type sha256_block_data_order_shaext,\@function,3,"unwind" +.align 64 +sha256_block_data_order_shaext: +.cfi_startproc +___ +$code.=<<___ if ($win64); + sub \$0x58,%rsp +.cfi_adjust_cfa_offset 0x58 + movaps %xmm6,-0x58(%r11) +.cfi_offset %xmm6,-0x60 + movaps %xmm7,-0x48(%r11) +.cfi_offset %xmm7,-0x50 + movaps %xmm8,-0x38(%r11) +.cfi_offset %xmm8,-0x40 + movaps %xmm9,-0x28(%r11) +.cfi_offset %xmm9,-0x30 + movaps %xmm10,-0x18(%r11) +.cfi_offset %xmm10,-0x20 +.cfi_end_prologue +___ +$code.=<<___; + lea K256+0x80(%rip),$Tbl + movdqu ($ctx),$ABEF # DCBA + movdqu 16($ctx),$CDGH # HGFE + movdqa 0x100-0x80($Tbl),$TMP # byte swap mask + + pshufd \$0x1b,$ABEF,$Wi # ABCD + pshufd \$0xb1,$ABEF,$ABEF # CDAB + pshufd \$0x1b,$CDGH,$CDGH # EFGH + movdqa $TMP,$BSWAP # offload + palignr \$8,$CDGH,$ABEF # ABEF + punpcklqdq $Wi,$CDGH # CDGH + jmp .Loop_shaext + +.align 16 +.Loop_shaext: + movdqu ($inp),@MSG[0] + movdqu 0x10($inp),@MSG[1] + movdqu 0x20($inp),@MSG[2] + pshufb $TMP,@MSG[0] + movdqu 0x30($inp),@MSG[3] + + movdqa 0*16-0x80($Tbl),$Wi + paddd @MSG[0],$Wi + pshufb $TMP,@MSG[1] + movdqa $CDGH,$CDGH_SAVE # offload + sha256rnds2 $ABEF,$CDGH # 0-3 + pshufd \$0x0e,$Wi,$Wi + nop + movdqa $ABEF,$ABEF_SAVE # offload + sha256rnds2 $CDGH,$ABEF + + movdqa 1*16-0x80($Tbl),$Wi + paddd @MSG[1],$Wi + pshufb $TMP,@MSG[2] + sha256rnds2 $ABEF,$CDGH # 4-7 + pshufd \$0x0e,$Wi,$Wi + lea 0x40($inp),$inp + sha256msg1 @MSG[1],@MSG[0] + sha256rnds2 $CDGH,$ABEF + + movdqa 2*16-0x80($Tbl),$Wi + paddd @MSG[2],$Wi + pshufb $TMP,@MSG[3] + sha256rnds2 $ABEF,$CDGH # 8-11 + pshufd \$0x0e,$Wi,$Wi + movdqa @MSG[3],$TMP + palignr \$4,@MSG[2],$TMP + nop + paddd $TMP,@MSG[0] + sha256msg1 @MSG[2],@MSG[1] + sha256rnds2 $CDGH,$ABEF + + movdqa 3*16-0x80($Tbl),$Wi + paddd @MSG[3],$Wi + sha256msg2 @MSG[3],@MSG[0] + sha256rnds2 $ABEF,$CDGH # 12-15 + pshufd \$0x0e,$Wi,$Wi + movdqa @MSG[0],$TMP + palignr \$4,@MSG[3],$TMP + nop + paddd $TMP,@MSG[1] + sha256msg1 @MSG[3],@MSG[2] + sha256rnds2 $CDGH,$ABEF +___ +for($i=4;$i<16-3;$i++) { +$code.=<<___; + movdqa $i*16-0x80($Tbl),$Wi + paddd @MSG[0],$Wi + sha256msg2 @MSG[0],@MSG[1] + sha256rnds2 $ABEF,$CDGH # 16-19... 
+ pshufd \$0x0e,$Wi,$Wi + movdqa @MSG[1],$TMP + palignr \$4,@MSG[0],$TMP + nop + paddd $TMP,@MSG[2] + sha256msg1 @MSG[0],@MSG[3] + sha256rnds2 $CDGH,$ABEF +___ + push(@MSG,shift(@MSG)); +} +$code.=<<___; + movdqa 13*16-0x80($Tbl),$Wi + paddd @MSG[0],$Wi + sha256msg2 @MSG[0],@MSG[1] + sha256rnds2 $ABEF,$CDGH # 52-55 + pshufd \$0x0e,$Wi,$Wi + movdqa @MSG[1],$TMP + palignr \$4,@MSG[0],$TMP + sha256rnds2 $CDGH,$ABEF + paddd $TMP,@MSG[2] + + movdqa 14*16-0x80($Tbl),$Wi + paddd @MSG[1],$Wi + sha256rnds2 $ABEF,$CDGH # 56-59 + pshufd \$0x0e,$Wi,$Wi + sha256msg2 @MSG[1],@MSG[2] + movdqa $BSWAP,$TMP + sha256rnds2 $CDGH,$ABEF + + movdqa 15*16-0x80($Tbl),$Wi + paddd @MSG[2],$Wi + nop + sha256rnds2 $ABEF,$CDGH # 60-63 + pshufd \$0x0e,$Wi,$Wi + dec $num + nop + sha256rnds2 $CDGH,$ABEF + + paddd $CDGH_SAVE,$CDGH + paddd $ABEF_SAVE,$ABEF + jnz .Loop_shaext + + pshufd \$0xb1,$CDGH,$CDGH # DCHG + pshufd \$0x1b,$ABEF,$TMP # FEBA + pshufd \$0xb1,$ABEF,$ABEF # BAFE + punpckhqdq $CDGH,$ABEF # DCBA + palignr \$8,$TMP,$CDGH # HGFE + + movdqu $ABEF,($ctx) + movdqu $CDGH,16($ctx) +___ +$code.=<<___ if ($win64); + movaps -0x58(%r11),%xmm6 + movaps -0x48(%r11),%xmm7 + movaps -0x38(%r11),%xmm8 + movaps -0x28(%r11),%xmm9 + movaps -0x18(%r11),%xmm10 + mov %r11,%rsp +.cfi_def_cfa %r11,8 +.cfi_epilogue +___ +$code.=<<___; + ret +.cfi_endproc +.size sha256_block_data_order_shaext,.-sha256_block_data_order_shaext +___ +}}} +{{{ + +my $a4=$T1; +my ($a,$b,$c,$d,$e,$f,$g,$h); + +sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; + my $arg = pop; + $arg = "\$$arg" if ($arg*1 eq $arg); + $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; +} + +sub body_00_15 () { + ( + '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. + + '&ror ($a0,$Sigma1[2]-$Sigma1[1])', + '&mov ($a,$a1)', + '&mov ($a4,$f)', + + '&ror ($a1,$Sigma0[2]-$Sigma0[1])', + '&xor ($a0,$e)', + '&xor ($a4,$g)', # f^g + + '&ror ($a0,$Sigma1[1]-$Sigma1[0])', + '&xor ($a1,$a)', + '&and ($a4,$e)', # (f^g)&e + + '&xor ($a0,$e)', + '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i] + '&mov ($a2,$a)', + + '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g + '&ror ($a1,$Sigma0[1]-$Sigma0[0])', + '&xor ($a2,$b)', # a^b, b^c in next round + + '&add ($h,$a4)', # h+=Ch(e,f,g) + '&ror ($a0,$Sigma1[0])', # Sigma1(e) + '&and ($a3,$a2)', # (b^c)&(a^b) + + '&xor ($a1,$a)', + '&add ($h,$a0)', # h+=Sigma1(e) + '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) + + '&ror ($a1,$Sigma0[0])', # Sigma0(a) + '&add ($d,$h)', # d+=h + '&add ($h,$a3)', # h+=Maj(a,b,c) + + '&mov ($a0,$d)', + '&add ($a1,$h);'. 
# h+=Sigma0(a) + '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' + ); +} + +###################################################################### +# SSSE3 code path +# +{ +my $Tbl = $inp; +my $_ctx="0(%rbp)"; +my $_inp="8(%rbp)"; +my $_end="16(%rbp)"; +my $framesz=4*8+$win64*16*4+8; + +my @X = map("%xmm$_",(0..3)); +my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9)); + +$code.=<<___; +.globl ${func} +.hidden ${func} +.type ${func},\@function,3,"unwind" +.align 64 +${func}: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + shl \$4,%rdx # num*16 + sub \$$framesz,%rsp +.cfi_adjust_cfa_offset $framesz + lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ + mov $ctx,0(%rsp) # save ctx, 1st arg + #mov $inp,8(%rsp) # save inp, 2nd arg + mov %rdx,16(%rsp) # save end pointer, "3rd" arg +___ +$code.=<<___ if ($win64); + movaps %xmm6,0x20(%rsp) +.cfi_offset %xmm6,-0x78 + movaps %xmm7,0x30(%rsp) +.cfi_offset %xmm7,-0x68 + movaps %xmm8,0x40(%rsp) +.cfi_offset %xmm8,-0x58 + movaps %xmm9,0x50(%rsp) +.cfi_offset %xmm9,-0x48 +___ +$code.=<<___; + mov %rsp,%rbp +.cfi_def_cfa_register %rbp +.cfi_end_prologue + + lea -16*$SZ(%rsp),%rsp + mov $SZ*0($ctx),$A + and \$-64,%rsp # align stack + mov $SZ*1($ctx),$B + mov $SZ*2($ctx),$C + mov $SZ*3($ctx),$D + mov $SZ*4($ctx),$E + mov $SZ*5($ctx),$F + mov $SZ*6($ctx),$G + mov $SZ*7($ctx),$H +___ + +$code.=<<___; + #movdqa $TABLE+`$SZ*$rounds`+32(%rip),$t4 + #movdqa $TABLE+`$SZ*$rounds`+64(%rip),$t5 + jmp .Lloop_ssse3 +.align 16 +.Lloop_ssse3: + movdqa $TABLE+`$SZ*$rounds`(%rip),$t3 + mov $inp,$_inp # offload $inp + movdqu 0x00($inp),@X[0] + movdqu 0x10($inp),@X[1] + movdqu 0x20($inp),@X[2] + pshufb $t3,@X[0] + movdqu 0x30($inp),@X[3] + lea $TABLE(%rip),$Tbl + pshufb $t3,@X[1] + movdqa 0x00($Tbl),$t0 + movdqa 0x10($Tbl),$t1 + pshufb $t3,@X[2] + paddd @X[0],$t0 + movdqa 0x20($Tbl),$t2 + pshufb $t3,@X[3] + movdqa 0x30($Tbl),$t3 + paddd @X[1],$t1 + paddd @X[2],$t2 + paddd @X[3],$t3 + movdqa $t0,0x00(%rsp) + mov $A,$a1 + movdqa $t1,0x10(%rsp) + mov $B,$a3 + movdqa $t2,0x20(%rsp) + xor $C,$a3 # magic + movdqa $t3,0x30(%rsp) + mov $E,$a0 + jmp .Lssse3_00_47 + +.align 16 +.Lssse3_00_47: + sub \$`-16*$SZ`,$Tbl # size optimization +___ +sub Xupdate_256_SSSE3 () { + ( + '&movdqa ($t0,@X[1]);', + '&movdqa ($t3,@X[3])', + '&palignr ($t0,@X[0],$SZ)', # X[1..4] + '&palignr ($t3,@X[2],$SZ);', # X[9..12] + '&movdqa ($t1,$t0)', + '&movdqa ($t2,$t0);', + '&psrld ($t0,$sigma0[2])', + '&paddd (@X[0],$t3);', # X[0..3] += X[9..12] + '&psrld ($t2,$sigma0[0])', + '&pshufd ($t3,@X[3],0b11111010)',# X[14..15] + '&pslld ($t1,8*$SZ-$sigma0[1]);'. + '&pxor ($t0,$t2)', + '&psrld ($t2,$sigma0[1]-$sigma0[0]);'. + '&pxor ($t0,$t1)', + '&pslld ($t1,$sigma0[1]-$sigma0[0]);'. 
+ '&pxor ($t0,$t2);', + '&movdqa ($t2,$t3)', + '&pxor ($t0,$t1);', # sigma0(X[1..4]) + '&psrld ($t3,$sigma1[2])', + '&paddd (@X[0],$t0);', # X[0..3] += sigma0(X[1..4]) + '&psrlq ($t2,$sigma1[0])', + '&pxor ($t3,$t2);', + '&psrlq ($t2,$sigma1[1]-$sigma1[0])', + '&pxor ($t3,$t2)', + '&pshufb ($t3,$t4)', # sigma1(X[14..15]) + '&paddd (@X[0],$t3)', # X[0..1] += sigma1(X[14..15]) + '&pshufd ($t3,@X[0],0b01010000)',# X[16..17] + '&movdqa ($t2,$t3);', + '&psrld ($t3,$sigma1[2])', + '&psrlq ($t2,$sigma1[0])', + '&pxor ($t3,$t2);', + '&psrlq ($t2,$sigma1[1]-$sigma1[0])', + '&pxor ($t3,$t2);', + '&movdqa ($t2,16*$j."($Tbl)")', + '&pshufb ($t3,$t5)', + '&paddd (@X[0],$t3)' # X[2..3] += sigma1(X[16..17]) + ); +} + +sub SSSE3_256_00_47 () { +my $j = shift; +my $body = shift; +my @X = @_; +my @insns = (&$body,&$body,&$body,&$body); # 104 instructions + + if (0) { + foreach (Xupdate_256_SSSE3()) { # 36 instructions + eval; + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + } + } else { # squeeze extra 4% on Westmere and 19% on Atom + eval(shift(@insns)); #@ + &movdqa ($t0,@X[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &movdqa ($t3,@X[3]); + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + eval(shift(@insns)); + &palignr ($t0,@X[0],$SZ); # X[1..4] + eval(shift(@insns)); + eval(shift(@insns)); + &palignr ($t3,@X[2],$SZ); # X[9..12] + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + &movdqa ($t1,$t0); + eval(shift(@insns)); + eval(shift(@insns)); + &movdqa ($t2,$t0); + eval(shift(@insns)); #@ + eval(shift(@insns)); + &psrld ($t0,$sigma0[2]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &paddd (@X[0],$t3); # X[0..3] += X[9..12] + eval(shift(@insns)); #@ + eval(shift(@insns)); + &psrld ($t2,$sigma0[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &pshufd ($t3,@X[3],0b11111010); # X[4..15] + eval(shift(@insns)); + eval(shift(@insns)); #@ + &pslld ($t1,8*$SZ-$sigma0[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t0,$t2); + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + &psrld ($t2,$sigma0[1]-$sigma0[0]); + eval(shift(@insns)); + &pxor ($t0,$t1); + eval(shift(@insns)); + eval(shift(@insns)); + &pslld ($t1,$sigma0[1]-$sigma0[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t0,$t2); + eval(shift(@insns)); + eval(shift(@insns)); #@ + &movdqa ($t2,$t3); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t0,$t1); # sigma0(X[1..4]) + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + &psrld ($t3,$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4]) + eval(shift(@insns)); #@ + eval(shift(@insns)); + &psrlq ($t2,$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t3,$t2); + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + &psrlq ($t2,$sigma1[1]-$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t3,$t2); + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + #&pshufb ($t3,$t4); # sigma1(X[14..15]) + &pshufd ($t3,$t3,0b10000000); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &psrldq ($t3,8); + eval(shift(@insns)); + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15]) + 
eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &pshufd ($t3,@X[0],0b01010000); # X[16..17] + eval(shift(@insns)); + eval(shift(@insns)); #@ + eval(shift(@insns)); + &movdqa ($t2,$t3); + eval(shift(@insns)); + eval(shift(@insns)); + &psrld ($t3,$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); #@ + &psrlq ($t2,$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t3,$t2); + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + eval(shift(@insns)); + &psrlq ($t2,$sigma1[1]-$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t3,$t2); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + #&pshufb ($t3,$t5); + &pshufd ($t3,$t3,0b00001000); + eval(shift(@insns)); + eval(shift(@insns)); + &movdqa ($t2,16*$j."($Tbl)"); + eval(shift(@insns)); #@ + eval(shift(@insns)); + &pslldq ($t3,8); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17]) + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + } + &paddd ($t2,@X[0]); + foreach (@insns) { eval; } # remaining instructions + &movdqa (16*$j."(%rsp)",$t2); +} + + for ($i=0,$j=0; $j<4; $j++) { + &SSSE3_256_00_47($j,\&body_00_15,@X); + push(@X,shift(@X)); # rotate(@X) + } + &cmpb ($SZ-1+16*$SZ."($Tbl)",0); + &jne (".Lssse3_00_47"); + + for ($i=0; $i<16; ) { + foreach(body_00_15()) { eval; } + } +$code.=<<___; + mov $_ctx,$ctx + mov $a1,$A + mov $_inp,$inp + + add $SZ*0($ctx),$A + add $SZ*1($ctx),$B + add $SZ*2($ctx),$C + add $SZ*3($ctx),$D + add $SZ*4($ctx),$E + add $SZ*5($ctx),$F + add $SZ*6($ctx),$G + add $SZ*7($ctx),$H + + lea 16*$SZ($inp),$inp + cmp $_end,$inp + + mov $A,$SZ*0($ctx) + mov $B,$SZ*1($ctx) + mov $C,$SZ*2($ctx) + mov $D,$SZ*3($ctx) + mov $E,$SZ*4($ctx) + mov $F,$SZ*5($ctx) + mov $G,$SZ*6($ctx) + mov $H,$SZ*7($ctx) + jb .Lloop_ssse3 + + xorps %xmm0, %xmm0 + lea $framesz+6*8(%rbp),%r11 +.cfi_def_cfa %r11,8 + movaps %xmm0, 0x00(%rsp) # scrub the stack + movaps %xmm0, 0x10(%rsp) + movaps %xmm0, 0x20(%rsp) + movaps %xmm0, 0x30(%rsp) +___ +$code.=<<___ if ($win64); + movaps 0x20(%rbp),%xmm6 + movaps 0x30(%rbp),%xmm7 + movaps 0x40(%rbp),%xmm8 + movaps 0x50(%rbp),%xmm9 +___ +$code.=<<___; + mov $framesz(%rbp),%r15 +.cfi_restore %r15 + mov -40(%r11),%r14 +.cfi_restore %r14 + mov -32(%r11),%r13 +.cfi_restore %r13 + mov -24(%r11),%r12 +.cfi_restore %r12 + mov -16(%r11),%rbx +.cfi_restore %rbx + mov -8(%r11),%rbp +.cfi_restore %rbp +.cfi_epilogue + lea (%r11),%rsp + ret +.cfi_endproc +.size ${func},.-${func} +___ +} +}}} +{ +my ($out,$inp,$len) = $win64 ? 
("%rcx","%rdx","%r8") : # Win64 order + ("%rdi","%rsi","%rdx"); # Unix order +$code.=<<___; +.globl sha256_emit +.hidden sha256_emit +.type sha256_emit,\@abi-omnipotent +.align 16 +sha256_emit: + mov 0($inp), %r8 + mov 8($inp), %r9 + mov 16($inp), %r10 + bswap %r8 + mov 24($inp), %r11 + bswap %r9 + mov %r8d, 4($out) + bswap %r10 + mov %r9d, 12($out) + bswap %r11 + mov %r10d, 20($out) + shr \$32, %r8 + mov %r11d, 28($out) + shr \$32, %r9 + mov %r8d, 0($out) + shr \$32, %r10 + mov %r9d, 8($out) + shr \$32, %r11 + mov %r10d, 16($out) + mov %r11d, 24($out) + ret +.size sha256_emit,.-sha256_emit + +.globl sha256_bcopy +.hidden sha256_bcopy +.type sha256_bcopy,\@abi-omnipotent +.align 16 +sha256_bcopy: + sub $inp, $out +.Loop_bcopy: + movzb ($inp), %eax + lea 1($inp), $inp + mov %al, -1($out,$inp) + dec $len + jnz .Loop_bcopy + ret +.size sha256_bcopy,.-sha256_bcopy + +.globl sha256_hcopy +.hidden sha256_hcopy +.type sha256_hcopy,\@abi-omnipotent +.align 16 +sha256_hcopy: + mov 0($inp), %r8 + mov 8($inp), %r9 + mov 16($inp), %r10 + mov 24($inp), %r11 + mov %r8, 0($out) + mov %r9, 8($out) + mov %r10, 16($out) + mov %r11, 24($out) + ret +.size sha256_hcopy,.-sha256_hcopy +___ +} + +sub sha256op38 { + my $instr = shift; + my %opcodelet = ( + "sha256rnds2" => 0xcb, + "sha256msg1" => 0xcc, + "sha256msg2" => 0xcd ); + + if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) { + my @opcode=(0x0f,0x38); + push @opcode,$opcodelet{$instr}; + push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M + return ".byte\t".join(',',@opcode); + } else { + return $instr."\t".@_[0]; + } +} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + + s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo; + + print $_,"\n"; +} +close STDOUT; diff --git a/src/asm/x86_64-xlate.pl b/src/asm/x86_64-xlate.pl new file mode 100755 index 00000000..6ae25e00 --- /dev/null +++ b/src/asm/x86_64-xlate.pl @@ -0,0 +1,1779 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Ascetic x86_64 AT&T to MASM/NASM assembler translator by @dot-asm. +# +# Why AT&T to MASM and not vice versa? Several reasons. Because AT&T +# format is way easier to parse. Because it's simpler to "gear" from +# Unix ABI to Windows one [see cross-reference "card" at the end of +# file]. Because Linux targets were available first... +# +# In addition the script also "distills" code suitable for GNU +# assembler, so that it can be compiled with more rigid assemblers, +# such as Solaris /usr/ccs/bin/as. +# +# This translator is not designed to convert *arbitrary* assembler +# code from AT&T format to MASM one. It's designed to convert just +# enough to provide for dual-ABI OpenSSL modules development... +# There *are* limitations and you might have to modify your assembler +# code or this script to achieve the desired result... +# +# Currently recognized limitations: +# +# - can't use multiple ops per line; +# +# Dual-ABI styling rules. +# +# 1. Adhere to Unix register and stack layout [see cross-reference +# ABI "card" at the end for explanation]. +# 2. Forget about "red zone," stick to more traditional blended +# stack frame allocation. If volatile storage is actually required +# that is. If not, just leave the stack as is. +# 3. Functions tagged with ".type name,@function" get crafted with +# unified Win64 prologue and epilogue automatically. 
If you want +# to take care of ABI differences yourself, tag functions as +# ".type name,@abi-omnipotent" instead. +# 4. To optimize the Win64 prologue you can specify number of input +# arguments as ".type name,@function,N." Keep in mind that if N is +# larger than 6, then you *have to* write "abi-omnipotent" code, +# because >6 cases can't be addressed with unified prologue. +# 5. Name local labels as .L*, do *not* use dynamic labels such as 1: +# (sorry about latter). +# 6. Don't use [or hand-code with .byte] "rep ret." "ret" mnemonic is +# required to identify the spots, where to inject Win64 epilogue! +# But on the pros, it's then prefixed with rep automatically:-) +# 7. Stick to explicit ip-relative addressing. If you have to use +# GOTPCREL addressing, stick to mov symbol@GOTPCREL(%rip),%r??. +# Both are recognized and translated to proper Win64 addressing +# modes. +# +# 8. In order to provide for structured exception handling unified +# Win64 prologue copies %rsp value to %rax. [Unless function is +# tagged with additional .type tag.] For further details see SEH +# paragraph at the end. +# 9. .init segment is allowed to contain calls to functions only. +# a. If function accepts more than 4 arguments *and* >4th argument +# is declared as non 64-bit value, do clear its upper part. + + +use strict; + +my $flavour = shift; +my $output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +open STDOUT,">$output" || die "can't open $output: $!" + if (defined($output)); + +my $gas=1; $gas=0 if ($output =~ /\.asm$/); +my $elf=1; $elf=0 if (!$gas); +my $dwarf=$elf; +my $win64=0; +my $prefix=""; +my $decor=".L"; + +my $masmref=8 + 50727*2**-32; # 8.00.50727 shipped with VS2005 +my $masm=0; +my $PTR=" PTR"; + +my $nasmref=2.03; +my $nasm=0; + +if ($flavour eq "mingw64") { $gas=1; $elf=0; $win64=1; + $prefix=`echo __USER_LABEL_PREFIX__ | \${CC:-false} -E -P -`; + $prefix =~ s|\R$||; # Better chomp + } +elsif ($flavour eq "macosx") { $gas=1; $elf=0; $prefix="_"; $decor="L\$"; } +elsif ($flavour eq "masm") { $gas=0; $elf=0; $masm=$masmref; $win64=1; $decor="\$L\$"; } +elsif ($flavour eq "nasm") { $gas=0; $elf=0; $nasm=$nasmref; $win64=1; $decor="\$L\$"; $PTR=""; } +elsif (!$gas) +{ if ($ENV{ASM} =~ m/nasm/ && `nasm -v` =~ m/version ([0-9]+)\.([0-9]+)/i) + { $nasm = $1 + $2*0.01; $PTR=""; } + elsif (`ml64 2>&1` =~ m/Version ([0-9]+)\.([0-9]+)(\.([0-9]+))?/) + { $masm = $1 + $2*2**-16 + $4*2**-32; } + die "no assembler found on %PATH%" if (!($nasm || $masm)); + $win64=1; + $elf=0; + $decor="\$L\$"; +} + +$dwarf=0 if($win64); + +my $current_segment; +my $current_function; +my %globals; + +{ package opcode; # pick up opcodes + sub re { + my ($class, $line) = @_; + my $self = {}; + my $ret; + + if ($$line =~ /^([a-z][a-z0-9]*)/i) { + bless $self,$class; + $self->{op} = $1; + $ret = $self; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + + undef $self->{sz}; + if ($self->{op} =~ /^(movz)x?([bw]).*/) { # movz is pain... 
+ $self->{op} = $1; + $self->{sz} = $2; + } elsif ($self->{op} =~ /call|jmp/) { + $self->{sz} = ""; + } elsif ($self->{op} =~ /^p/ && $' !~ /^(ush|op|insrw)/) { # SSEn + $self->{sz} = ""; + } elsif ($self->{op} =~ /^[vk]/) { # VEX or k* such as kmov + $self->{sz} = ""; + } elsif ($self->{op} =~ /mov[dq]/ && $$line =~ /%xmm/) { + $self->{sz} = ""; + } elsif ($self->{op} =~ /([a-z]{3,})([qlwb])$/) { + $self->{op} = $1; + $self->{sz} = $2; + } + } + $ret; + } + sub size { + my ($self, $sz) = @_; + $self->{sz} = $sz if (defined($sz) && !defined($self->{sz})); + $self->{sz}; + } + sub out { + my $self = shift; + if ($gas) { + if ($self->{op} eq "movz") { # movz is pain... + sprintf "%s%s%s",$self->{op},$self->{sz},shift; + } elsif ($self->{op} =~ /^set/) { + "$self->{op}"; + } elsif ($self->{op} eq "ret") { + my $epilogue = ""; + if ($win64 && $current_function->{abi} eq "svr4" + && !$current_function->{unwind}) { + $epilogue = "movq 8(%rsp),%rdi\n\t" . + "movq 16(%rsp),%rsi\n\t"; + } + $epilogue . ".byte 0xf3,0xc3"; + } elsif ($self->{op} eq "call" && !$elf && $current_segment eq ".init") { + ".p2align\t3\n\t.quad"; + } else { + "$self->{op}$self->{sz}"; + } + } else { + $self->{op} =~ s/^movz/movzx/; + if ($self->{op} eq "ret") { + $self->{op} = ""; + if ($win64 && $current_function->{abi} eq "svr4" + && !$current_function->{unwind}) { + $self->{op} = "mov rdi,QWORD$PTR\[8+rsp\]\t;WIN64 epilogue\n\t". + "mov rsi,QWORD$PTR\[16+rsp\]\n\t"; + } + $self->{op} .= "DB\t0F3h,0C3h\t\t;repret"; + } elsif ($self->{op} =~ /^(pop|push)f/) { + $self->{op} .= $self->{sz}; + } elsif ($self->{op} eq "call" && $current_segment eq ".CRT\$XCU") { + $self->{op} = "\tDQ"; + } + $self->{op}; + } + } + sub mnemonic { + my ($self, $op) = @_; + $self->{op}=$op if (defined($op)); + $self->{op}; + } +} +{ package const; # pick up constants, which start with $ + sub re { + my ($class, $line) = @_; + my $self = {}; + my $ret; + + if ($$line =~ /^\$([^,]+)/) { + bless $self, $class; + $self->{value} = $1; + $ret = $self; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + } + $ret; + } + sub out { + my $self = shift; + + $self->{value} =~ s/\b(0b[0-1]+)/oct($1)/eig; + if ($gas) { + # Solaris /usr/ccs/bin/as can't handle multiplications + # in $self->{value} + my $value = $self->{value}; + no warnings; # oct might complain about overflow, ignore here... 
+ $value =~ s/(?{value} = $value; + } + sprintf "\$%s",$self->{value}; + } else { + my $value = $self->{value}; + $value =~ s/0x([0-9a-f]+)/0$1h/ig if ($masm); + sprintf "%s",$value; + } + } +} +{ package ea; # pick up effective addresses: expr(%reg,%reg,scale) + + my %szmap = ( b=>"BYTE$PTR", w=>"WORD$PTR", + l=>"DWORD$PTR", d=>"DWORD$PTR", + q=>"QWORD$PTR", o=>"OWORD$PTR", + x=>"XMMWORD$PTR", y=>"YMMWORD$PTR", + z=>"ZMMWORD$PTR" ) if (!$gas); + + my %sifmap = ( ss=>"d", sd=>"q", # broadcast only + i32x2=>"q", f32x2=>"q", + i32x4=>"x", i64x2=>"x", i128=>"x", + f32x4=>"x", f64x2=>"x", f128=>"x", + i32x8=>"y", i64x4=>"y", + f32x8=>"y", f64x4=>"y" ) if (!$gas); + + sub re { + my ($class, $line, $opcode) = @_; + my $self = {}; + my $ret; + + # optional * ----vvv--- appears in indirect jmp/call + if ($$line =~ /^(\*?)([^\(,]*)\(([%\w,]+)\)((?:{[^}]+})*)/) { + bless $self, $class; + $self->{asterisk} = $1; + $self->{label} = $2; + ($self->{base},$self->{index},$self->{scale})=split(/,/,$3); + $self->{scale} = 1 if (!defined($self->{scale})); + $self->{opmask} = $4; + $ret = $self; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + + if ($win64 && $self->{label} =~ s/\@GOTPCREL//) { + die if ($opcode->mnemonic() ne "mov"); + $opcode->mnemonic("lea"); + } + $self->{base} =~ s/^%//; + $self->{index} =~ s/^%// if (defined($self->{index})); + $self->{opcode} = $opcode; + } + $ret; + } + sub size {} + sub out { + my ($self, $sz) = @_; + + $self->{label} =~ s/([_a-z][_a-z0-9]*)/$globals{$1} or $1/gei; + $self->{label} =~ s/\.L/$decor/g; + + # Silently convert all EAs to 64-bit. This is required for + # elder GNU assembler and results in more compact code, + # *but* most importantly AES module depends on this feature! + $self->{index} =~ s/^[er](.?[0-9xpi])[d]?$/r\1/; + $self->{base} =~ s/^[er](.?[0-9xpi])[d]?$/r\1/; + + # Solaris /usr/ccs/bin/as can't handle multiplications + # in $self->{label}... + use integer; + $self->{label} =~ s/(?{label} =~ s/\b([0-9]+\s*[\*\/\%]\s*[0-9]+)\b/eval($1)/eg; + + # Some assemblers insist on signed presentation of 32-bit + # offsets, but sign extension is a tricky business in perl... 
+ $self->{label} =~ s/\b([0-9]+)\b/unpack("l",pack("L",$1))/eg; + + # if base register is %rbp or %r13, see if it's possible to + # flip base and index registers [for better performance] + if (!$self->{label} && $self->{index} && $self->{scale}==1 && + $self->{base} =~ /(rbp|r13)/) { + $self->{base} = $self->{index}; $self->{index} = $1; + } + + if ($gas) { + $self->{label} =~ s/^___imp_/__imp__/ if ($flavour eq "mingw64"); + + if (defined($self->{index})) { + sprintf "%s%s(%s,%%%s,%d)%s", + $self->{asterisk},$self->{label}, + $self->{base}?"%$self->{base}":"", + $self->{index},$self->{scale}, + $self->{opmask}; + } else { + sprintf "%s%s(%%%s)%s", $self->{asterisk},$self->{label}, + $self->{base},$self->{opmask}; + } + } else { + $self->{label} =~ s/\./\$/g; + $self->{label} =~ s/(?{label} = "($self->{label})" if ($self->{label} =~ /[\*\+\-\/]/); + + my $mnemonic = $self->{opcode}->mnemonic(); + ($self->{asterisk}) && ($sz="q") || + ($mnemonic =~ /^v?mov([qd])$/) && ($sz=$1) || + ($mnemonic =~ /^v?pinsr([qdwb])$/) && ($sz=$1) || + ($mnemonic =~ /^vpbroadcast([qdwb])$/) && ($sz=$1) || + ($mnemonic =~ /^v(?:broadcast|extract|insert)([sif]\w+)$/) + && ($sz=$sifmap{$1}); + + $self->{opmask} =~ s/%(k[0-7])/$1/; + + if (defined($self->{index})) { + sprintf "%s[%s%s*%d%s]%s",$szmap{$sz}, + $self->{label}?"$self->{label}+":"", + $self->{index},$self->{scale}, + $self->{base}?"+$self->{base}":"", + $self->{opmask}; + } elsif ($self->{base} eq "rip") { + sprintf "%s[%s]",$szmap{$sz},$self->{label}; + } else { + sprintf "%s[%s%s]%s", $szmap{$sz}, + $self->{label}?"$self->{label}+":"", + $self->{base},$self->{opmask}; + } + } + } +} +{ package register; # pick up registers, which start with %. + sub re { + my ($class, $line, $opcode) = @_; + my $self = {}; + my $ret; + + # optional * ----vvv--- appears in indirect jmp/call + if ($$line =~ /^(\*?)%(\w+)((?:{[^}]+})*)/) { + bless $self,$class; + $self->{asterisk} = $1; + $self->{value} = $2; + $self->{opmask} = $3; + $opcode->size($self->size()); + $ret = $self; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + } + $ret; + } + sub size { + my $self = shift; + my $ret; + + if ($self->{value} =~ /^r[\d]+b$/i) { $ret="b"; } + elsif ($self->{value} =~ /^r[\d]+w$/i) { $ret="w"; } + elsif ($self->{value} =~ /^r[\d]+d$/i) { $ret="l"; } + elsif ($self->{value} =~ /^r[\w]+$/i) { $ret="q"; } + elsif ($self->{value} =~ /^[a-d][hl]$/i){ $ret="b"; } + elsif ($self->{value} =~ /^[\w]{2}l$/i) { $ret="b"; } + elsif ($self->{value} =~ /^[\w]{2}$/i) { $ret="w"; } + elsif ($self->{value} =~ /^e[a-z]{2}$/i){ $ret="l"; } + + $ret; + } + sub out { + my $self = shift; + if ($gas) { sprintf "%s%%%s%s", $self->{asterisk}, + $self->{value}, + $self->{opmask}; } + else { $self->{opmask} =~ s/%(k[0-7])/$1/; + $self->{value}.$self->{opmask}; } + } +} +{ package label; # pick up labels, which end with : + sub re { + my ($class, $line) = @_; + my $self = {}; + my $ret; + + if ($$line =~ /(^[\.\w]+)\:/) { + bless $self,$class; + $self->{value} = $1; + $ret = $self; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + + $self->{value} =~ s/^\.L/$decor/; + } + $ret; + } + sub out { + my $self = shift; + + if ($gas) { + my $func = ($globals{$self->{value}} or $self->{value}) . ":"; + if ($current_function->{name} eq $self->{value}) { + $func .= "\n.cfi_".cfi_directive::startproc() if ($dwarf); + $func .= "\n .byte 0xf3,0x0f,0x1e,0xfa\n"; # endbranch + if ($win64 && $current_function->{abi} eq "svr4") { + my $fp = $current_function->{unwind} ? 
"%r11" : "%rax"; + $func .= " movq %rdi,8(%rsp)\n"; + $func .= " movq %rsi,16(%rsp)\n"; + $func .= " movq %rsp,$fp\n"; + $func .= "${decor}SEH_begin_$current_function->{name}:\n"; + my $narg = $current_function->{narg}; + $narg=6 if (!defined($narg)); + $func .= " movq %rcx,%rdi\n" if ($narg>0); + $func .= " movq %rdx,%rsi\n" if ($narg>1); + $func .= " movq %r8,%rdx\n" if ($narg>2); + $func .= " movq %r9,%rcx\n" if ($narg>3); + $func .= " movq 40(%rsp),%r8\n" if ($narg>4); + $func .= " movq 48(%rsp),%r9\n" if ($narg>5); + } + } + $func; + } elsif ($self->{value} ne "$current_function->{name}") { + # Make all labels in masm global. + $self->{value} .= ":" if ($masm); + $self->{value} . ":"; + } elsif ($win64 && $current_function->{abi} eq "svr4") { + my $func = "$current_function->{name}" . + ($nasm ? ":" : "\tPROC $current_function->{scope}") . + "\n"; + my $fp = $current_function->{unwind} ? "r11" : "rax"; + $func .= " DB 243,15,30,250\n"; # endbranch + $func .= " mov QWORD$PTR\[8+rsp\],rdi\t;WIN64 prologue\n"; + $func .= " mov QWORD$PTR\[16+rsp\],rsi\n"; + $func .= " mov $fp,rsp\n"; + $func .= "${decor}SEH_begin_$current_function->{name}:"; + $func .= ":" if ($masm); + $func .= "\n"; + my $narg = $current_function->{narg}; + $narg=6 if (!defined($narg)); + $func .= " mov rdi,rcx\n" if ($narg>0); + $func .= " mov rsi,rdx\n" if ($narg>1); + $func .= " mov rdx,r8\n" if ($narg>2); + $func .= " mov rcx,r9\n" if ($narg>3); + $func .= " mov r8,QWORD$PTR\[40+rsp\]\n" if ($narg>4); + $func .= " mov r9,QWORD$PTR\[48+rsp\]\n" if ($narg>5); + $func .= "\n"; + } else { + "$current_function->{name}". + ($nasm ? ":" : "\tPROC $current_function->{scope}"). + "\n DB 243,15,30,250"; # endbranch + } + } +} +{ package expr; # pick up expressions + sub re { + my ($class, $line, $opcode) = @_; + my $self = {}; + my $ret; + + if ($$line =~ /(^[^,]+)/) { + bless $self,$class; + $self->{value} = $1; + $ret = $self; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + + $self->{value} =~ s/\@PLT// if (!$elf); + $self->{value} =~ s/([_a-z][_a-z0-9]*)/$globals{$1} or $1/gei; + $self->{value} =~ s/\.L/$decor/g; + $self->{opcode} = $opcode; + } + $ret; + } + sub out { + my $self = shift; + $self->{value}; + } +} + +my @xdata_seg = (".section .xdata", ".align 8"); +my @pdata_seg = (".section .pdata", ".align 4"); + +{ package cfi_directive; + # CFI directives annotate instructions that are significant for + # stack unwinding procedure compliant with DWARF specification, + # see http://dwarfstd.org/. Besides naturally expected for this + # script platform-specific filtering function, this module adds + # three auxiliary synthetic directives not recognized by [GNU] + # assembler: + # + # - .cfi_push to annotate push instructions in prologue, which + # translates to .cfi_adjust_cfa_offset (if needed) and + # .cfi_offset; + # - .cfi_pop to annotate pop instructions in epilogue, which + # translates to .cfi_adjust_cfa_offset (if needed) and + # .cfi_restore; + # - [and most notably] .cfi_cfa_expression which encodes + # DW_CFA_def_cfa_expression and passes it to .cfi_escape as + # byte vector; + # + # CFA expressions were introduced in DWARF specification version + # 3 and describe how to deduce CFA, Canonical Frame Address. This + # becomes handy if your stack frame is variable and you can't + # spare register for [previous] frame pointer. Suggested directive + # syntax is made-up mix of DWARF operator suffixes [subset of] + # and references to registers with optional bias. 
Following example + # describes offloaded *original* stack pointer at specific offset + # from *current* stack pointer: + # + # .cfi_cfa_expression %rsp+40,deref,+8 + # + # Final +8 has everything to do with the fact that CFA is defined + # as reference to top of caller's stack, and on x86_64 call to + # subroutine pushes 8-byte return address. In other words original + # stack pointer upon entry to a subroutine is 8 bytes off from CFA. + # + # In addition the .cfi directives are re-purposed even for Win64 + # stack unwinding. Two more synthetic directives were added: + # + # - .cfi_end_prologue to denote point when all non-volatile + # registers are saved and stack or [chosen] frame pointer is + # stable; + # - .cfi_epilogue to denote point when all non-volatile registers + # are restored [and it even adds missing .cfi_restore-s]; + # + # Though it's not universal "miracle cure," it has its limitations. + # Most notably .cfi_cfa_expression won't start working... For more + # information see the end of this file. + + # Below constants are taken from "DWARF Expressions" section of the + # DWARF specification, section is numbered 7.7 in versions 3 and 4. + my %DW_OP_simple = ( # no-arg operators, mapped directly + deref => 0x06, dup => 0x12, + drop => 0x13, over => 0x14, + pick => 0x15, swap => 0x16, + rot => 0x17, xderef => 0x18, + + abs => 0x19, and => 0x1a, + div => 0x1b, minus => 0x1c, + mod => 0x1d, mul => 0x1e, + neg => 0x1f, not => 0x20, + or => 0x21, plus => 0x22, + shl => 0x24, shr => 0x25, + shra => 0x26, xor => 0x27, + ); + + my %DW_OP_complex = ( # used in specific subroutines + constu => 0x10, # uleb128 + consts => 0x11, # sleb128 + plus_uconst => 0x23, # uleb128 + lit0 => 0x30, # add 0-31 to opcode + reg0 => 0x50, # add 0-31 to opcode + breg0 => 0x70, # add 0-31 to opcole, sleb128 + regx => 0x90, # uleb28 + fbreg => 0x91, # sleb128 + bregx => 0x92, # uleb128, sleb128 + piece => 0x93, # uleb128 + ); + + # Following constants are defined in x86_64 ABI supplement, for + # example available at https://www.uclibc.org/docs/psABI-x86_64.pdf, + # see section 3.7 "Stack Unwind Algorithm". + my %DW_reg_idx = ( + "%rax"=>0, "%rdx"=>1, "%rcx"=>2, "%rbx"=>3, + "%rsi"=>4, "%rdi"=>5, "%rbp"=>6, "%rsp"=>7, + "%r8" =>8, "%r9" =>9, "%r10"=>10, "%r11"=>11, + "%r12"=>12, "%r13"=>13, "%r14"=>14, "%r15"=>15 + ); + + my ($cfa_reg, $cfa_off, $cfa_rsp, %saved_regs); + my @cfa_stack; + + # [us]leb128 format is variable-length integer representation base + # 2^128, with most significant bit of each byte being 0 denoting + # *last* most significant digit. See "Variable Length Data" in the + # DWARF specification, numbered 7.6 at least in versions 3 and 4. + sub sleb128 { + use integer; # get right shift extend sign + + my $val = shift; + my $sign = ($val < 0) ? -1 : 0; + my @ret = (); + + while(1) { + push @ret, $val&0x7f; + + # see if remaining bits are same and equal to most + # significant bit of the current digit, if so, it's + # last digit... + last if (($val>>6) == $sign); + + @ret[-1] |= 0x80; + $val >>= 7; + } + + return @ret; + } + sub uleb128 { + my $val = shift; + my @ret = (); + + while(1) { + push @ret, $val&0x7f; + + # see if it's last significant digit... 
+ last if (($val >>= 7) == 0); + + @ret[-1] |= 0x80; + } + + return @ret; + } + sub const { + my $val = shift; + + if ($val >= 0 && $val < 32) { + return ($DW_OP_complex{lit0}+$val); + } + return ($DW_OP_complex{consts}, sleb128($val)); + } + sub reg { + my $val = shift; + + return if ($val !~ m/^(%r\w+)(?:([\+\-])((?:0x)?[0-9a-f]+))?/); + + my $reg = $DW_reg_idx{$1}; + my $off = eval ("0 $2 $3"); + + return (($DW_OP_complex{breg0} + $reg), sleb128($off)); + # Yes, we use DW_OP_bregX+0 to push register value and not + # DW_OP_regX, because latter would require even DW_OP_piece, + # which would be a waste under the circumstances. If you have + # to use DWP_OP_reg, use "regx:N"... + } + sub cfa_expression { + my $line = shift; + my @ret; + + foreach my $token (split(/,\s*/,$line)) { + if ($token =~ /^%r/) { + push @ret,reg($token); + } elsif ($token =~ /((?:0x)?[0-9a-f]+)\((%r\w+)\)/) { + push @ret,reg("$2+$1"); + } elsif ($token =~ /(\w+):(\-?(?:0x)?[0-9a-f]+)(U?)/i) { + my $i = 1*eval($2); + push @ret,$DW_OP_complex{$1}, ($3 ? uleb128($i) : sleb128($i)); + } elsif (my $i = 1*eval($token) or $token eq "0") { + if ($token =~ /^\+/) { + push @ret,$DW_OP_complex{plus_uconst},uleb128($i); + } else { + push @ret,const($i); + } + } else { + push @ret,$DW_OP_simple{$token}; + } + } + + # Finally we return DW_CFA_def_cfa_expression, 15, followed by + # length of the expression and of course the expression itself. + return (15,scalar(@ret),@ret); + } + + # Following constants are defined in "x64 exception handling" at + # https://docs.microsoft.com/ and match the register sequence in + # CONTEXT structure defined in winnt.h. + my %WIN64_reg_idx = ( + "%rax"=>0, "%rcx"=>1, "%rdx"=>2, "%rbx"=>3, + "%rsp"=>4, "%rbp"=>5, "%rsi"=>6, "%rdi"=>7, + "%r8" =>8, "%r9" =>9, "%r10"=>10, "%r11"=>11, + "%r12"=>12, "%r13"=>13, "%r14"=>14, "%r15"=>15 + ); + sub xdata { + our @dat = (); + our $len = 0; + + sub allocstack { + my $offset = shift; + + if ($offset) { + if ($offset <= 128) { + $offset = ($offset - 8) >> 3; + push @dat, [0,$offset<<4|2]; # UWOP_ALLOC_SMALL + } elsif ($offset < 0x80000) { + push @dat, [0,0x01,unpack("C2",pack("v",$offset>>3))]; + } else { + push @dat, [0,0x11,unpack("C4",pack("V",$offset))]; + } + $len += $#{@dat[-1]}+1; + } + } + + # allocate stack frame + if (my $offset = -8 - $cfa_rsp) { + # but see if frame pointer is among saved registers + if ($cfa_reg ne "%rsp" and my $fp_off = $saved_regs{$cfa_reg}) { + $fp_off = -8 - $fp_off; + allocstack($fp_off-8); + $offset -= $fp_off; + push @dat, [0,$WIN64_reg_idx{$cfa_reg}<<4]; # UWOP_PUSH_NONVOL + $len += $#{@dat[-1]}+1; + } + allocstack($offset); + } + # set up frame pointer + my $fp_info = 0; + if ($cfa_reg ne "%rsp") { + my $offset = $cfa_off - $cfa_rsp; + ($offset > 240 or $offset&0xf) and die "invalid FP offset $offset"; + $fp_info = ($offset&-16)|$WIN64_reg_idx{$cfa_reg}; + push @dat, [0,3]; # UWOP_SET_FPREG + $len += $#{@dat[-1]}+1; + } + # save registers + foreach my $key (sort { $saved_regs{$b} <=> $saved_regs{$a} } + keys(%saved_regs)) { + next if ($cfa_reg ne "%rsp" && $cfa_reg eq $key); + my $offset = $saved_regs{$key} - $cfa_rsp; + if ($key =~ /%xmm([0-9]+)/) { + if ($offset < 0x100000) { + push @dat, [0,($1<<4)|8,unpack("C2",pack("v",$offset>>4))]; + } else { + push @dat, [0,($1<<4)|9,unpack("C4",pack("V",$offset))]; + } + } else { + if ($offset < 0x80000) { + push @dat, [0,(($WIN64_reg_idx{$key})<<4)|4, + unpack("C2",pack("v",$offset>>3))]; + } else { + push @dat, [0,(($WIN64_reg_idx{$key})<<4)|5, + 
unpack("C4",pack("V",$offset))]; + } + } + $len += $#{@dat[-1]}+1; + } + + my @ret; + # generate 4-byte descriptor + push @ret, ".byte 1,0,".($len/2).",$fp_info"; + $len += 4; + # pad to 8*n + unshift @dat, [(0)x((-$len)&7)] if ($len&7); + # emit data + while(defined(my $row = pop @dat)) { + push @ret, ".byte ". join(",", + map { sprintf "0x%02x",$_ } @{$row}); + } + + return @ret; + } + sub startproc { + return if ($cfa_rsp == -8); + ($cfa_reg, $cfa_off, $cfa_rsp) = ("%rsp", -8, -8); + %saved_regs = (); + return "startproc"; + } + sub endproc { + return if ($cfa_rsp == 0); + ($cfa_reg, $cfa_off, $cfa_rsp) = ("%rsp", 0, 0); + %saved_regs = (); + return "endproc"; + } + sub re { + my ($class, $line) = @_; + my $self = {}; + my $ret; + + if ($$line =~ s/^\s*\.cfi_(\w+)\s*//) { + bless $self,$class; + $ret = $self; + undef $self->{value}; + my $dir = $1; + + SWITCH: for ($dir) { + # What is $cfa_rsp? Effectively it's difference between %rsp + # value and current CFA, Canonical Frame Address, which is + # why it starts with -8. Recall that CFA is top of caller's + # stack... + /startproc/ && do { $dir = startproc(); last; }; + /endproc/ && do { $dir = endproc(); + # .cfi_remember_state directives that are not + # matched with .cfi_restore_state are + # unnecessary. + die "unpaired .cfi_remember_state" if (@cfa_stack); + last; + }; + /def_cfa_register/ + && do { $cfa_off = $cfa_rsp if ($cfa_reg eq "%rsp"); + $cfa_reg = $$line; + last; + }; + /def_cfa_offset/ + && do { $cfa_off = -1*eval($$line); + $cfa_rsp = $cfa_off if ($cfa_reg eq "%rsp"); + last; + }; + /adjust_cfa_offset/ + && do { my $val = 1*eval($$line); + $cfa_off -= $val; + if ($cfa_reg eq "%rsp") { + $cfa_rsp -= $val; + } + last; + }; + /def_cfa/ && do { if ($$line =~ /(%r\w+)\s*,\s*(.+)/) { + $cfa_reg = $1; + $cfa_off = -1*eval($2); + $cfa_rsp = $cfa_off if ($cfa_reg eq "%rsp"); + } + last; + }; + /push/ && do { $dir = undef; + $cfa_rsp -= 8; + if ($cfa_reg eq "%rsp") { + $cfa_off = $cfa_rsp; + $self->{value} = ".cfi_adjust_cfa_offset\t8\n"; + } + $saved_regs{$$line} = $cfa_rsp; + $self->{value} .= ".cfi_offset\t$$line,$cfa_rsp"; + last; + }; + /pop/ && do { $dir = undef; + $cfa_rsp += 8; + if ($cfa_reg eq "%rsp") { + $cfa_off = $cfa_rsp; + $self->{value} = ".cfi_adjust_cfa_offset\t-8\n"; + } + $self->{value} .= ".cfi_restore\t$$line"; + delete $saved_regs{$$line}; + last; + }; + /cfa_expression/ + && do { $dir = undef; + $self->{value} = ".cfi_escape\t" . + join(",", map(sprintf("0x%02x", $_), + cfa_expression($$line))); + last; + }; + /remember_state/ + && do { push @cfa_stack, + [$cfa_reg,$cfa_off,$cfa_rsp,%saved_regs]; + last; + }; + /restore_state/ + && do { ($cfa_reg,$cfa_off,$cfa_rsp,%saved_regs) + = @{pop @cfa_stack}; + last; + }; + /offset/ && do { if ($$line =~ /(%\w+)\s*,\s*(.+)/) { + $saved_regs{$1} = 1*eval($2); + $dir = undef if ($1 =~ /%xmm/); + } + last; + }; + /restore/ && do { delete $saved_regs{$$line}; last; }; + /end_prologue/ + && do { $dir = undef; + $self->{win64} = ".endprolog"; + last; + }; + /epilogue/ && do { $dir = undef; + $self->{win64} = ".epilogue"; + $self->{value} = join("\n", + map { ".cfi_restore\t$_" } + sort keys(%saved_regs)); + %saved_regs = (); + last; + }; + } + + $self->{value} = ".cfi_$dir\t$$line" if ($dir); + + $$line = ""; + } + + return $ret; + } + sub out { + my $self = shift; + return $self->{value} if ($dwarf); + + if ($win64 and $current_function->{unwind} + and my $ret = $self->{win64}) { + my ($reg, $off) = ($cfa_reg =~ /%(?!rsp)/) ? 
($', $cfa_off) + : ("rsp", $cfa_rsp); + my $fname = $current_function->{name}; + + if ($ret eq ".endprolog") { + $saved_regs{"%rdi"} = 0; # relative to CFA, remember? + $saved_regs{"%rsi"} = 8; + + push @pdata_seg, + ".rva .LSEH_begin_${fname}", + ".rva .LSEH_body_${fname}", + ".rva .LSEH_info_${fname}_prologue",""; + push @xdata_seg, + ".LSEH_info_${fname}_prologue:", + ".byte 1,0,5,0x0b", # 5 unwind codes, %r11 is FP + ".byte 0,0x74,1,0", # %rdi at 8(%rsp) + ".byte 0,0x64,2,0", # %rsi at 16(%rsp) + ".byte 0,0x03", # set frame pointer + ".byte 0,0" # padding + ; + push @pdata_seg, + ".rva .LSEH_body_${fname}", + ".rva .LSEH_epilogue_${fname}", + ".rva .LSEH_info_${fname}_body",""; + push @xdata_seg,".LSEH_info_${fname}_body:", xdata(); + $ret = "${decor}SEH_body_${fname}:"; + $ret .= ":" if ($masm); $ret .= "\n"; + } elsif ($ret eq ".epilogue") { + %saved_regs = (); + $saved_regs{"%rdi"} = 0; # relative to CFA, remember? + $saved_regs{"%rsi"} = 8; + $cfa_rsp = $cfa_off; + + push @pdata_seg, + ".rva .LSEH_epilogue_${fname}", + ".rva .LSEH_end_${fname}", + ".rva .LSEH_info_${fname}_epilogue",""; + push @xdata_seg,".LSEH_info_${fname}_epilogue:", xdata(), ""; + $ret = "${decor}SEH_epilogue_${fname}:"; + $ret .= ":" if ($masm); $ret .= "\n"; + if ($gas) { + $ret .= " mov ".(0-$off)."(%$reg),%rdi\n"; + $ret .= " mov ".(8-$off)."(%$reg),%rsi\n"; + } else { + $ret .= " mov rdi,QWORD$PTR\[".(0-$off)."+$reg\]"; + $ret .= " ;WIN64 epilogue\n"; + $ret .= " mov rsi,QWORD$PTR\[".(8-$off)."+$reg\]\n"; + } + } + return $ret; + } + return; + } +} +{ package directive; # pick up directives, which start with . + sub re { + my ($class, $line) = @_; + my $self = {}; + my $ret; + my $dir; + + # chain-call to cfi_directive + $ret = cfi_directive->re($line) and return $ret; + + if ($$line =~ /^\s*(\.\w+)/) { + bless $self,$class; + $dir = $1; + $ret = $self; + undef $self->{value}; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + + SWITCH: for ($dir) { + /\.global|\.globl|\.extern/ + && do { $globals{$$line} = $prefix . $$line; + $$line = $globals{$$line} if ($prefix); + last; + }; + /\.type/ && do { my ($sym,$type,$narg,$unwind) = split(',',$$line); + if ($type eq "\@function") { + undef $current_function; + $current_function->{name} = $sym; + $current_function->{abi} = "svr4"; + $current_function->{narg} = $narg; + $current_function->{scope} = defined($globals{$sym})?"PUBLIC":"PRIVATE"; + $current_function->{unwind} = $unwind; + } elsif ($type eq "\@abi-omnipotent") { + undef $current_function; + $current_function->{name} = $sym; + $current_function->{scope} = defined($globals{$sym})?"PUBLIC":"PRIVATE"; + } + $$line =~ s/\@abi\-omnipotent/\@function/; + $$line =~ s/\@function.*/\@function/; + last; + }; + /\.asciz/ && do { if ($$line =~ /^"(.*)"$/) { + $dir = ".byte"; + $$line = join(",",unpack("C*",$1),0); + } + last; + }; + /\.rva|\.long|\.quad/ + && do { $$line =~ s/([_a-z][_a-z0-9]*)/$globals{$1} or $1/gei; + $$line =~ s/\.L/$decor/g; + last; + }; + } + + if ($gas) { + $self->{value} = $dir . "\t" . $$line; + + if ($dir =~ /\.extern/) { + $self->{value} = ""; # swallow extern + } elsif (!$elf && $dir =~ /\.type/) { + $self->{value} = ""; + $self->{value} = ".def\t" . ($globals{$1} or $1) . ";\t" . + (defined($globals{$1})?".scl 2;":".scl 3;") . 
+ "\t.type 32;\t.endef" + if ($win64 && $$line =~ /([^,]+),\@function/); + } elsif ($dir =~ /\.size/) { + $self->{value} = "" if (!$elf); + if ($dwarf and my $endproc = cfi_directive::endproc()) { + $self->{value} = ".cfi_$endproc\n$self->{value}"; + } elsif (!$elf && defined($current_function)) { + $self->{value} .= "${decor}SEH_end_$current_function->{name}:" + if ($win64 && $current_function->{abi} eq "svr4"); + undef $current_function; + } + } elsif (!$elf && $dir =~ /\.align/) { + $self->{value} = ".p2align\t" . (log($$line)/log(2)); + } elsif ($dir eq ".section") { + $current_segment=$$line; + if (!$elf && $current_segment eq ".init") { + if ($flavour eq "macosx") { $self->{value} = ".mod_init_func"; } + elsif ($flavour eq "mingw64") { $self->{value} = ".section\t.ctors"; } + } + } elsif ($dir =~ /\.(text|data)/) { + $current_segment=".$1"; + } elsif ($dir =~ /\.hidden/) { + if ($flavour eq "macosx") { $self->{value} = ".private_extern\t$prefix$$line"; } + elsif ($flavour eq "mingw64") { $self->{value} = ""; } + } elsif ($dir =~ /\.comm/) { + $self->{value} = "$dir\t$prefix$$line"; + $self->{value} =~ s|,([0-9]+),([0-9]+)$|",$1,".log($2)/log(2)|e if ($flavour eq "macosx"); + } + $$line = ""; + return $self; + } + + # non-gas case or nasm/masm + SWITCH: for ($dir) { + /\.text/ && do { my $v=undef; + if ($nasm) { + $v="section .text code align=64\n"; + } else { + $v="$current_segment\tENDS\n" if ($current_segment); + $current_segment = ".text\$"; + $v.="$current_segment\tSEGMENT "; + $v.=$masm>=$masmref ? "ALIGN(256)" : "PAGE"; + $v.=" 'CODE'"; + } + $self->{value} = $v; + last; + }; + /\.data/ && do { my $v=undef; + if ($nasm) { + $v="section .data data align=8\n"; + } else { + $v="$current_segment\tENDS\n" if ($current_segment); + $current_segment = "_DATA"; + $v.="$current_segment\tSEGMENT"; + } + $self->{value} = $v; + last; + }; + /\.section/ && do { my $v=undef; + $$line =~ s/([^,]*).*/$1/; + $$line = ".CRT\$XCU" if ($$line eq ".init"); + if ($nasm) { + $v="section $$line"; + if ($$line=~/\.([px])data/) { + $v.=" rdata align="; + $v.=$1 eq "p"? 4 : 8; + } elsif ($$line=~/\.CRT\$/i) { + $v.=" rdata align=8"; + } + } else { + $v="$current_segment\tENDS\n" if ($current_segment); + $v.="$$line\tSEGMENT"; + if ($$line=~/\.([px])data/) { + $v.=" READONLY"; + $v.=" ALIGN(".($1 eq "p" ? 4 : 8).")" if ($masm>=$masmref); + } elsif ($$line=~/\.CRT\$/i) { + $v.=" READONLY "; + $v.=$masm>=$masmref ? "ALIGN(8)" : "DWORD"; + } + } + $current_segment = $$line; + $self->{value} = $v; + last; + }; + /\.extern/ && do { $self->{value} = "EXTERN\t".$$line; + $self->{value} .= ":NEAR" if ($masm); + last; + }; + /\.globl|.global/ + && do { $self->{value} = $masm?"PUBLIC":"global"; + $self->{value} .= "\t".$$line; + last; + }; + /\.size/ && do { if (defined($current_function)) { + undef $self->{value}; + if ($current_function->{abi} eq "svr4") { + $self->{value}="${decor}SEH_end_$current_function->{name}:"; + $self->{value}.=":\n" if($masm); + } + $self->{value}.="$current_function->{name}\tENDP" if($masm && $current_function->{name}); + undef $current_function; + } + last; + }; + /\.align/ && do { my $max = ($masm && $masm>=$masmref) ? 
256 : 4096; + $self->{value} = "ALIGN\t".($$line>$max?$max:$$line); + last; + }; + /\.(value|long|rva|quad)/ + && do { my $sz = substr($1,0,1); + my @arr = split(/,\s*/,$$line); + my $last = pop(@arr); + my $conv = sub { my $var=shift; + $var=~s/^(0b[0-1]+)/oct($1)/eig; + $var=~s/^0x([0-9a-f]+)/0$1h/ig if ($masm); + if ($sz eq "D" && ($current_segment=~/.[px]data/ || $dir eq ".rva")) + { $var=~s/^([_a-z\$\@][_a-z0-9\$\@]*)/$nasm?"$1 wrt ..imagebase":"imagerel $1"/egi; } + $var; + }; + + $sz =~ tr/bvlrq/BWDDQ/; + $self->{value} = "\tD$sz\t"; + for (@arr) { $self->{value} .= &$conv($_).","; } + $self->{value} .= &$conv($last); + last; + }; + /\.byte/ && do { my @str=split(/,\s*/,$$line); + map(s/(0b[0-1]+)/oct($1)/eig,@str); + map(s/0x([0-9a-f]+)/0$1h/ig,@str) if ($masm); + while ($#str>15) { + $self->{value}.="DB\t" + .join(",",@str[0..15])."\n"; + foreach (0..15) { shift @str; } + } + $self->{value}.="DB\t" + .join(",",@str) if (@str); + last; + }; + /\.comm/ && do { my @str=split(/,\s*/,$$line); + my $v=undef; + if ($nasm) { + $v.="common $prefix@str[0] @str[1]"; + } else { + $v="$current_segment\tENDS\n" if ($current_segment); + $current_segment = "_DATA"; + $v.="$current_segment\tSEGMENT\n"; + $v.="COMM @str[0]:DWORD:".@str[1]/4; + } + $self->{value} = $v; + last; + }; + } + $$line = ""; + } + + $ret; + } + sub out { + my $self = shift; + $self->{value}; + } +} + +# Upon initial x86_64 introduction SSE>2 extensions were not introduced +# yet. In order not to be bothered by tracing exact assembler versions, +# but at the same time to provide a bare security minimum of AES-NI, we +# hard-code some instructions. Extensions past AES-NI on the other hand +# are traced by examining assembler version in individual perlasm +# modules... + +my %regrm = ( "%eax"=>0, "%ecx"=>1, "%edx"=>2, "%ebx"=>3, + "%esp"=>4, "%ebp"=>5, "%esi"=>6, "%edi"=>7 ); + +sub rex { + my $opcode=shift; + my ($dst,$src,$rex)=@_; + + $rex|=0x04 if($dst>=8); + $rex|=0x01 if($src>=8); + push @$opcode,($rex|0x40) if ($rex); +} + +my $movq = sub { # elderly gas can't handle inter-register movq + my $arg = shift; + my @opcode=(0x66); + if ($arg =~ /%xmm([0-9]+),\s*%r(\w+)/) { + my ($src,$dst)=($1,$2); + if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } + rex(\@opcode,$src,$dst,0x8); + push @opcode,0x0f,0x7e; + push @opcode,0xc0|(($src&7)<<3)|($dst&7); # ModR/M + @opcode; + } elsif ($arg =~ /%r(\w+),\s*%xmm([0-9]+)/) { + my ($src,$dst)=($2,$1); + if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } + rex(\@opcode,$src,$dst,0x8); + push @opcode,0x0f,0x6e; + push @opcode,0xc0|(($src&7)<<3)|($dst&7); # ModR/M + @opcode; + } else { + (); + } +}; + +my $pextrd = sub { + if (shift =~ /\$([0-9]+),\s*%xmm([0-9]+),\s*(%\w+)/) { + my @opcode=(0x66); + my $imm=$1; + my $src=$2; + my $dst=$3; + if ($dst =~ /%r([0-9]+)d/) { $dst = $1; } + elsif ($dst =~ /%e/) { $dst = $regrm{$dst}; } + rex(\@opcode,$src,$dst); + push @opcode,0x0f,0x3a,0x16; + push @opcode,0xc0|(($src&7)<<3)|($dst&7); # ModR/M + push @opcode,$imm; + @opcode; + } else { + (); + } +}; + +my $pinsrd = sub { + if (shift =~ /\$([0-9]+),\s*(%\w+),\s*%xmm([0-9]+)/) { + my @opcode=(0x66); + my $imm=$1; + my $src=$2; + my $dst=$3; + if ($src =~ /%r([0-9]+)/) { $src = $1; } + elsif ($src =~ /%e/) { $src = $regrm{$src}; } + rex(\@opcode,$dst,$src); + push @opcode,0x0f,0x3a,0x22; + push @opcode,0xc0|(($dst&7)<<3)|($src&7); # ModR/M + push @opcode,$imm; + @opcode; + } else { + (); + } +}; + +my $pshufb = sub { + if (shift =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x66); + 
rex(\@opcode,$2,$1); + push @opcode,0x0f,0x38,0x00; + push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M + @opcode; + } else { + (); + } +}; + +my $palignr = sub { + if (shift =~ /\$([0-9]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x66); + rex(\@opcode,$3,$2); + push @opcode,0x0f,0x3a,0x0f; + push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M + push @opcode,$1; + @opcode; + } else { + (); + } +}; + +my $pclmulqdq = sub { + if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x66); + rex(\@opcode,$3,$2); + push @opcode,0x0f,0x3a,0x44; + push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M + my $c=$1; + push @opcode,$c=~/^0/?oct($c):$c; + @opcode; + } else { + (); + } +}; + +my $rdrand = sub { + if (shift =~ /%[er](\w+)/) { + my @opcode=(); + my $dst=$1; + if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } + rex(\@opcode,0,$dst,8); + push @opcode,0x0f,0xc7,0xf0|($dst&7); + @opcode; + } else { + (); + } +}; + +my $rdseed = sub { + if (shift =~ /%[er](\w+)/) { + my @opcode=(); + my $dst=$1; + if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } + rex(\@opcode,0,$dst,8); + push @opcode,0x0f,0xc7,0xf8|($dst&7); + @opcode; + } else { + (); + } +}; + +# Not all AVX-capable assemblers recognize AMD XOP extension. Since we +# are using only two instructions hand-code them in order to be excused +# from chasing assembler versions... + +sub rxb { + my $opcode=shift; + my ($dst,$src1,$src2,$rxb)=@_; + + $rxb|=0x7<<5; + $rxb&=~(0x04<<5) if($dst>=8); + $rxb&=~(0x01<<5) if($src1>=8); + $rxb&=~(0x02<<5) if($src2>=8); + push @$opcode,$rxb; +} + +my $vprotd = sub { + if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x8f); + rxb(\@opcode,$3,$2,-1,0x08); + push @opcode,0x78,0xc2; + push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M + my $c=$1; + push @opcode,$c=~/^0/?oct($c):$c; + @opcode; + } else { + (); + } +}; + +my $vprotq = sub { + if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x8f); + rxb(\@opcode,$3,$2,-1,0x08); + push @opcode,0x78,0xc3; + push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M + my $c=$1; + push @opcode,$c=~/^0/?oct($c):$c; + @opcode; + } else { + (); + } +}; + +# Intel Control-flow Enforcement Technology extension. All functions and +# indirect branch targets will have to start with this instruction... +# However, it should not be used in functions' prologues explicitly, as +# it's added automatically [and in the right spot]. Which leaves only +# non-function indirect branch targets, such as in a case-like dispatch +# table, as application area. + +my $endbr64 = sub { + (0xf3,0x0f,0x1e,0xfa); +}; + +######################################################################## + +if ($nasm) { + print <<___; +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +___ +} elsif ($masm) { + print <<___; +OPTION DOTNAME +___ +} + +sub process { + my $line = shift; + + $line =~ s|\R$||; # Better chomp + + $line =~ s|[#!].*$||; # get rid of asm-style comments... + $line =~ s|/\*.*\*/||; # ... and C-style comments... + $line =~ s|^\s+||; # ... and skip white spaces in beginning + $line =~ s|\s+$||; # ... 
and at the end + + if (my $label=label->re(\$line)) { print $label->out(); } + + if (my $directive=directive->re(\$line)) { + printf "%s",$directive->out(); + } elsif (my $opcode=opcode->re(\$line)) { + my $asm = eval("\$".$opcode->mnemonic()); + + if ((ref($asm) eq 'CODE') && scalar(my @bytes=&$asm($line))) { + print $gas?".byte\t":"DB\t",join(',',@bytes),"\n"; + next; + } + + my @args; + ARGUMENT: while (1) { + my $arg; + + ($arg=register->re(\$line, $opcode))|| + ($arg=const->re(\$line)) || + ($arg=ea->re(\$line, $opcode)) || + ($arg=expr->re(\$line, $opcode)) || + last ARGUMENT; + + push @args,$arg; + + last ARGUMENT if ($line !~ /^,/); + + $line =~ s/^,\s*//; + } # ARGUMENT: + + if ($#args>=0) { + my $insn; + my $sz=$opcode->size(); + + if ($gas) { + $insn = $opcode->out($#args>=1?$args[$#args]->size():$sz); + @args = map($_->out($sz),@args); + printf "\t%s\t%s",$insn,join(",",@args); + } else { + $insn = $opcode->out(); + foreach (@args) { + my $arg = $_->out(); + # $insn.=$sz compensates for movq, pinsrw, ... + if ($arg =~ /^xmm[0-9]+$/) { $insn.=$sz; $sz="x" if(!$sz); last; } + if ($arg =~ /^ymm[0-9]+$/) { $insn.=$sz; $sz="y" if(!$sz); last; } + if ($arg =~ /^zmm[0-9]+$/) { $insn.=$sz; $sz="z" if(!$sz); last; } + if ($arg =~ /^mm[0-9]+$/) { $insn.=$sz; $sz="q" if(!$sz); last; } + } + @args = reverse(@args); + undef $sz if ($nasm && $opcode->mnemonic() eq "lea"); + printf "\t%s\t%s",$insn,join(",",map($_->out($sz),@args)); + } + } else { + printf "\t%s",$opcode->out(); + } + } + + print $line,"\n"; +} + +while(<>) { process($_); } + +map { process($_) } @pdata_seg if ($win64); +map { process($_) } @xdata_seg if ($win64); + +# platform-specific epilogue +if ($masm) { + print "\n$current_segment\tENDS\n" if ($current_segment); + print "END\n"; +} elsif ($elf) { + # -fcf-protection segment, snatched from compiler -S output + my $align = ($flavour =~ /elf32/) ? 4 : 8; + print <<___; + +.section .note.GNU-stack,"",\@progbits +.section .note.gnu.property,"a",\@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align $align +2: +___ +} + +close STDOUT; + + ################################################# +# Cross-reference x86_64 ABI "card" +# +# Unix Win64 +# %rax * * +# %rbx - - +# %rcx #4 #1 +# %rdx #3 #2 +# %rsi #2 - +# %rdi #1 - +# %rbp - - +# %rsp - - +# %r8 #5 #3 +# %r9 #6 #4 +# %r10 * * +# %r11 * * +# %r12 - - +# %r13 - - +# %r14 - - +# %r15 - - +# +# (*) volatile register +# (-) preserved by callee +# (#) Nth argument, volatile +# +# In Unix terms top of stack is argument transfer area for arguments +# which could not be accommodated in registers. Or in other words 7th +# [integer] argument resides at 8(%rsp) upon function entry point. +# 128 bytes above %rsp constitute a "red zone" which is not touched +# by signal handlers and can be used as temporal storage without +# allocating a frame. +# +# In Win64 terms N*8 bytes on top of stack is argument transfer area, +# which belongs to/can be overwritten by callee. N is the number of +# arguments passed to callee, *but* not less than 4! This means that +# upon function entry point 5th argument resides at 40(%rsp), as well +# as that 32 bytes from 8(%rsp) can always be used as temporal +# storage [without allocating a frame]. One can actually argue that +# one can assume a "red zone" above stack pointer under Win64 as well. +# Point is that at apparently no occasion Windows kernel would alter +# the area above user stack pointer in true asynchronous manner... 
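+#
+# To illustrate the card, consider a hypothetical C function with six
+# integer arguments (purely illustrative, it is not part of this module).
+# Upon entry its arguments would be found in:
+#
+# long sum6(long a, long b, long c, long d, long e, long f)
+# {   /* Unix:  a=%rdi, b=%rsi, c=%rdx, d=%rcx, e=%r8,      f=%r9      */
+#     /* Win64: a=%rcx, b=%rdx, c=%r8,  d=%r9,  e=40(%rsp), f=48(%rsp) */
+#     return a + b + c + d + e + f;
+# }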
+# +# All the above means that if assembler programmer adheres to Unix +# register and stack layout, but disregards the "red zone" existence, +# it's possible to use following prologue and epilogue to "gear" from +# Unix to Win64 ABI in leaf functions with not more than 6 arguments. +# +# omnipotent_function: +# ifdef WIN64 +# movq %rdi,8(%rsp) +# movq %rsi,16(%rsp) +# movq %rcx,%rdi ; if 1st argument is actually present +# movq %rdx,%rsi ; if 2nd argument is actually ... +# movq %r8,%rdx ; if 3rd argument is ... +# movq %r9,%rcx ; if 4th argument ... +# movq 40(%rsp),%r8 ; if 5th ... +# movq 48(%rsp),%r9 ; if 6th ... +# endif +# ... +# ifdef WIN64 +# movq 8(%rsp),%rdi +# movq 16(%rsp),%rsi +# endif +# ret +# + ################################################# +# Win64 SEH, Structured Exception Handling. +# +# Unlike on Unix systems(*) lack of Win64 stack unwinding information +# has undesired side-effect at run-time: if an exception is raised in +# assembler subroutine such as those in question (basically we're +# referring to segmentation violations caused by malformed input +# parameters), the application is briskly terminated without invoking +# any exception handlers, most notably without generating memory dump +# or any user notification whatsoever. This poses a problem. It's +# possible to address it by registering custom language-specific +# handler that would restore processor context to the state at +# subroutine entry point and return "exception is not handled, keep +# unwinding" code. Writing such handler can be a challenge... But it's +# doable, though requires certain coding convention. Consider following +# snippet: +# +# .type function,@function +# function: +# movq %rsp,%rax # copy rsp to volatile register +# pushq %r15 # save non-volatile registers +# pushq %rbx +# pushq %rbp +# movq %rsp,%r11 +# subq %rdi,%r11 # prepare [variable] stack frame +# andq $-64,%r11 +# movq %rax,0(%r11) # check for exceptions +# movq %r11,%rsp # allocate [variable] stack frame +# movq %rax,0(%rsp) # save original rsp value +# magic_point: +# ... +# movq 0(%rsp),%rcx # pull original rsp value +# movq -24(%rcx),%rbp # restore non-volatile registers +# movq -16(%rcx),%rbx +# movq -8(%rcx),%r15 +# movq %rcx,%rsp # restore original rsp +# magic_epilogue: +# ret +# .size function,.-function +# +# The key is that up to magic_point copy of original rsp value remains +# in chosen volatile register and no non-volatile register, except for +# rsp, is modified. While past magic_point rsp remains constant till +# the very end of the function. In this case custom language-specific +# exception handler would look like this: +# +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +# { ULONG64 *rsp = (ULONG64 *)context->Rax; +# ULONG64 rip = context->Rip; +# +# if (rip >= magic_point) +# { rsp = (ULONG64 *)context->Rsp; +# if (rip < magic_epilogue) +# { rsp = (ULONG64 *)rsp[0]; +# context->Rbp = rsp[-3]; +# context->Rbx = rsp[-2]; +# context->R15 = rsp[-1]; +# } +# } +# context->Rsp = (ULONG64)rsp; +# context->Rdi = rsp[1]; +# context->Rsi = rsp[2]; +# +# memcpy (disp->ContextRecord,context,sizeof(CONTEXT)); +# RtlVirtualUnwind(UNW_FLAG_NHANDLER,disp->ImageBase, +# dips->ControlPc,disp->FunctionEntry,disp->ContextRecord, +# &disp->HandlerData,&disp->EstablisherFrame,NULL); +# return ExceptionContinueSearch; +# } +# +# It's appropriate to implement this handler in assembler, directly in +# function's module. 
In order to do that one has to know members' +# offsets in CONTEXT and DISPATCHER_CONTEXT structures and some constant +# values. Here they are: +# +# CONTEXT.Rax 120 +# CONTEXT.Rcx 128 +# CONTEXT.Rdx 136 +# CONTEXT.Rbx 144 +# CONTEXT.Rsp 152 +# CONTEXT.Rbp 160 +# CONTEXT.Rsi 168 +# CONTEXT.Rdi 176 +# CONTEXT.R8 184 +# CONTEXT.R9 192 +# CONTEXT.R10 200 +# CONTEXT.R11 208 +# CONTEXT.R12 216 +# CONTEXT.R13 224 +# CONTEXT.R14 232 +# CONTEXT.R15 240 +# CONTEXT.Rip 248 +# CONTEXT.Xmm6 512 +# sizeof(CONTEXT) 1232 +# DISPATCHER_CONTEXT.ControlPc 0 +# DISPATCHER_CONTEXT.ImageBase 8 +# DISPATCHER_CONTEXT.FunctionEntry 16 +# DISPATCHER_CONTEXT.EstablisherFrame 24 +# DISPATCHER_CONTEXT.TargetIp 32 +# DISPATCHER_CONTEXT.ContextRecord 40 +# DISPATCHER_CONTEXT.LanguageHandler 48 +# DISPATCHER_CONTEXT.HandlerData 56 +# UNW_FLAG_NHANDLER 0 +# ExceptionContinueSearch 1 +# +# In order to tie the handler to the function one has to compose +# couple of structures: one for .xdata segment and one for .pdata. +# +# UNWIND_INFO structure for .xdata segment would be +# +# function_unwind_info: +# .byte 9,0,0,0 +# .rva handler +# +# This structure designates exception handler for a function with +# zero-length prologue, no stack frame or frame register. +# +# To facilitate composing of .pdata structures, auto-generated "gear" +# prologue copies rsp value to rax and denotes next instruction with +# .LSEH_begin_{function_name} label. This essentially defines the SEH +# styling rule mentioned in the beginning. Position of this label is +# chosen in such manner that possible exceptions raised in the "gear" +# prologue would be accounted to caller and unwound from latter's frame. +# End of function is marked with respective .LSEH_end_{function_name} +# label. To summarize, .pdata segment would contain +# +# .rva .LSEH_begin_function +# .rva .LSEH_end_function +# .rva function_unwind_info +# +# Reference to function_unwind_info from .xdata segment is the anchor. +# In case you wonder why references are 32-bit .rvas and not 64-bit +# .quads. References put into these two segments are required to be +# *relative* to the base address of the current binary module, a.k.a. +# image base. No Win64 module, be it .exe or .dll, can be larger than +# 2GB and thus such relative references can be and are accommodated in +# 32 bits. +# +# Having reviewed the example function code, one can argue that "movq +# %rsp,%rax" above is redundant. It is not! Keep in mind that on Unix +# rax would contain an undefined value. If this "offends" you, use +# another register and refrain from modifying rax till magic_point is +# reached, i.e. as if it was a non-volatile register. If more registers +# are required prior [variable] frame setup is completed, note that +# nobody says that you can have only one "magic point." You can +# "liberate" non-volatile registers by denoting last stack off-load +# instruction and reflecting it in finer grade unwind logic in handler. +# After all, isn't it why it's called *language-specific* handler... +# +# SE handlers are also involved in unwinding stack when executable is +# profiled or debugged. Profiling implies additional limitations that +# are too subtle to discuss here. For now it's sufficient to say that +# in order to simplify handlers one should either a) offload original +# %rsp to stack (like discussed above); or b) if you have a register to +# spare for frame pointer, choose volatile one. +# +# (*) Note that we're talking about run-time, not debug-time. 
Lack of +# unwind information makes debugging hard on both Windows and +# Unix. "Unlike" refers to the fact that on Unix signal handler +# will always be invoked, core dumped and appropriate exit code +# returned to parent (for user notification). +# +######################################################################## +# As of May 2020 an alternative approach that works with both exceptions +# and debugging/profiling was implemented by re-purposing DWARF .cfi +# annotations even for Win64 unwind tables' generation. Unfortunately, +# but not really unexpectedly, it imposes additional limitations on +# coding style. Probably most significant limitation is that frame +# pointer has to be at 16*n distance from stack pointer at the exit +# from prologue. But first things first. There are two additional +# synthetic .cfi directives, .cfi_end_prologue and .cfi_epilogue, +# that need to be added to all functions marked with additional .type +# tag (see example below). There are "do's and don'ts" for prologue +# and epilogue. It shouldn't come as surprise that in prologue one may +# not modify non-volatile registers, but one may not modify %r11 either. +# This is because it's used as temporary frame pointer(*). There is one +# exception to this rule, and it's setting up frame pointer that is +# non-volatile or %r11. But it must be last instruction in the prologue. +# Constraints for epilogue, or rather on its boundary, depend on whether +# the frame is fixed- or variable-length. In fixed-frame subroutine +# stack pointer has to be restored in the last instruction prior the +# .cfi_epilogue directive. If it's variable-frame subroutine, and a +# non-volatile register was used as frame pointer, then last instruction +# prior the directive has to restore its original value. This means that +# final stack pointer adjustment would have to be pushed past the +# directive. Normally this would render the epilogue non-unwindable, so +# special care has to be taken. To resolve the dilemma, copy frame +# pointer to a volatile register in advance. To give an example: +# +# .type rbp_as_frame_pointer,\@function,3,"unwind" # mind extra tag! +# rbp_as_frame_pointer: +# .cfi_startproc +# push %rbp +# .cfi_push %rbp +# push %rbx +# .cfi_push %rbx +# mov %rsp,%rbp # last instruction in prologue +# .cfi_def_cfa_register %rbp # %rsp-%rbp has to be 16*n, e.g. 16*0 +# .cfi_end_prologue +# sub \$40,%rsp +# and \$-64,%rsp +# ... +# mov %rbp,%r11 +# .cfi_def_cfa_register %r11 # copy frame pointer to volatile %r11 +# mov 0(%rbp),%rbx +# mov 8(%rbp),%rbp # last instruction prior epilogue +# .cfi_epilogue # may not change %r11 in epilogue +# lea 16(%r11),%rsp +# ret +# .cfi_endproc +# .size rbp_as_frame_pointer,.-rbp_as_frame_pointer +# +# To give an example of fixed-frame subroutine for reference: +# +# .type fixed_frame,\@function,3,"unwind" # mind extra tag! +# fixed_frame: +# .cfi_startproc +# push %rbp +# .cfi_push %rbp +# push %rbx +# .cfi_push %rbx +# sub \$40,%rsp +# .cfi_adjust_cfa_offset 40 +# .cfi_end_prologue +# ... +# mov 40(%rsp),%rbx +# mov 48(%rsp),%rbp +# lea 56(%rsp),%rsp +# .cfi_adjust_cfa_offset -56 +# .cfi_epilogue +# ret +# .cfi_endproc +# .size fixed_frame,.-fixed_frame +# +# As for epilogue itself, one can only work on non-volatile registers. +# "Non-volatile" in "Windows" sense, i.e. minus %rdi and %rsi. +# +# On a final note, mixing old-style and modernized subroutines in the +# same file takes some trickery. Ones of the new kind have to appear +# after old-style ones. 
This has everything to do with the fact that +# entries in the .pdata segment have to appear in strictly same order +# as corresponding subroutines, and auto-generated RUNTIME_FUNCTION +# structures get mechanically appended to whatever existing .pdata. +# +# (*) Just in case, why %r11 and not %rax. This has everything to do +# with the way UNWIND_INFO is, one just can't designate %rax as +# frame pointer. diff --git a/src/client_min_pk.c b/src/client_min_pk.c new file mode 100644 index 00000000..3f04fb65 --- /dev/null +++ b/src/client_min_pk.c @@ -0,0 +1,16 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "keygen.c" +#include "e2.c" +#include "exp2.c" +#include "hash_to_field.c" +#include "map_to_g2.c" +#include "e1.c" +#include "exp.c" +#include "consts.c" +#include "vect.c" +#include "exports.c" diff --git a/src/client_min_sig.c b/src/client_min_sig.c new file mode 100644 index 00000000..ab752ce1 --- /dev/null +++ b/src/client_min_sig.c @@ -0,0 +1,16 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "keygen.c" +#include "e1.c" +#include "exp.c" +#include "hash_to_field.c" +#include "map_to_g1.c" +#include "e2.c" +#include "exp2.c" +#include "consts.c" +#include "vect.c" +#include "exports.c" diff --git a/src/consts.c b/src/consts.c new file mode 100644 index 00000000..021c878a --- /dev/null +++ b/src/consts.c @@ -0,0 +1,36 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "consts.h" + +/* z = -0xd201000000010000 */ +const vec384 BLS12_381_P = { /* (z-1)^2 * (z^4 - z^2 + 1)/3 + z */ + TO_LIMB_T(0xb9feffffffffaaab), TO_LIMB_T(0x1eabfffeb153ffff), + TO_LIMB_T(0x6730d2a0f6b0f624), TO_LIMB_T(0x64774b84f38512bf), + TO_LIMB_T(0x4b1ba7b6434bacd7), TO_LIMB_T(0x1a0111ea397fe69a) +}; +const limb_t BLS12_381_p0 = (limb_t)0x89f3fffcfffcfffd; /* -1/P */ + +const radix384 BLS12_381_Rx = { /* (1<<384)%P, "radix", one-in-Montgomery */ + { { ONE_MONT_P }, + { 0 } } +}; + +const vec384 BLS12_381_RR = { /* (1<<768)%P, "radix"^2, to-Montgomery */ + TO_LIMB_T(0xf4df1f341c341746), TO_LIMB_T(0x0a76e6a609d104f1), + TO_LIMB_T(0x8de5476c4c95b6d5), TO_LIMB_T(0x67eb88a9939d83c0), + TO_LIMB_T(0x9a793e85b519952d), TO_LIMB_T(0x11988fe592cae3aa) +}; + +const vec256 BLS12_381_r = { /* z^4 - z^2 + 1, group order */ + TO_LIMB_T(0xffffffff00000001), TO_LIMB_T(0x53bda402fffe5bfe), + TO_LIMB_T(0x3339d80809a1d805), TO_LIMB_T(0x73eda753299d7d48) +}; + +const vec256 BLS12_381_rRR = { /* (1<<512)%r, "radix"^2, to-Montgomery */ + TO_LIMB_T(0xc999e990f3f29c6d), TO_LIMB_T(0x2b6cedcb87925c23), + TO_LIMB_T(0x05d314967254398f), TO_LIMB_T(0x0748d9d99f59ff11) +}; diff --git a/src/consts.h b/src/consts.h new file mode 100644 index 00000000..1ae20f54 --- /dev/null +++ b/src/consts.h @@ -0,0 +1,28 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLS12_381_ASM_CONST_H__ +#define __BLS12_381_ASM_CONST_H__ +#include "vect.h" + +extern const vec384 BLS12_381_P; +extern const limb_t BLS12_381_p0; +static const limb_t p0 = (limb_t)0x89f3fffcfffcfffd; /* -1/P */ +typedef union { vec384x p2; vec384 p; } radix384; +extern const radix384 BLS12_381_Rx; /* (1<<384)%P, "radix", one-in-Montgomery */ +extern const vec384 BLS12_381_RR; /* (1<<768)%P, "radix"^2, to-Montgomery */ + +#define ONE_MONT_P TO_LIMB_T(0x760900000002fffd), \ + TO_LIMB_T(0xebf4000bc40c0002), \ + TO_LIMB_T(0x5f48985753c758ba), \ + TO_LIMB_T(0x77ce585370525745), \ + TO_LIMB_T(0x5c071a97a256ec6d), \ + TO_LIMB_T(0x15f65ec3fa80e493) + +extern const vec256 BLS12_381_r; /* order */ +static const limb_t r0 = (limb_t)0xfffffffeffffffff; /* -1/r */ +extern const vec256 BLS12_381_rRR; /* (1<<512)%r, "radix"^2, to-Montgomery */ + +#endif diff --git a/src/e1.c b/src/e1.c new file mode 100644 index 00000000..a8f5a6a0 --- /dev/null +++ b/src/e1.c @@ -0,0 +1,390 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "point.h" +#include "fields.h" +#include "errors.h" + +/* + * y^2 = x^3 + B + */ +static const vec384 B_E1 = { /* (4 << 384) % P */ + TO_LIMB_T(0xaa270000000cfff3), TO_LIMB_T(0x53cc0032fc34000a), + TO_LIMB_T(0x478fe97a6b0a807f), TO_LIMB_T(0xb1d37ebee6ba24d7), + TO_LIMB_T(0x8ec9733bbf78ab2f), TO_LIMB_T(0x09d645513d83de7e) +}; + +const POINTonE1 BLS12_381_G1 = { /* generator point [in Montgomery] */ + /* (0x17f1d3a73197d7942695638c4fa9ac0fc3688c4f9774b905 + * a14e3a3f171bac586c55e83ff97a1aeffb3af00adb22c6bb << 384) % P */ + { TO_LIMB_T(0x5cb38790fd530c16), TO_LIMB_T(0x7817fc679976fff5), + TO_LIMB_T(0x154f95c7143ba1c1), TO_LIMB_T(0xf0ae6acdf3d0e747), + TO_LIMB_T(0xedce6ecc21dbf440), TO_LIMB_T(0x120177419e0bfb75) }, + /* (0x08b3f481e3aaa0f1a09e30ed741d8ae4fcf5e095d5d00af6 + * 00db18cb2c04b3edd03cc744a2888ae40caa232946c5e7e1 << 384) % P */ + { TO_LIMB_T(0xbaac93d50ce72271), TO_LIMB_T(0x8c22631a7918fd8e), + TO_LIMB_T(0xdd595f13570725ce), TO_LIMB_T(0x51ac582950405194), + TO_LIMB_T(0x0e1c8c3fad0059c0), TO_LIMB_T(0x0bbc3efc5008a26a) }, + { ONE_MONT_P } +}; + +const POINTonE1 BLS12_381_NEG_G1 = { /* negative generator [in Montgomery] */ + /* (0x17f1d3a73197d7942695638c4fa9ac0fc3688c4f9774b905 + * a14e3a3f171bac586c55e83ff97a1aeffb3af00adb22c6bb << 384) % P */ + { TO_LIMB_T(0x5cb38790fd530c16), TO_LIMB_T(0x7817fc679976fff5), + TO_LIMB_T(0x154f95c7143ba1c1), TO_LIMB_T(0xf0ae6acdf3d0e747), + TO_LIMB_T(0xedce6ecc21dbf440), TO_LIMB_T(0x120177419e0bfb75) }, + /* (0x114d1d6855d545a8aa7d76c8cf2e21f267816aef1db507c9 + * 6655b9d5caac42364e6f38ba0ecb751bad54dcd6b939c2ca << 384) % P */ + { TO_LIMB_T(0xff526c2af318883a), TO_LIMB_T(0x92899ce4383b0270), + TO_LIMB_T(0x89d7738d9fa9d055), TO_LIMB_T(0x12caf35ba344c12a), + TO_LIMB_T(0x3cff1b76964b5317), TO_LIMB_T(0x0e44d2ede9774430) }, + { ONE_MONT_P } +}; + +#if 1 +void mul_by_b_onE1(vec384 out, const vec384 in); +void mul_by_4b_onE1(vec384 out, const vec384 in); +#else +static inline void mul_by_b_onE1(vec384 out, const vec384 in) +{ lshift_mod_384(out, in, 2, BLS12_381_P); } + +static inline void mul_by_4b_onE1(vec384 out, const vec384 in) +{ lshift_mod_384(out, in, 4, BLS12_381_P); } +#endif + +static void POINTonE1_cneg(POINTonE1 *p, limb_t cbit) +{ cneg_fp(p->Y, p->Y, cbit); } + +static void POINTonE1_from_Jacobian(POINTonE1 *out, const POINTonE1 *in) +{ + vec384 Z, ZZ; + limb_t 
inf = vec_is_zero(in->Z, sizeof(in->Z)); + + reciprocal_fp(Z, in->Z); /* 1/Z */ + + sqr_fp(ZZ, Z); + mul_fp(out->X, in->X, ZZ); /* X = X/Z^2 */ + + mul_fp(ZZ, ZZ, Z); + mul_fp(out->Y, in->Y, ZZ); /* Y = Y/Z^3 */ + + vec_select(out->Z, in->Z, BLS12_381_G1.Z, + sizeof(BLS12_381_G1.Z), inf); /* Z = inf ? 0 : 1 */ +} + +static void POINTonE1_to_affine(POINTonE1_affine *out, const POINTonE1 *in) +{ + POINTonE1 p; + + if (!vec_is_equal(in->Z, BLS12_381_Rx.p, sizeof(in->Z))) { + POINTonE1_from_Jacobian(&p, in); + in = &p; + } + vec_copy(out, in, sizeof(*out)); +} + +static limb_t POINTonE1_affine_on_curve(const POINTonE1_affine *p) +{ + vec384 XXX, YY; + + sqr_fp(XXX, p->X); + mul_fp(XXX, XXX, p->X); /* X^3 */ + add_fp(XXX, XXX, B_E1); /* X^3 + B */ + + sqr_fp(YY, p->Y); /* Y^2 */ + + return vec_is_equal(XXX, YY, sizeof(XXX)); +} + +static limb_t POINTonE1_on_curve(const POINTonE1 *p) +{ + vec384 XXX, YY, BZ6; + limb_t inf = vec_is_zero(p->Z, sizeof(p->Z)); + + sqr_fp(BZ6, p->Z); + mul_fp(BZ6, BZ6, p->Z); + sqr_fp(BZ6, BZ6); /* Z^6 */ + mul_by_b_onE1(BZ6, BZ6); /* B*Z^6 */ + + sqr_fp(XXX, p->X); + mul_fp(XXX, XXX, p->X); /* X^3 */ + add_fp(XXX, XXX, BZ6); /* X^3 + B*Z^6 */ + + sqr_fp(YY, p->Y); /* Y^2 */ + + return vec_is_equal(XXX, YY, sizeof(XXX)) | inf; +} + +static limb_t POINTonE1_affine_Serialize_BE(unsigned char out[96], + const POINTonE1_affine *in) +{ + vec384 temp; + + from_fp(temp, in->X); + be_bytes_from_limbs(out, temp, sizeof(temp)); + + from_fp(temp, in->Y); + be_bytes_from_limbs(out + 48, temp, sizeof(temp)); + + return sgn0_pty_mod_384(temp, BLS12_381_P); +} + +void blst_p1_affine_serialize(unsigned char out[96], + const POINTonE1_affine *in) +{ + if (vec_is_zero(in->X, 2*sizeof(in->X))) { + vec_zero(out, 96); + out[0] = 0x40; /* infinitiy bit */ + } + + (void)POINTonE1_affine_Serialize_BE(out, in); +} + +static limb_t POINTonE1_Serialize_BE(unsigned char out[96], + const POINTonE1 *in) +{ + POINTonE1 p; + + if (!vec_is_equal(in->Z, BLS12_381_Rx.p, sizeof(in->Z))) { + POINTonE1_from_Jacobian(&p, in); + in = &p; + } + + return POINTonE1_affine_Serialize_BE(out, (const POINTonE1_affine *)in); +} + +static void POINTonE1_Serialize(unsigned char out[96], const POINTonE1 *in) +{ + limb_t inf = vec_is_zero(in->Z, sizeof(in->Z)); + + if (inf) { + vec_zero(out, 96); + out[0] = 0x40; /* infinitiy bit */ + } else { + (void)POINTonE1_Serialize_BE(out, in); + } +} + +void blst_p1_serialize(unsigned char out[96], const POINTonE1 *in) +{ POINTonE1_Serialize(out, in); } + +static limb_t POINTonE1_affine_Compress_BE(unsigned char out[48], + const POINTonE1_affine *in) +{ + vec384 temp; + + from_fp(temp, in->X); + be_bytes_from_limbs(out, temp, sizeof(temp)); + + return sgn0_pty_mont_384(in->Y, BLS12_381_P, p0); +} + +void blst_p1_affine_compress(unsigned char out[48], const POINTonE1_affine *in) +{ + if (vec_is_zero(in->X, 2*sizeof(in->X))) { + vec_zero(out, 48); + out[0] = 0xc0; /* compressed and infinitiy bits */ + } else { + unsigned char sign = (unsigned char)POINTonE1_affine_Compress_BE(out, + in); + out[0] |= 0x80 | ((sign & 2) << 4); + } +} + +static limb_t POINTonE1_Compress_BE(unsigned char out[48], + const POINTonE1 *in) +{ + POINTonE1 p; + + if (!vec_is_equal(in->Z, BLS12_381_Rx.p, sizeof(in->Z))) { + POINTonE1_from_Jacobian(&p, in); + in = &p; + } + + return POINTonE1_affine_Compress_BE(out, (const POINTonE1_affine *)in); +} + +void blst_p1_compress(unsigned char out[48], const POINTonE1 *in) +{ + limb_t inf = vec_is_zero(in->Z, sizeof(in->Z)); + + if (inf) { + vec_zero(out, 48); + 
out[0] = 0xc0; /* compressed and infinitiy bits */ + } else { + unsigned char sign = (unsigned char)POINTonE1_Compress_BE(out, in); + out[0] |= 0x80 | ((sign & 2) << 4); + } +} + +static limb_t POINTonE1_Uncompress_BE(POINTonE1_affine *out, + const unsigned char in[48]) +{ + limbs_from_be_bytes(out->X, in, sizeof(out->X)); + /* clear top 3 bits in case caller was conveying some information there */ + out->X[sizeof(out->X)/sizeof(limb_t)-1] &= (limb_t)0xffffffffffffffff >> 3; + mul_fp(out->X, out->X, BLS12_381_RR); + + sqr_fp(out->Y, out->X); + mul_fp(out->Y, out->Y, out->X); + add_fp(out->Y, out->Y, B_E1); /* X^3 + B */ + if (!sqrt_fp(out->Y, out->Y)) + return (limb_t)0 - BLST_POINT_NOT_ON_CURVE; + + /* + * Even though (0,2) is formally a point on E1 curve it's turned to + * infinity... + */ + vec_select(out->Y, out->X, out->Y, sizeof(out->Y), + vec_is_zero(out->X, sizeof(out->X))); + + return sgn0_pty_mont_384(out->Y, BLS12_381_P, p0); +} + +static BLST_ERROR POINTonE1_Uncompress(POINTonE1_affine *out, + const unsigned char in[48]) +{ + unsigned char info = in[0] & 0xe0; + limb_t sgn0_pty; + + if ((in[0] & 0x80) == 0) /* compressed bit */ + return BLST_BAD_ENCODING; + + if (in[0] & 0x40) { /* infinity bit */ + vec_zero(out, sizeof(*out)); + return BLST_SUCCESS; + } + + sgn0_pty = POINTonE1_Uncompress_BE(out, in); + + if (sgn0_pty > 3) + return (BLST_ERROR)(0 - sgn0_pty); /* POINT_NOT_ON_CURVE */ + + sgn0_pty ^= (info >> 4) & 2; + sgn0_pty >>= 1; + cneg_fp(out->Y, out->Y, sgn0_pty); + + return BLST_SUCCESS; +} + +BLST_ERROR blst_p1_uncompress(POINTonE1_affine *out, const unsigned char in[48]) +{ return POINTonE1_Uncompress(out, in); } + +static void POINTonE1_Deserialize_BE(POINTonE1_affine *out, + const unsigned char in[96]) +{ + limbs_from_be_bytes(out->X, in, sizeof(out->X)); + /* clear top 3 bits in case caller was conveying some information there */ + out->X[sizeof(out->X)/sizeof(limb_t)-1] &= (limb_t)0xffffffffffffffff >> 3; + mul_fp(out->X, out->X, BLS12_381_RR); + + limbs_from_be_bytes(out->Y, in + 48, sizeof(out->Y)); + mul_fp(out->Y, out->Y, BLS12_381_RR); + + /* + * Even though (0,2) is formally a point on E1 curve it's turned to + * infinity... 
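+     * [vec_select below picks Y = X = 0 whenever X is zero, so such
+     * input decodes as the all-zero affine point, which the rest of
+     * the library treats as infinity]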
+ */ + vec_select(out->Y, out->X, out->Y, sizeof(out->Y), + vec_is_zero(out->X, sizeof(out->X))); +} + +BLST_ERROR blst_p1_deserialize(POINTonE1_affine *out, + const unsigned char in[96]) +{ + if (in[0] & 0x80) /* compressed bit */ + return POINTonE1_Uncompress(out, in); + + if (in[0] & 0x40) { /* infinity bit */ + vec_zero(out, sizeof(*out)); + return BLST_SUCCESS; + } + + POINTonE1_Deserialize_BE(out, in); + + if (!POINTonE1_affine_on_curve(out)) + return BLST_POINT_NOT_ON_CURVE; + + return BLST_SUCCESS; +} + +void blst_sk_to_pk_in_g1(POINTonE1 *out, const vec256 SK) +{ POINTonE1_mult_w5(out, &BLS12_381_G1, SK, 255); } + +void blst_sign_pk_in_g2(POINTonE1 *out, const POINTonE1 *msg, const vec256 SK) +{ POINTonE1_mult_w5(out, msg, SK, 255); } + +void blst_sk_to_pk2_in_g1(unsigned char out[96], POINTonE1_affine *PK, + const vec256 SK) +{ + POINTonE1 P[1]; + + POINTonE1_mult_w5(P, &BLS12_381_G1, SK, 255); + POINTonE1_from_Jacobian(P, P); + if (PK != NULL) + vec_copy(PK, P, sizeof(*PK)); + if (out != NULL) { + limb_t sgn0_pty = POINTonE1_Serialize_BE(out, P); + out[0] |= (sgn0_pty & 2) << 4; /* pre-decorate */ + out[0] |= vec_is_zero(P->Z, sizeof(P->Z)) << 6; + } +} + +void blst_sign_pk2_in_g2(unsigned char out[96], POINTonE1_affine *sig, + const POINTonE1 *hash, const vec256 SK) +{ + POINTonE1 P[1]; + + POINTonE1_mult_w5(P, hash, SK, 255); + POINTonE1_from_Jacobian(P, P); + if (sig != NULL) + vec_copy(sig, P, sizeof(*sig)); + if (out != NULL) { + limb_t sgn0_pty = POINTonE1_Serialize_BE(out, P); + out[0] |= (sgn0_pty & 2) << 4; /* pre-decorate */ + out[0] |= vec_is_zero(P->Z, sizeof(P->Z)) << 6; + } +} + +void blst_p1_cneg(POINTonE1 *a, size_t cbit) +{ POINTonE1_cneg(a, cbit); } + +void blst_p1_from_jacobian(POINTonE1 *out, const POINTonE1 *a) +{ POINTonE1_from_Jacobian(out, a); } + +void blst_p1_to_affine(POINTonE1_affine *out, const POINTonE1 *a) +{ POINTonE1_to_affine(out, a); } + +void blst_p1_from_affine(POINTonE1 *out, const POINTonE1_affine *a) +{ + vec_copy(out, a, sizeof(*a)); + vec_select(out->Z, a->X, BLS12_381_Rx.p, sizeof(out->Z), + vec_is_zero(a, sizeof(*a))); +} + +limb_t blst_p1_on_curve(const POINTonE1 *p) +{ return POINTonE1_on_curve(p); } + +limb_t blst_p1_affine_on_curve(const POINTonE1_affine *p) +{ return POINTonE1_affine_on_curve(p); } + +#include "ec_ops.h" +POINT_DADD_IMPL(POINTonE1, 384, fp) +POINT_DADD_AFFINE_IMPL_A0(POINTonE1, 384, fp, BLS12_381_Rx.p) +POINT_ADD_IMPL(POINTonE1, 384, fp) +POINT_ADD_AFFINE_IMPL(POINTonE1, 384, fp, BLS12_381_Rx.p) +POINT_DOUBLE_IMPL_A0(POINTonE1, 384, fp) + +#include "ec_mult.h" +POINT_MULT_SCALAR_W5_IMPL(POINTonE1) +POINT_AFFINE_MULT_SCALAR_IMPL(POINTonE1) + +DECLARE_PRIVATE_POINTXZ(POINTonE1, 384) +POINT_LADDER_PRE_IMPL(POINTonE1, 384, fp) +POINT_LADDER_STEP_IMPL_A0(POINTonE1, 384, fp, onE1) +POINT_LADDER_POST_IMPL_A0(POINTonE1, 384, fp, onE1) +POINT_MULT_SCALAR_LADDER_IMPL(POINTonE1) diff --git a/src/e2.c b/src/e2.c new file mode 100644 index 00000000..d70b7dac --- /dev/null +++ b/src/e2.c @@ -0,0 +1,434 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include "point.h" +#include "fields.h" + +/* + * y^2 = x^3 + B + */ +static const vec384x B_E2 = { /* 4 + 4*i */ + { TO_LIMB_T(0xaa270000000cfff3), TO_LIMB_T(0x53cc0032fc34000a), + TO_LIMB_T(0x478fe97a6b0a807f), TO_LIMB_T(0xb1d37ebee6ba24d7), + TO_LIMB_T(0x8ec9733bbf78ab2f), TO_LIMB_T(0x09d645513d83de7e) }, + { TO_LIMB_T(0xaa270000000cfff3), TO_LIMB_T(0x53cc0032fc34000a), + TO_LIMB_T(0x478fe97a6b0a807f), TO_LIMB_T(0xb1d37ebee6ba24d7), + TO_LIMB_T(0x8ec9733bbf78ab2f), TO_LIMB_T(0x09d645513d83de7e) } +}; + +const POINTonE2 BLS12_381_G2 = { /* generator point [in Montgomery] */ +{ /* (0x024aa2b2f08f0a91260805272dc51051c6e47ad4fa403b02 + b4510b647ae3d1770bac0326a805bbefd48056c8c121bdb8 << 384) % P */ + { TO_LIMB_T(0xf5f28fa202940a10), TO_LIMB_T(0xb3f5fb2687b4961a), + TO_LIMB_T(0xa1a893b53e2ae580), TO_LIMB_T(0x9894999d1a3caee9), + TO_LIMB_T(0x6f67b7631863366b), TO_LIMB_T(0x058191924350bcd7) }, + /* (0x13e02b6052719f607dacd3a088274f65596bd0d09920b61a + b5da61bbdc7f5049334cf11213945d57e5ac7d055d042b7e << 384) % P */ + { TO_LIMB_T(0xa5a9c0759e23f606), TO_LIMB_T(0xaaa0c59dbccd60c3), + TO_LIMB_T(0x3bb17e18e2867806), TO_LIMB_T(0x1b1ab6cc8541b367), + TO_LIMB_T(0xc2b6ed0ef2158547), TO_LIMB_T(0x11922a097360edf3) } +}, +{ /* (0x0ce5d527727d6e118cc9cdc6da2e351aadfd9baa8cbdd3a7 + 6d429a695160d12c923ac9cc3baca289e193548608b82801 << 384) % P */ + { TO_LIMB_T(0x4c730af860494c4a), TO_LIMB_T(0x597cfa1f5e369c5a), + TO_LIMB_T(0xe7e6856caa0a635a), TO_LIMB_T(0xbbefb5e96e0d495f), + TO_LIMB_T(0x07d3a975f0ef25a2), TO_LIMB_T(0x0083fd8e7e80dae5) }, + /* (0x0606c4a02ea734cc32acd2b02bc28b99cb3e287e85a763af + 267492ab572e99ab3f370d275cec1da1aaa9075ff05f79be << 384) % P */ + { TO_LIMB_T(0xadc0fc92df64b05d), TO_LIMB_T(0x18aa270a2b1461dc), + TO_LIMB_T(0x86adac6a3be4eba0), TO_LIMB_T(0x79495c4ec93da33a), + TO_LIMB_T(0xe7175850a43ccaed), TO_LIMB_T(0x0b2bc2a163de1bf2) }, +}, +{ { ONE_MONT_P }, { 0 } } +}; + +const POINTonE2 BLS12_381_NEG_G2 = { /* negative generator [in Montgomery] */ +{ /* (0x024aa2b2f08f0a91260805272dc51051c6e47ad4fa403b02 + b4510b647ae3d1770bac0326a805bbefd48056c8c121bdb8 << 384) % P */ + { TO_LIMB_T(0xf5f28fa202940a10), TO_LIMB_T(0xb3f5fb2687b4961a), + TO_LIMB_T(0xa1a893b53e2ae580), TO_LIMB_T(0x9894999d1a3caee9), + TO_LIMB_T(0x6f67b7631863366b), TO_LIMB_T(0x058191924350bcd7) }, + /* (0x13e02b6052719f607dacd3a088274f65596bd0d09920b61a + b5da61bbdc7f5049334cf11213945d57e5ac7d055d042b7e << 384) % P */ + { TO_LIMB_T(0xa5a9c0759e23f606), TO_LIMB_T(0xaaa0c59dbccd60c3), + TO_LIMB_T(0x3bb17e18e2867806), TO_LIMB_T(0x1b1ab6cc8541b367), + TO_LIMB_T(0xc2b6ed0ef2158547), TO_LIMB_T(0x11922a097360edf3) } +}, +{ /* (0x0d1b3cc2c7027888be51d9ef691d77bcb679afda66c73f17 + f9ee3837a55024f78c71363275a75d75d86bab79f74782aa << 384) % P */ + { TO_LIMB_T(0x6d8bf5079fb65e61), TO_LIMB_T(0xc52f05df531d63a5), + TO_LIMB_T(0x7f4a4d344ca692c9), TO_LIMB_T(0xa887959b8577c95f), + TO_LIMB_T(0x4347fe40525c8734), TO_LIMB_T(0x197d145bbaff0bb5) }, + /* (0x13fa4d4a0ad8b1ce186ed5061789213d993923066dddaf10 + 40bc3ff59f825c78df74f2d75467e25e0f55f8a00fa030ed << 384) % P */ + { TO_LIMB_T(0x0c3e036d209afa4e), TO_LIMB_T(0x0601d8f4863f9e23), + TO_LIMB_T(0xe0832636bacc0a84), TO_LIMB_T(0xeb2def362a476f84), + TO_LIMB_T(0x64044f659f0ee1e9), TO_LIMB_T(0x0ed54f48d5a1caa7) } +}, +{ { ONE_MONT_P }, { 0 } } +}; + +#if 1 +void mul_by_b_onE2(vec384x out, const vec384x in); +void mul_by_4b_onE2(vec384x out, const vec384x in); +#else +static void mul_by_b_onE2(vec384x out, const vec384x in) +{ + sub_mod_384(out[0], in[0], 
in[1], BLS12_381_P); + add_mod_384(out[1], in[0], in[1], BLS12_381_P); + lshift_mod_384(out[0], out[0], 2, BLS12_381_P); + lshift_mod_384(out[1], out[1], 2, BLS12_381_P); +} + +static void mul_by_4b_onE2(vec384x out, const vec384x in) +{ + sub_mod_384(out[0], in[0], in[1], BLS12_381_P); + add_mod_384(out[1], in[0], in[1], BLS12_381_P); + lshift_mod_384(out[0], out[0], 4, BLS12_381_P); + lshift_mod_384(out[1], out[1], 4, BLS12_381_P); +} +#endif + +static void POINTonE2_cneg(POINTonE2 *p, limb_t cbit) +{ + cneg_fp2(p->Y, p->Y, cbit); +} + +static void POINTonE2_from_Jacobian(POINTonE2 *out, const POINTonE2 *in) +{ + vec384x Z, ZZ; + limb_t inf = vec_is_zero(in->Z, sizeof(in->Z)); + + reciprocal_fp2(Z, in->Z); /* 1/Z */ + + sqr_fp2(ZZ, Z); + mul_fp2(out->X, in->X, ZZ); /* X = X/Z^2 */ + + mul_fp2(ZZ, ZZ, Z); + mul_fp2(out->Y, in->Y, ZZ); /* Y = Y/Z^3 */ + + vec_select(out->Z, in->Z, BLS12_381_G2.Z, + sizeof(BLS12_381_G2.Z), inf); /* Z = inf ? 0 : 1 */ +} + +static void POINTonE2_to_affine(POINTonE2_affine *out, const POINTonE2 *in) +{ + POINTonE2 p; + + if (!vec_is_equal(in->Z, BLS12_381_Rx.p2, sizeof(in->Z))) { + POINTonE2_from_Jacobian(&p, in); + in = &p; + } + vec_copy(out, in, sizeof(*out)); +} + +static limb_t POINTonE2_affine_on_curve(const POINTonE2_affine *p) +{ + vec384x XXX, YY; + + sqr_fp2(XXX, p->X); + mul_fp2(XXX, XXX, p->X); /* X^3 */ + add_fp2(XXX, XXX, B_E2); /* X^3 + B */ + + sqr_fp2(YY, p->Y); /* Y^2 */ + + return vec_is_equal(XXX, YY, sizeof(XXX)); +} + +static limb_t POINTonE2_on_curve(const POINTonE2 *p) +{ + vec384x XXX, YY, BZ6; + limb_t inf = vec_is_zero(p->Z, sizeof(p->Z)); + + sqr_fp2(BZ6, p->Z); + mul_fp2(BZ6, BZ6, p->Z); + sqr_fp2(XXX, BZ6); /* Z^6 */ + mul_by_b_onE2(BZ6, XXX); /* B*Z^6 */ + + sqr_fp2(XXX, p->X); + mul_fp2(XXX, XXX, p->X); /* X^3 */ + add_fp2(XXX, XXX, BZ6); /* X^3 + B*Z^6 */ + + sqr_fp2(YY, p->Y); /* Y^2 */ + + return vec_is_equal(XXX, YY, sizeof(XXX)) | inf; +} + +static limb_t POINTonE2_affine_Serialize_BE(unsigned char out[192], + const POINTonE2_affine *in) +{ + vec384x temp; + + from_fp(temp[1], in->X[1]); + be_bytes_from_limbs(out, temp[1], sizeof(temp[1])); + from_fp(temp[0], in->X[0]); + be_bytes_from_limbs(out + 48, temp[0], sizeof(temp[0])); + + from_fp(temp[1], in->Y[1]); + be_bytes_from_limbs(out + 96, temp[1], sizeof(temp[1])); + from_fp(temp[0], in->Y[0]); + be_bytes_from_limbs(out + 144, temp[0], sizeof(temp[0])); + + return sgn0_pty_mod_384x(temp, BLS12_381_P); +} + +void blst_p2_affine_serialize(unsigned char out[192], + const POINTonE2_affine *in) +{ + if (vec_is_zero(in->X, 2*sizeof(in->X))) { + vec_zero(out, 192); + out[0] = 0x40; /* infinitiy bit */ + } + + (void)POINTonE2_affine_Serialize_BE(out, in); +} + +static limb_t POINTonE2_Serialize_BE(unsigned char out[192], + const POINTonE2 *in) +{ + POINTonE2 p; + + if (!vec_is_equal(in->Z, BLS12_381_Rx.p2, sizeof(in->Z))) { + POINTonE2_from_Jacobian(&p, in); + in = &p; + } + + return POINTonE2_affine_Serialize_BE(out, (const POINTonE2_affine *)in); +} + +static void POINTonE2_Serialize(unsigned char out[192], const POINTonE2 *in) +{ + limb_t inf = vec_is_zero(in->Z, sizeof(in->Z)); + + if (inf) { + vec_zero(out, 192); + out[0] = 0x40; /* infinitiy bit */ + } else { + (void)POINTonE2_Serialize_BE(out, in); + } +} + +void blst_p2_serialize(unsigned char out[192], const POINTonE2 *in) +{ POINTonE2_Serialize(out, in); } + +static limb_t POINTonE2_affine_Compress_BE(unsigned char out[96], + const POINTonE2_affine *in) +{ + vec384 temp; + + from_fp(temp, in->X[1]); + 
be_bytes_from_limbs(out, temp, sizeof(temp)); + from_fp(temp, in->X[0]); + be_bytes_from_limbs(out + 48, temp, sizeof(temp)); + + return sgn0_pty_mont_384x(in->Y, BLS12_381_P, p0); +} + +void blst_p2_affine_compress(unsigned char out[96], const POINTonE2_affine *in) +{ + if (vec_is_zero(in->X, 2*sizeof(in->X))) { + vec_zero(out, 96); + out[0] = 0xc0; /* compressed and infinitiy bits */ + } else { + unsigned char sign = (unsigned char)POINTonE2_affine_Compress_BE(out, + in); + out[0] |= 0x80 | ((sign & 2) << 4); + } +} + +static limb_t POINTonE2_Compress_BE(unsigned char out[96], + const POINTonE2 *in) +{ + POINTonE2 p; + + if (!vec_is_equal(in->Z, BLS12_381_Rx.p, sizeof(in->Z))) { + POINTonE2_from_Jacobian(&p, in); + in = &p; + } + + return POINTonE2_affine_Compress_BE(out, (const POINTonE2_affine *)in); +} + +void blst_p2_compress(unsigned char out[96], const POINTonE2 *in) +{ + limb_t inf = vec_is_zero(in->Z, sizeof(in->Z)); + + if (inf) { + vec_zero(out, 96); + out[0] = 0xc0; /* compressed and infinitiy bits */ + } else { + unsigned char sign = (unsigned char)POINTonE2_Compress_BE(out, in); + out[0] |= 0x80 | ((sign & 2) << 4); + } +} + +static limb_t POINTonE2_Uncompress_BE(POINTonE2_affine *out, + const unsigned char in[96]) +{ + limbs_from_be_bytes(out->X[1], in, sizeof(out->X[1])); + /* clear top 3 bits in case caller was conveying some information there */ + out->X[1][sizeof(out->X[1])/sizeof(limb_t)-1] &= (limb_t)0xffffffffffffffff >> 3; + mul_fp(out->X[1], out->X[1], BLS12_381_RR); + + limbs_from_be_bytes(out->X[0], in + 48, sizeof(out->X[0])); + mul_fp(out->X[0], out->X[0], BLS12_381_RR); + + sqr_fp2(out->Y, out->X); + mul_fp2(out->Y, out->Y, out->X); + add_fp2(out->Y, out->Y, B_E2); /* X^3 + B */ + if (!sqrt_fp2(out->Y, out->Y)) + return (limb_t)0 - BLST_POINT_NOT_ON_CURVE; + + vec_select(out->Y, out->X, out->Y, sizeof(out->Y), + vec_is_zero(out->X, sizeof(out->X))); + + return sgn0_pty_mont_384x(out->Y, BLS12_381_P, p0); +} + +static BLST_ERROR POINTonE2_Uncompress(POINTonE2_affine *out, + const unsigned char in[96]) +{ + unsigned char info = in[0] & 0xe0; + limb_t sgn0_pty; + + if ((in[0] & 0x80) == 0) /* compressed bit */ + return BLST_BAD_ENCODING; + + if (in[0] & 0x40) { /* infinity bit */ + vec_zero(out, sizeof(*out)); + return BLST_SUCCESS; + } + + sgn0_pty = POINTonE2_Uncompress_BE(out, in); + + if (sgn0_pty > 3) + return (BLST_ERROR)(0 - sgn0_pty); /* POINT_NOT_ON_CURVE */ + + sgn0_pty ^= (info >> 4) & 2; + sgn0_pty >>= 1; + cneg_fp2(out->Y, out->Y, sgn0_pty); + + return BLST_SUCCESS; +} + +BLST_ERROR blst_p2_uncompress(POINTonE2_affine *out, const unsigned char in[96]) +{ return POINTonE2_Uncompress(out, in); } + +static void POINTonE2_Deserialize_BE(POINTonE2_affine *out, + const unsigned char in[192]) +{ + limbs_from_be_bytes(out->X[1], in, sizeof(out->X[1])); + /* clear top 3 bits in case caller was conveying some information there */ + out->X[1][sizeof(out->X[1])/sizeof(limb_t)-1] &= (limb_t)0xffffffffffffffff >> 3; + mul_fp(out->X[1], out->X[1], BLS12_381_RR); + + limbs_from_be_bytes(out->X[0], in + 48, sizeof(out->X[0])); + mul_fp(out->X[0], out->X[0], BLS12_381_RR); + + limbs_from_be_bytes(out->Y[1], in + 96, sizeof(out->Y[1])); + mul_fp(out->Y[1], out->Y[1], BLS12_381_RR); + + limbs_from_be_bytes(out->Y[0], in + 144, sizeof(out->Y[0])); + mul_fp(out->Y[0], out->Y[0], BLS12_381_RR); + + vec_select(out->Y, out->X, out->Y, sizeof(out->Y), + vec_is_zero(out->X, sizeof(out->X))); +} + +int blst_p2_deserialize(POINTonE2_affine *out, const unsigned char in[192]) +{ + 
if (in[0] & 0x80) /* compressed bit */ + return POINTonE2_Uncompress(out, in); + + if (in[0] & 0x40) { /* infinity bit */ + vec_zero(out, sizeof(*out)); + return BLST_SUCCESS; + } + + POINTonE2_Deserialize_BE(out, in); + + if (!POINTonE2_affine_on_curve(out)) + return BLST_POINT_NOT_ON_CURVE; + + return BLST_SUCCESS; +} + +void blst_sk_to_pk_in_g2(POINTonE2 *out, const vec256 SK) +{ POINTonE2_mult_w5(out, &BLS12_381_G2, SK, 255); } + +void blst_sign_pk_in_g1(POINTonE2 *out, const POINTonE2 *msg, const vec256 SK) +{ POINTonE2_mult_w5(out, msg, SK, 255); } + +void blst_sk_to_pk2_in_g2(unsigned char out[192], POINTonE2_affine *PK, + const vec256 SK) +{ + POINTonE2 P[1]; + + POINTonE2_mult_w5(P, &BLS12_381_G2, SK, 255); + POINTonE2_from_Jacobian(P, P); + if (PK != NULL) + vec_copy(PK, P, sizeof(*PK)); + if (out != NULL) { + limb_t sgn0_pty = POINTonE2_Serialize_BE(out, P); + out[0] |= (sgn0_pty & 2) << 4; /* pre-decorate */ + out[0] |= vec_is_zero(P->Z, sizeof(P->Z)) << 6; + } +} + +void blst_sign_pk2_in_g1(unsigned char out[192], POINTonE2_affine *sig, + const POINTonE2 *hash, const vec256 SK) +{ + POINTonE2 P[1]; + + POINTonE2_mult_w5(P, hash, SK, 255); + POINTonE2_from_Jacobian(P, P); + if (sig != NULL) + vec_copy(sig, P, sizeof(*sig)); + if (out != NULL) { + limb_t sgn0_pty = POINTonE2_Serialize_BE(out, P); + out[0] |= (sgn0_pty & 2) << 4; /* pre-decorate */ + out[0] |= vec_is_zero(P->Z, sizeof(P->Z)) << 6; + } +} + +void blst_p2_cneg(POINTonE2 *a, size_t cbit) +{ POINTonE2_cneg(a, cbit); } + +void blst_p2_from_jacobian(POINTonE2 *out, const POINTonE2 *a) +{ POINTonE2_from_Jacobian(out, a); } + +void blst_p2_to_affine(POINTonE2_affine *out, const POINTonE2 *a) +{ POINTonE2_to_affine(out, a); } + +void blst_p2_from_affine(POINTonE2 *out, const POINTonE2_affine *a) +{ + vec_copy(out, a, sizeof(*a)); + vec_select(out->Z, a->X, BLS12_381_Rx.p2, sizeof(out->Z), + vec_is_zero(a, sizeof(*a))); +} + +limb_t blst_p2_on_curve(const POINTonE2 *p) +{ return POINTonE2_on_curve(p); } + +limb_t blst_p2_affine_on_curve(const POINTonE2_affine *p) +{ return POINTonE2_affine_on_curve(p); } + +#include "ec_ops.h" +POINT_DADD_IMPL(POINTonE2, 384x, fp2) +POINT_DADD_AFFINE_IMPL_A0(POINTonE2, 384x, fp2, BLS12_381_Rx.p2) +POINT_ADD_IMPL(POINTonE2, 384x, fp2) +POINT_ADD_AFFINE_IMPL(POINTonE2, 384x, fp2, BLS12_381_Rx.p2) +POINT_DOUBLE_IMPL_A0(POINTonE2, 384x, fp2) + +#include "ec_mult.h" +POINT_MULT_SCALAR_W5_IMPL(POINTonE2) +POINT_AFFINE_MULT_SCALAR_IMPL(POINTonE2) + +DECLARE_PRIVATE_POINTXZ(POINTonE2, 384x) +POINT_LADDER_PRE_IMPL(POINTonE2, 384x, fp2) +POINT_LADDER_STEP_IMPL_A0(POINTonE2, 384x, fp2, onE2) +POINT_LADDER_POST_IMPL_A0(POINTonE2, 384x, fp2, onE2) +POINT_MULT_SCALAR_LADDER_IMPL(POINTonE2) diff --git a/src/ec_mult.h b/src/ec_mult.h new file mode 100644 index 00000000..eb9c8743 --- /dev/null +++ b/src/ec_mult.h @@ -0,0 +1,287 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLS12_381_ASM_EC_MULT_H__ +#define __BLS12_381_ASM_EC_MULT_H__ + +#include "point.h" + +static limb_t get_wval(const limb_t *d, size_t off, size_t bits) +{ + size_t top = off + bits - 1; + limb_t ret; + const union { + long one; + char little; + } is_endian = { 1 }; + + if (is_endian.little) { + const unsigned char *c = (const unsigned char *)d; + + ret = (limb_t)c[off / 8] | ((limb_t)c[top / 8] << 8); + ret >>= (off % 8); + } else { + size_t bitoff = off % LIMB_T_BITS; + + ret = d[off / LIMB_T_BITS]; + ret >>= bitoff; + /* in case bitoff is zero, we'll be or-ing same limb over itself */ + bitoff = (LIMB_T_BITS - bitoff) % LIMB_T_BITS; + ret |= d[top / LIMB_T_BITS] << bitoff; + } + + return ret; +} + +/* + * Window value encoding that utilizes the fact that -P is trivially + * calculated, which allows to halve the size of pre-computed table, + * is attributed to A. D. Booth, hence the name of the subroutines... + */ +static limb_t booth_encode_w5(limb_t wval) +{ + limb_t mask = 0 - (wval >> 5); /* "sign" bit -> mask */ + + wval = (wval + 1) >> 1; + wval = (wval & ~mask) | ((0-wval) & mask); + + /* &0x1f, but <=0x10, is index in table, rest is extended "sign" bit */ + return wval; +} + +/* + * Key feature of these constant-time subroutines is that they tolerate + * zeros in most significant bit positions of the scalar[s], or in other + * words, zero-padded scalar values. This means that one can and should + * pass order's bit-length, which is customarily publicly known, instead + * of the factual scalars' bit-lengths. This is facilitated by point + * addition subroutines implemented to handle points at infinity, which + * are encoded as Z==0. [Doubling agorithms handle such points at + * infinity "naturally," since resulting Z is product of original Z.] + */ +#define POINT_MULT_SCALAR_W5_IMPL(ptype) \ +static void ptype##_gather_booth_w5(ptype *restrict p, const ptype table[16], \ + limb_t booth_idx) \ +{ \ + size_t i; \ + limb_t booth_sign = (booth_idx >> 5) & 1; \ +\ + booth_idx &= 0x1f; \ + vec_zero(p, sizeof(ptype)); /* implicit infinity at table[-1] */\ + /* ~6% with -Os, ~2% with -O3 ... 
*/\ + for (i = 1; i <= 16; i++) \ + ptype##_ccopy(p, table + i - 1, i == booth_idx); \ +\ + ptype##_cneg(p, booth_sign); \ +} \ +\ +static void ptype##_precompute(ptype *row, const ptype *point) \ +{ \ + row--; /* row[-1] is implicit infinity */\ +\ + vec_copy(row + 1, point, sizeof(ptype)); /* row[ 1]=p*1 */\ + ptype##_double(row + 2, point); /* row[ 2]=p*(1+1) */\ + ptype##_add (row + 3, row + 2, row + 1); /* row[ 3]=p*(2+1) */\ + ptype##_double(row + 4, row + 2); /* row[ 4]=p*(2+2) */\ + ptype##_add (row + 5, row + 3, row + 2); /* row[ 5]=p*(3+2) */\ + ptype##_double(row + 6, row + 3); /* row[ 6]=p*(3+3) */\ + ptype##_add (row + 7, row + 4, row + 3); /* row[ 7]=p*(4+3) */\ + ptype##_double(row + 8, row + 4); /* row[ 8]=p*(4+4) */\ + ptype##_add (row + 9, row + 5, row + 4); /* row[ 9]=p*(5+4) */\ + ptype##_double(row + 10, row + 5); /* row[10]=p*(5+5) */\ + ptype##_add (row + 11, row + 6, row + 5); /* row[11]=p*(6+5) */\ + ptype##_double(row + 12, row + 6); /* row[12]=p*(6+6) */\ + ptype##_add (row + 13, row + 7, row + 6); /* row[13]=p*(7+6) */\ + ptype##_double(row + 14, row + 7); /* row[14]=p*(7+7) */\ + ptype##_add (row + 15, row + 8, row + 7); /* row[15]=p*(8+7) */\ + ptype##_double(row + 16, row + 8); /* row[16]=p*(8+8) */\ +} \ +\ +static void ptype##s_mult_w5(ptype *ret, \ + const ptype *points[], size_t npoints, \ + const limb_t *scalars[], size_t bits, \ + ptype table[][16]) \ +{ \ + limb_t wmask, wval; \ + size_t i, window; \ + ptype temp[1]; \ +\ + if (table == NULL) \ + table = alloca(16 * sizeof(ptype) * npoints); \ +\ + for (i = 0; i < npoints; i++) \ + ptype##_precompute(table[i], points[i]); \ +\ + /* top excess bits modulo target window size */ \ + window = bits % 5; /* yes, it may be zero */ \ + wmask = ((limb_t)1 << (window + 1)) - 1; \ +\ + bits -= window; \ + if (bits > 0) \ + wval = get_wval(scalars[0], bits - 1, window + 1) & wmask; \ + else \ + wval = (scalars[0][0] << 1) & wmask; \ +\ + wval = booth_encode_w5(wval); \ + ptype##_gather_booth_w5(ret, table[0], wval); \ +\ + i = 1; \ + while (bits > 0) { \ + for (; i < npoints; i++) { \ + wval = get_wval(scalars[i], bits - 1, window + 1) & wmask; \ + wval = booth_encode_w5(wval); \ + ptype##_gather_booth_w5(temp, table[i], wval); \ + ptype##_dadd(ret, ret, temp, NULL); \ + } \ +\ + ptype##_double(ret, ret); \ + ptype##_double(ret, ret); \ + ptype##_double(ret, ret); \ + ptype##_double(ret, ret); \ + ptype##_double(ret, ret); \ +\ + window = 5; \ + wmask = ((limb_t)1 << (window + 1)) - 1; \ + bits -= window; \ + i = 0; \ + } \ +\ + for (; i < npoints; i++) { \ + wval = (scalars[i][0] << 1) & wmask; \ + wval = booth_encode_w5(wval); \ + ptype##_gather_booth_w5(temp, table[i], wval); \ + ptype##_dadd(ret, ret, temp, NULL); \ + } \ +} \ +\ +static void ptype##_mult_w5(ptype *ret, const ptype *point, \ + const limb_t *scalar, size_t bits) \ +{ \ + limb_t wmask, wval; \ + size_t window; \ + ptype temp[1]; \ + ptype table[16]; \ +\ + ptype##_precompute(table, point); \ +\ + /* top excess bits modulo target window size */ \ + window = bits % 5; /* yes, it may be zero */ \ + wmask = ((limb_t)1 << (window + 1)) - 1; \ +\ + bits -= window; \ + wval = bits ? 
get_wval(scalar, bits - 1, window + 1) : scalar[0] << 1; \ + wval &= wmask; \ + wval = booth_encode_w5(wval); \ + ptype##_gather_booth_w5(ret, table, wval); \ +\ + while (bits > 0) { \ + ptype##_double(ret, ret); \ + ptype##_double(ret, ret); \ + ptype##_double(ret, ret); \ + ptype##_double(ret, ret); \ + ptype##_double(ret, ret); \ +\ + window = 5; \ + wmask = ((limb_t)1 << (window + 1)) - 1; \ + bits -= window; \ +\ + wval = bits ? get_wval(scalar, bits - 1, window + 1) : scalar[0] << 1; \ + wval &= wmask; \ + wval = booth_encode_w5(wval); \ + ptype##_gather_booth_w5(temp, table, wval); \ + ptype##_add(ret, ret, temp); \ + } \ +} + +#if 0 +/* ~50%, or ~2x[!] slower than w5... */ +#define POINT_MULT_SCALAR_LADDER_IMPL(ptype) \ +static void ptype##_mult_ladder(ptype *ret, const ptype *p, \ + const limb_t *scalar, size_t bits) \ +{ \ + ptype sum[1]; \ + limb_t bit, pbit = 0; \ +\ + vec_copy(sum, p, sizeof(ptype)); \ + vec_zero(ret, sizeof(ptype)); /* infinity */ \ +\ + while (bits--) { \ + bit = (scalar[bits / LIMB_T_BITS] >> (bits % LIMB_T_BITS)) & 1; \ + bit ^= pbit; \ + ptype##_cswap(ret, sum, bit); \ + ptype##_add(sum, sum, ret); \ + ptype##_double(ret, ret); \ + pbit ^= bit; \ + } \ + ptype##_cswap(ret, sum, pbit); \ +} +#else +/* >40% better performance than above, [and ~30% slower than w5]... */ +#define POINT_MULT_SCALAR_LADDER_IMPL(ptype) \ +static void ptype##_mult_ladder(ptype *out, const ptype *p, \ + const limb_t *scalar, size_t bits) \ +{ \ + ptype##xz sum[1]; \ + ptype##xz pxz[1]; \ + ptype##xz ret[1]; \ + limb_t bit, pbit = 0; \ +\ + ptype##xz_ladder_pre(pxz, p); \ + vec_copy(sum, pxz, sizeof(ptype##xz)); \ + vec_zero(ret, sizeof(ptype##xz)); /* infinity */ \ +\ + while (bits--) { \ + bit = (scalar[bits / LIMB_T_BITS] >> (bits % LIMB_T_BITS)) & 1; \ + bit ^= pbit; \ + ptype##xz_cswap(ret, sum, bit); \ + ptype##xz_ladder_step(ret, sum, pxz); \ + pbit ^= bit; \ + } \ + ptype##xz_cswap(ret, sum, pbit); \ + ptype##xz_ladder_post(out, ret, sum, pxz, p->Y); \ +} +#endif + +/* + * Sole reason for existence of this implementation is that addition + * with affine point renders a share of multiplications redundant by + * virtue of Z==1. And since pre-defined generator point can be and + * customarily is instantiated affine, it would be hardly appropriate + * to pass on this opportunity. Though while it's faster than the + * generic ladder implementation, by ~25%, it's not faster than XZ one + * above, <15% slower. Just in case, it's faster than generic ladder + * even if one accounts for prior conversion to affine coordinates, + * so that choice [for resource-constrained case] is actually between + * this plus said conversion and XZ ladder... + * + * To summarize, if ptype##_mult_w5 executed in one unit of time, then + * - naive ptype##_mult_ladder would execute in ~2; + * - XZ version above - in ~1.4; + * - ptype##_affine_mult_ladder below - in ~1.65; + * - [small-footprint ptype##_to_affine would run in ~0.18]. + * + * Caveat lector, |p_affine|*(order+2) produces wrong result, because + * addition doesn't handle doubling. Indeed, P*(order+1) is P and it + * fails to add with itself producing infinity in last addition. But + * as long as |scalar| is reduced modulo order, as it should be, it's + * not a problem... 
+ */ +#define POINT_AFFINE_MULT_SCALAR_IMPL(ptype) \ +static void ptype##_affine_mult_ladder(ptype *ret, const ptype *p_affine, \ + const limb_t *scalar, size_t bits) \ +{ \ + ptype sum[1]; \ + limb_t bit; \ +\ + vec_zero(ret, sizeof(ptype)); /* infinity */ \ +\ + while (bits--) { \ + ptype##_double(ret, ret); \ + ptype##_add_affine(sum, ret, p_affine); \ + bit = (scalar[bits / LIMB_T_BITS] >> (bits % LIMB_T_BITS)) & 1; \ + ptype##_ccopy(ret, sum, bit); \ + } \ +} +#endif diff --git a/src/ec_ops.h b/src/ec_ops.h new file mode 100644 index 00000000..703a0e01 --- /dev/null +++ b/src/ec_ops.h @@ -0,0 +1,462 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLS12_384_ASM_EC_OPS_H__ +#define __BLS12_384_ASM_EC_OPS_H__ +/* + * Addition that can handle doubling [as well as points at infinity, + * which are encoded as Z==0] in constant time. It naturally comes at + * cost, but this subroutine should be called only when independent + * points are processed, which is considered reasonable compromise. + * For example, ptype##s_mult_w5 calls it, but since *major* gain is + * result of pure doublings being effectively divided by amount of + * points, slightly slower addition can be tolerated. But what is the + * additional cost more specifically? Best addition result is 11M+5S, + * while this routine takes 13M+5S (+1M+1S if a4!=0), as per + * + * -------------+------------- + * addition | doubling + * -------------+------------- + * U1 = X1*Z2^2 | U1 = X1 + * U2 = X2*Z1^2 | + * S1 = Y1*Z2^3 | S1 = Y1 + * S2 = Y2*Z1^3 | + * zz = Z1*Z2 | zz = Z1 + * H = U2-U1 | H' = 2*Y1 + * R = S2-S1 | R' = 3*X1^2[+a*Z1^4] + * sx = U1+U2 | sx = X1+X1 + * -------------+------------- + * H!=0 || R!=0 | H==0 && R==0 + * + * X3 = R^2-H^2*sx + * Y3 = R*(H^2*U1-X3)-H^3*S1 + * Z3 = H*zz + * + * As for R!=0 condition in context of H==0, a.k.a. P-P. The result is + * infinity by virtue of Z3 = (U2-U1)*zz = H*zz = 0*zz == 0. 
+ */ +#define POINT_DADD_IMPL(ptype, bits, field) \ +static void ptype##_dadd(ptype *out, const ptype *p1, const ptype *p2, \ + const vec##bits a4) \ +{ \ + ptype p3; /* starts as (U1, S1, zz) from addition side */\ + struct { vec##bits H, R, sx; } add, dbl; \ + limb_t p1inf, p2inf, is_dbl; \ +\ + add_##field(dbl.H, p1->Y, p1->Y); /* H = 2*Y1 */\ + sqr_##field(dbl.R, p1->X); /* X1^2 */\ + add_##field(dbl.sx, p1->X, p1->X); /* sx = X1+X1 */\ + mul_by_3_##field(dbl.R, dbl.R); /* R = 3*X1^2 */\ +\ + sqr_##field(add.H, p1->Z); /* Z1^2 */\ + sqr_##field(p3.X, p2->Z); /* Z2^2 */\ + mul_##field(p3.Z, p1->Z, p2->Z); /* Z1*Z2 */\ +\ + if (a4 != NULL) { \ + sqr_##field(p3.Y, add.H); /* Z1^4, [borrow p3.Y] */\ + mul_##field(p3.Y, p3.Y, a4); \ + add_##field(dbl.R, dbl.R, p3.Y);/* R = 3*X1^2+a*Z1^4 */\ + } \ +\ + mul_##field(p3.Y, p1->Y, p2->Z); \ + mul_##field(add.R, p2->Y, p1->Z); \ + mul_##field(p3.Y, p3.Y, p3.X); /* S1 = Y1*Z2^3 */\ + mul_##field(add.R, add.R, add.H); /* S2 = Y2*Z1^3 */\ +\ + mul_##field(p3.X, p3.X, p1->X); /* U1 = X1*Z2^2 */\ + mul_##field(add.H, add.H, p2->X); /* U2 = X2*Z1^2 */\ +\ + sub_##field(add.R, add.R, p3.Y); /* R = S2-S1 */\ + add_##field(add.sx, add.H, p3.X); /* sx = U1+U2 */\ + sub_##field(add.H, add.H, p3.X); /* H = U2-U1 */\ +\ + /* make the choice between addition and doubling */\ + is_dbl = vec_is_zero(add.H, 2*sizeof(add.H)); \ + vec_select(&p3, p1, &p3, sizeof(p3), is_dbl); \ + vec_select(&add, &dbl, &add, sizeof(add), is_dbl); \ + /* |p3| and |add| hold all inputs now, |p3| will hold output */\ +\ + sqr_##field(dbl.H, add.H); /* H^2 */\ + mul_##field(dbl.R, add.H, p3.Y); /* H*S1 */\ + mul_##field(p3.Y, dbl.H, p3.X); /* H^2*U1 */\ + mul_##field(dbl.R, dbl.R, dbl.H); /* H^3*S1 */\ +\ + mul_##field(dbl.H, dbl.H, add.sx); /* H^2*sx */\ + sqr_##field(p3.X, add.R); /* R^2 */\ + sub_##field(p3.X, p3.X, dbl.H); /* X3 = R^2-H^2*sx */\ +\ + sub_##field(p3.Y, p3.Y, p3.X); /* H^2*U1-X3 */\ + mul_##field(p3.Y, p3.Y, add.R); /* R*(H^2*U1-X3) */\ + sub_##field(p3.Y, p3.Y, dbl.R); /* Y3 = R*(H^2*U1-X3)-H^3*S1 */\ +\ + mul_##field(p3.Z, p3.Z, add.H); /* Z3 = H*Z1*Z2 */\ +\ + p1inf = vec_is_zero(p1->Z, sizeof(p1->Z)); \ + p2inf = vec_is_zero(p2->Z, sizeof(p2->Z)); \ +\ + vec_select(&p3, p1, &p3, sizeof(ptype), p2inf); \ + vec_select(out, p2, &p3, sizeof(ptype), p1inf); \ +} + +/* + * Addition with affine point that can handle doubling [as well as + * points at infinity, with |p1| being encoded as Z==0 and |p2| as + * X,Y==0] in constant time. But at what additional cost? Best + * addition result is 7M+4S, while this routine takes 8M+5S, as per + * + * -------------+------------- + * addition | doubling + * -------------+------------- + * U1 = X1 | U1 = X2 + * U2 = X2*Z1^2 | + * S1 = Y1 | S1 = Y2 + * S2 = Y2*Z1^3 | + * H = U2-X1 | H' = 2*Y2 + * R = S2-Y1 | R' = 3*X2^2[+a] + * sx = X1+U2 | sx = X2+X2 + * zz = H*Z1 | zz = H' + * -------------+------------- + * H!=0 || R!=0 | H==0 && R==0 + * + * X3 = R^2-H^2*sx + * Y3 = R*(H^2*U1-X3)-H^3*S1 + * Z3 = zz + * + * As for R!=0 condition in context of H==0, a.k.a. P-P. The result is + * infinity by virtue of Z3 = (U2-U1)*zz = H*zz = 0*zz == 0. 
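A brief usage note on the unified addition above: for BLS12-381 the curve coefficient a is zero, so callers pass NULL for |a4| and the optional 1M+1S correction is skipped. A hypothetical call site, assuming the POINTonE1 instantiation of the macro:

/* Illustration only: constant-time "add or double" with a4==NULL,
 * because a==0 on BLS12-381; handles p==q and points at infinity. */
static void p1_add_any(POINTonE1 *sum, const POINTonE1 *p, const POINTonE1 *q)
{   POINTonE1_dadd(sum, p, q, NULL);   }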
+ */ +#define POINT_DADD_AFFINE_IMPL_A0(ptype, bits, field, one) \ +static void ptype##_dadd_affine(ptype *out, const ptype *p1, const ptype *p2) \ +{ \ + ptype p3; /* starts as (,, H*Z1) from addition side */\ + struct { vec##bits H, R, sx; } add, dbl; \ + limb_t p1inf, p2inf, is_dbl; \ +\ + add_##field(dbl.H, p2->Y, p2->Y); /* H = 2*Y2 */\ + sqr_##field(dbl.R, p2->X); /* X2^2 */\ + add_##field(dbl.sx, p2->X, p2->X); /* sx = X2+X2 */\ + mul_by_3_##field(dbl.R, dbl.R); /* R = 3*X2^2 */\ +\ + sqr_##field(add.H, p1->Z); /* Z1^2 */\ + mul_##field(add.R, add.H, p1->Z); /* Z1^3 */\ + mul_##field(add.H, add.H, p2->X); /* U2 = X2*Z1^2 */\ + mul_##field(add.R, add.R, p2->Y); /* S2 = Y2*Z1^3 */\ +\ + add_##field(add.sx, add.H, p1->X); /* sx = X1+U2 */\ + sub_##field(add.H, add.H, p1->X); /* H = U2-X1 */\ + sub_##field(add.R, add.R, p1->Y); /* R = S2-Y1 */\ +\ + mul_##field(p3.Z, add.H, p1->Z); /* Z3 = H*Z1 */\ +\ + /* make the choice between addition and doubling */ \ + is_dbl = vec_is_zero(add.H, 2*sizeof(add.H)); \ + vec_select(p3.X, p2, p1, 2*sizeof(p3.X), is_dbl); \ + vec_select(p3.Z, dbl.H, p3.Z, sizeof(p3.Z), is_dbl);\ + vec_select(&add, &dbl, &add, sizeof(add), is_dbl); \ + /* |p3| and |add| hold all inputs now, |p3| will hold output */\ +\ + sqr_##field(dbl.H, add.H); /* H^2 */\ + mul_##field(dbl.R, add.H, p3.Y); /* H*S1 */\ + mul_##field(p3.Y, dbl.H, p3.X); /* H^2*U1 */\ + mul_##field(dbl.R, dbl.R, dbl.H); /* H^3*S1 */\ +\ + mul_##field(dbl.H, dbl.H, add.sx); /* H^2*sx */\ + sqr_##field(p3.X, add.R); /* R^2 */\ + sub_##field(p3.X, p3.X, dbl.H); /* X3 = R^2-H^2*sx */\ +\ + sub_##field(p3.Y, p3.Y, p3.X); /* H^2*U1-X3 */\ + mul_##field(p3.Y, p3.Y, add.R); /* R*(H^2*U1-X3) */\ + sub_##field(p3.Y, p3.Y, dbl.R); /* Y3 = R*(H^2*U1-X3)-H^3*S1 */\ +\ + p1inf = vec_is_zero(p1->Z, sizeof(p1->Z)); \ + p2inf = vec_is_zero(p2->X, 2*sizeof(p2->X)); \ +\ + vec_select(&p3, p1, &p3, sizeof(ptype), p2inf); \ + vec_select(out->X, p2, p3.X, 2*sizeof(p3.X), p1inf); \ + vec_select(out->Z, one, p3.Z, sizeof(p3.Z), p1inf & (p2inf^1)); \ +} + +/* + * https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl + * with twist to handle either input at infinity, which are encoded as Z==0. 
+ */ +#define POINT_ADD_IMPL(ptype, bits, field) \ +static void ptype##_add(ptype *out, const ptype *p1, const ptype *p2) \ +{ \ + ptype p3; \ + vec##bits Z1Z1, Z2Z2, U1, S1, H, I, J, r, V; \ + limb_t p1inf, p2inf; \ +\ + sqr_##field(Z1Z1, p1->Z); /* Z1Z1 = Z1^2 */\ + sqr_##field(Z2Z2, p2->Z); /* Z2Z2 = Z2^2 */\ +\ + mul_##field(U1, p1->X, Z2Z2); /* U1 = X1*Z2Z2 */\ + mul_##field(H, p2->X, Z1Z1); /* U2 = X2*Z1Z1 */\ +\ + mul_##field(S1, p1->Y, p2->Z); /* Y1*Z2 */\ + mul_##field(S1, S1, Z2Z2); /* S1 = Y1*Z2*Z2Z2 */\ +\ + mul_##field(r, p2->Y, p1->Z); /* Y2*Z1 */\ + mul_##field(r, r, Z1Z1); /* S2 = Y2*Z1*Z1Z1 */\ +\ + sub_##field(H, H, U1); /* H = U2-U1 */\ +\ + add_##field(I, H, H); /* 2*H */\ + sqr_##field(I, I); /* I = (2*H)^2 */\ +\ + mul_##field(J, H, I); /* J = H*I */\ +\ + sub_##field(r, r, S1); /* S2-S1 */\ + add_##field(r, r, r); /* r = 2*(S2-S1) */\ +\ + mul_##field(V, U1, I); /* V = U1*I */\ +\ + sqr_##field(p3.X, r); /* r^2 */\ + sub_##field(p3.X, p3.X, J); /* r^2-J */\ + sub_##field(p3.X, p3.X, V); \ + sub_##field(p3.X, p3.X, V); /* X3 = r^2-J-2*V */\ +\ + sub_##field(p3.Y, V, p3.X); /* V-X3 */\ + mul_##field(p3.Y, p3.Y, r); /* r*(V-X3) */\ + mul_##field(S1, S1, J); /* S1*J */\ + sub_##field(p3.Y, p3.Y, S1); \ + sub_##field(p3.Y, p3.Y, S1); /* Y3 = r*(V-X3)-2*S1*J */\ +\ + add_##field(p3.Z, p1->Z, p2->Z); /* Z1+Z2 */\ + sqr_##field(p3.Z, p3.Z); /* (Z1+Z2)^2 */\ + sub_##field(p3.Z, p3.Z, Z1Z1); /* (Z1+Z2)^2-Z1Z1 */\ + sub_##field(p3.Z, p3.Z, Z2Z2); /* (Z1+Z2)^2-Z1Z1-Z2Z2 */\ + mul_##field(p3.Z, p3.Z, H); /* Z3 = ((Z1+Z2)^2-Z1Z1-Z2Z2)*H */\ +\ + p1inf = vec_is_zero(p1->Z, sizeof(p1->Z)); \ + p2inf = vec_is_zero(p2->Z, sizeof(p2->Z)); \ +\ + vec_select(&p3, p1, &p3, sizeof(ptype), p2inf); \ + vec_select(out, p2, &p3, sizeof(ptype), p1inf); \ +} + +/* + * https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-madd-2007-bl + * with twist to handle |p1| at infinity, which is encoded as Z==0. + * |p2|->Z is expected to be one, that's what makes it affine, and + * |p2| not equal to |p3|!!! 
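One convention worth spelling out, drawn from the comments above: a Jacobian point encodes infinity as Z==0, an affine one as X==Y==0. Hypothetical helper sketches, assuming the POINTonE1 and POINTonE1_affine layouts from point.h:

/* Illustration only, not part of the macro set above. */
static inline limb_t POINTonE1_is_inf(const POINTonE1 *p)
{   return vec_is_zero(p->Z, sizeof(p->Z));   }        /* Z==0 */

static inline limb_t POINTonE1_affine_is_inf(const POINTonE1_affine *p)
{   return vec_is_zero(p, sizeof(*p));   }             /* X==Y==0 */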
+ */ +#define POINT_ADD_AFFINE_IMPL(ptype, bits, field, one) \ +static void ptype##_add_affine(ptype *p3, const ptype *p1, const ptype *p2) \ +{ \ + vec##bits Z1Z1, H, HH, I, J, r, V; \ + limb_t p1inf = vec_is_zero(p1->Z, sizeof(p1->Z)); \ +\ + sqr_##field(Z1Z1, p1->Z); /* Z1Z1 = Z1^2 */\ +\ + mul_##field(H, p2->X, Z1Z1); /* U2 = X2*Z1Z1 */\ +\ + mul_##field(r, p2->Y, p1->Z); /* Y2*Z1 */\ + mul_##field(r, r, Z1Z1); /* S2 = Y2*Z1*Z1Z1 */\ +\ + sub_##field(H, H, p1->X); /* H = U2-X1 */\ +\ + sqr_##field(HH, H); /* HH = H^2 */\ + add_##field(I, HH, HH); \ + add_##field(I, I, I); /* I = 4*HH */\ +\ + mul_##field(J, H, I); /* J = H*I */\ +\ + sub_##field(r, r, p1->Y); /* S2-Y1 */\ + add_##field(r, r, r); /* r = 2*(S2-Y1) */\ +\ + mul_##field(V, p1->X, I); /* V = X1*I */\ +\ + sqr_##field(p3->X, r); /* r^2 */\ + sub_##field(p3->X, p3->X, J); /* r^2-J */\ + sub_##field(p3->X, p3->X, V); \ + sub_##field(p3->X, p3->X, V); /* X3 = r^2-J-2*V */\ +\ + mul_##field(J, J, p1->Y); /* Y1*J */\ + sub_##field(p3->Y, V, p3->X); /* V-X3 */\ + mul_##field(p3->Y, p3->Y, r); /* r*(V-X3) */\ + sub_##field(p3->Y, p3->Y, J); \ + sub_##field(p3->Y, p3->Y, J); /* Y3 = r*(V-X3)-2*Y1*J */\ +\ + add_##field(p3->Z, p1->Z, H); /* Z1+H */\ + sqr_##field(p3->Z, p3->Z); /* (Z1+H)^2 */\ + sub_##field(p3->Z, p3->Z, Z1Z1); /* (Z1+H)^2-Z1Z1 */\ + sub_##field(p3->Z, p3->Z, HH); /* Z3 = (Z1+H)^2-Z1Z1-HH */\ +\ + vec_select(p3->X, p2->X, p3->X, 2*sizeof(p3->X), p1inf); \ + vec_select(p3->Z, one, p3->Z, sizeof(p3->Z), p1inf); \ +} + +/* + * https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-l + */ +#define POINT_DOUBLE_IMPL_A0(ptype, bits, field) \ +static void ptype##_double(ptype *p3, const ptype *p1) \ +{ \ + vec##bits A, B, C, D; \ +\ + sqr_##field(A, p1->X); /* A = X1^2 */\ + sqr_##field(B, p1->Y); /* B = Y1^2 */\ + sqr_##field(C, B); /* C = B^2 */\ +\ + add_##field(D, p1->X, B); /* X1+B */\ + sqr_##field(D, D); /* (X1+B)^2 */\ + sub_##field(D, D, A); /* (X1+B)^2-A */\ + sub_##field(D, D, C); /* (X1+B)^2-A-C */\ + add_##field(D, D, D); /* D = 2*((X1+B)^2-A-C) */\ +\ + mul_by_3_##field(A, A); /* E = 3*A */\ +\ + sqr_##field(p3->X, A); /* F = E^2 */\ + sub_##field(p3->X, p3->X, D); \ + sub_##field(p3->X, p3->X, D); /* X3 = F-2*D */\ +\ + add_##field(p3->Z, p1->Z, p1->Z); /* 2*Z1 */\ + mul_##field(p3->Z, p3->Z, p1->Y); /* Z3 = 2*Z1*Y1 */\ +\ + mul_by_8_##field(C, C); /* 8*C */\ + sub_##field(p3->Y, D, p3->X); /* D-X3 */\ + mul_##field(p3->Y, p3->Y, A); /* E*(D-X3) */\ + sub_##field(p3->Y, p3->Y, C); /* Y3 = E*(D-X3)-8*C */\ +} + +#define POINT_LADDER_PRE_IMPL(ptype, bits, field) \ +static void ptype##xz_ladder_pre(ptype##xz *pxz, const ptype *p) \ +{ \ + mul_##field(pxz->X, p->X, p->Z); /* X2 = X1*Z1 */\ + sqr_##field(pxz->Z, p->Z); \ + mul_##field(pxz->Z, pxz->Z, p->Z); /* Z2 = Z1^3 */\ +} + +/* + * https://hyperelliptic.org/EFD/g1p/auto-shortw-xz.html#ladder-ladd-2002-it-3 + * with twist to handle either input at infinity, which are encoded as Z==0. + * Just in case, order of doubling and addition is reverse in comparison to + * hyperelliptic.org entry. This was done to minimize temporary storage. + * + * XZ1 is |p|, XZ2&XZ4 are in&out |r|, XZ3&XZ5 are in&out |s|. 
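One added observation on why the x-only ladder gets by without y-coordinates: the step routine below is a differential addition, and the ladder preserves the difference between its two running points,

   initially:   R = infinity, S = P                 =>  S - R = P
   each step:   (R, S) <- (2*R, R + S)              =>  S - R unchanged
                (up to the conditional swap)

so x(R+S) is computable from x(R), x(S) and x(P) alone, and the same relation is what lets the post routine recover the missing y-coordinate at the end via the Brier-Joye formula.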
+ */ +#define POINT_LADDER_STEP_IMPL_A0(ptype, bits, field, suffix4b) \ +static void ptype##xz_ladder_step(ptype##xz *r, ptype##xz *s, \ + const ptype##xz *p) \ +{ \ + ptype##xz p5; \ + vec##bits A, B, C, D, XX, ZZ; \ + limb_t r_inf, s_inf; \ + /* s += r */\ + mul_##field(A, r->X, s->X); /* A = X2*X3 */\ + mul_##field(B, r->Z, s->Z); /* B = Z2*Z3 */\ + mul_##field(C, r->X, s->Z); /* C = X2*Z3 */\ + mul_##field(D, r->Z, s->X); /* D = X3*Z2 */\ +\ + sqr_##field(A, A); /* (A[-a*B])^2 */\ + add_##field(p5.X, C, D); /* C+D */\ + mul_##field(p5.X, p5.X, B); /* B*(C+D) */\ + mul_by_4b_##suffix4b(B, p5.X); /* b4*B*(C+D) */\ + sub_##field(p5.X, A, B); /* (A[-a*B])^2-b4*B*(C+D) */\ + mul_##field(p5.X, p5.X, p->Z); /* X5 = Z1*((A[-a*B])^2-b4*B*(C+D)) */\ +\ + sub_##field(p5.Z, C, D); /* C-D */\ + sqr_##field(p5.Z, p5.Z); /* (C-D)^2 */\ + mul_##field(p5.Z, p5.Z, p->X); /* Z5 = X1*(C-D)^2 */\ +\ + r_inf = vec_is_zero(r->Z, sizeof(r->Z)); \ + s_inf = vec_is_zero(s->Z, sizeof(s->Z)); \ +\ + vec_select(&p5, r, &p5, sizeof(ptype##xz), s_inf); \ + vec_select(s, s, &p5, sizeof(ptype##xz), r_inf); \ + /* r *= 2 */\ + sqr_##field(XX, r->X); /* XX = X2^2 */\ + sqr_##field(ZZ, r->Z); /* ZZ = Z2^2 */\ +\ + add_##field(r->Z, r->X, r->Z); /* X2+Z2 */\ + sqr_##field(r->Z, r->Z); /* (X2+Z2)^2 */\ + sub_##field(r->Z, r->Z, XX); /* (X2+Z2)^2-XX */\ + sub_##field(r->Z, r->Z, ZZ); /* E = (X2+Z2)^2-XX-ZZ */\ +\ + sqr_##field(A, XX); /* (XX[-a*ZZ])^2 */\ + mul_##field(B, r->Z, ZZ); /* E*ZZ */\ + mul_by_4b_##suffix4b(C, B); /* b4*E*ZZ */\ + sub_##field(r->X, A, C); /* X4 = (XX[-a*ZZ])^2-b4*E*ZZ */\ +\ + sqr_##field(ZZ, ZZ); /* ZZ^2 */\ + mul_by_4b_##suffix4b(B, ZZ); /* b4*ZZ^2 */\ + mul_##field(r->Z, r->Z, XX); /* E*(XX[+a*ZZ]) */\ + add_##field(r->Z, r->Z, r->Z); /* 2*E*(XX[+a*ZZ]) */\ + add_##field(r->Z, r->Z, B); /* Z4 = 2*E*(XX[+a*ZZ])+b4*ZZ^2 */\ +} + +/* + * Recover the |r|'s y-coordinate using Eq. (8) from Brier-Joye, + * "Weierstraß Elliptic Curves and Side-Channel Attacks", with XZ twist + * and conversion to Jacobian coordinates from /.../ecp_smpl.c, + * and with twist to recover from |s| at infinity [which occurs when + * multiplying by (order-1)]. + * + * X4 = 2*Y1*X2*Z3*Z1*Z2 + * Y4 = 2*b*Z3*(Z1*Z2)^2 + Z3*(a*Z1*Z2+X1*X2)*(X1*Z2+X2*Z1) - X3*(X1*Z2-X2*Z1)^2 + * Z4 = 2*Y1*Z3*Z2^2*Z1 + * + * Z3x2 = 2*Z3 + * Y1Z3x2 = Y1*Z3x2 + * Z1Z2 = Z1*Z2 + * X1Z2 = X1*Z2 + * X2Z1 = X2*Z1 + * X4 = Y1Z3x2*X2*Z1Z2 + * A = b*Z3x2*(Z1Z2)^2 + * B = Z3*(a*Z1Z2+X1*X2)*(X1Z2+X2Z1) + * C = X3*(X1Z2-X2Z1)^2 + * Y4 = A+B-C + * Z4 = Y1Z3x2*Z1Z2*Z2 + * + * XZ1 is |p|, XZ2 is |r|, XZ3 is |s|, 'a' is 0. 
+ */ +#define POINT_LADDER_POST_IMPL_A0(ptype, bits, field, suffixb) \ +static void ptype##xz_ladder_post(ptype *p4, \ + const ptype##xz *r, const ptype##xz *s, \ + const ptype##xz *p, const vec##bits Y1) \ +{ \ + vec##bits Z3x2, Y1Z3x2, Z1Z2, X1Z2, X2Z1, A, B, C; \ + limb_t s_inf; \ +\ + add_##field(Z3x2, s->Z, s->Z); /* Z3x2 = 2*Z3 */\ + mul_##field(Y1Z3x2, Y1, Z3x2); /* Y1Z3x2 = Y1*Z3x2 */\ + mul_##field(Z1Z2, p->Z, r->Z); /* Z1Z2 = Z1*Z2 */\ + mul_##field(X1Z2, p->X, r->Z); /* X1Z2 = X1*Z2 */\ + mul_##field(X2Z1, r->X, p->Z); /* X2Z1 = X2*Z1 */\ +\ + mul_##field(p4->X, Y1Z3x2, r->X); /* Y1Z3x2*X2 */\ + mul_##field(p4->X, p4->X, Z1Z2); /* X4 = Y1Z3x2*X2*Z1Z2 */\ +\ + sqr_##field(A, Z1Z2); /* (Z1Z2)^2 */\ + mul_##field(B, A, Z3x2); /* Z3x2*(Z1Z2)^2 */\ + mul_by_b_##suffixb(A, B); /* A = b*Z3x2*(Z1Z2)^2 */\ +\ + mul_##field(B, p->X, r->X); /* [a*Z1Z2+]X1*X2 */\ + mul_##field(B, B, s->Z); /* Z3*([a*Z1Z2+]X1*X2) */\ + add_##field(C, X1Z2, X2Z1); /* X1Z2+X2Z1 */\ + mul_##field(B, B, C); /* B = Z3*([a*Z2Z1+]X1*X2)*(X1Z2+X2Z1) */\ +\ + sub_##field(C, X1Z2, X2Z1); /* X1Z2-X2Z1 */\ + sqr_##field(C, C); /* (X1Z2-X2Z1)^2 */\ + mul_##field(C, C, s->X); /* C = X3*(X1Z2-X2Z1)^2 */\ +\ + add_##field(A, A, B); /* A+B */\ + sub_##field(A, A, C); /* Y4 = A+B-C */\ +\ + mul_##field(p4->Z, Z1Z2, r->Z); /* Z1Z2*Z2 */\ + mul_##field(p4->Z, p4->Z, Y1Z3x2); /* Y1Z3x2*Z1Z2*Z2 */\ +\ + s_inf = vec_is_zero(s->Z, sizeof(s->Z)); \ + vec_select(p4->X, p->X, p4->X, sizeof(p4->X), s_inf); \ + vec_select(p4->Y, Y1, A, sizeof(p4->Y), s_inf); \ + vec_select(p4->Z, p->Z, p4->Z, sizeof(p4->Z), s_inf); \ + ptype##_cneg(p4, s_inf); \ + /* to Jacobian */\ + mul_##field(p4->X, p4->X, p4->Z); /* X4 = X4*Z4 */\ + sqr_##field(B, p4->Z); \ + mul_##field(p4->Y, p4->Y, B); /* Y4 = Y4*Z4^2 */\ +} +#endif diff --git a/src/errors.h b/src/errors.h new file mode 100644 index 00000000..387b3576 --- /dev/null +++ b/src/errors.h @@ -0,0 +1,13 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +typedef enum { + BLST_SUCCESS = 0, + BLST_BAD_ENCODING, + BLST_POINT_NOT_ON_CURVE, + BLST_POINT_NOT_IN_GROUP, + BLST_AGGR_TYPE_MISMATCH, + BLST_VERIFY_FAIL, +} BLST_ERROR; diff --git a/src/exp.c b/src/exp.c new file mode 100644 index 00000000..3031e24b --- /dev/null +++ b/src/exp.c @@ -0,0 +1,145 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include "vect.h" +#include "fields.h" + +/* + * |out| = |inp|^|pow|, small footprint, public exponent + */ +static void exp_mont_384(vec384 out, const vec384 inp, const limb_t *pow, + size_t pow_bits, const vec384 p, limb_t n0) +{ +#if 1 + vec384 ret; + + vec_copy(ret, inp, sizeof(ret)); /* ret = inp^1 */ + --pow_bits; /* most significant bit is set, skip over */ + while (pow_bits--) { + sqr_mont_384(ret, ret, p, n0); + if (is_bit_set(pow, pow_bits)) + mul_mont_384(ret, ret, inp, p, n0); + } + vec_copy(out, ret, sizeof(ret)); /* out = ret */ +#else + unsigned int i; + vec384 sqr; + + vec_copy(sqr, inp, sizeof(sqr)); + for (i = 0; !is_bit_set(pow, i++);) + sqr_mont_384(sqr, sqr, sqr, p, n0); + vec_copy(out, sqr, sizeof(sqr)); + for (; i < pow_bits; i++) { + sqr_mont_384(sqr, sqr, sqr, p, n0); + if (is_bit_set(pow, i)) + mul_mont_384(out, out, sqr, p, n0); + } +#endif +} + +#ifdef __OPTIMIZE_SIZE__ +/* + * 608 multiplications for scalar inversion modulo BLS12-381 prime, 32% + * more than corresponding optimal addition-chain, plus mispredicted + * branch penalties on top of that... The addition chain below was + * measured to be >50% faster. + */ +static void reciprocal_fp(vec384 out, const vec384 inp) +{ + static const limb_t BLS12_381_P_minus_2[] = { + TO_LIMB_T(0xb9feffffffffaaa9), TO_LIMB_T(0x1eabfffeb153ffff), + TO_LIMB_T(0x6730d2a0f6b0f624), TO_LIMB_T(0x64774b84f38512bf), + TO_LIMB_T(0x4b1ba7b6434bacd7), TO_LIMB_T(0x1a0111ea397fe69a) + }; + + exp_mont_384(out, inp, BLS12_381_P_minus_2, 381, BLS12_381_P, p0); +} + +static void recip_sqrt_fp_3mod4(vec384 out, const vec384 inp) +{ + static const limb_t BLS_12_381_P_minus_3_div_4[] = { + TO_LIMB_T(0xee7fbfffffffeaaa), TO_LIMB_T(0x07aaffffac54ffff), + TO_LIMB_T(0xd9cc34a83dac3d89), TO_LIMB_T(0xd91dd2e13ce144af), + TO_LIMB_T(0x92c6e9ed90d2eb35), TO_LIMB_T(0x0680447a8e5ff9a6) + }; + + exp_mont_384(out, inp, BLS_12_381_P_minus_3_div_4, 379, BLS12_381_P, p0); +} +#else +# if 1 +/* + * "383"-bit variant omits full reductions at the ends of squarings, + * which results in up to ~15% improvement. [One can improve further + * by omitting full reductions even after multiplications and + * performing final reduction at the very end of the chain.] 
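For reference on recip_sqrt_fp_3mod4() above: its exponent is (p-3)/4, and since p ≡ 3 (mod 4) one extra multiplication turns the result into both a reciprocal square root and a square-root candidate. A short derivation, added for clarity:

   t         = inp^((p-3)/4)
   t*inp     = inp^((p+1)/4)                      /* candidate sqrt(inp)  */
   (t*inp)^2 = inp^((p+1)/2) = inp*inp^((p-1)/2)  /* = ±inp               */
   t^2*inp   = inp^((p-1)/2)                      /* = ±1, so t ≈ 1/sqrt  */

Squaring the candidate and comparing it with |inp| therefore doubles as the quadratic-residuosity test returned by recip_sqrt_fp() and sqrt_fp() further below.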
+ */ +static inline void sqr_n_mul_fp(vec384 out, const vec384 a, size_t count, + const vec384 b) +{ sqr_n_mul_mont_383(out, a, count, BLS12_381_P, p0, b); } +# else +static void sqr_n_mul_fp(vec384 out, const vec384 a, size_t count, + const vec384 b) +{ + while(count--) { + sqr_fp(out, a); + a = out; + } + mul_fp(out, out, b); +} +# endif + +# define sqr(ret,a) sqr_fp(ret,a) +# define mul(ret,a,b) mul_fp(ret,a,b) +# define sqr_n_mul(ret,a,n,b) sqr_n_mul_fp(ret,a,n,b) + +# include "recip-addchain.h" +static void reciprocal_fp(vec384 out, const vec384 inp) +{ + RECIPROCAL_MOD_BLS12_381_P(out, inp, vec384); +} +# undef RECIPROCAL_MOD_BLS12_381_P + +# include "sqrt-addchain.h" +static void recip_sqrt_fp_3mod4(vec384 out, const vec384 inp) +{ + RECIP_SQRT_MOD_BLS12_381_P(out, inp, vec384); +} +# undef RECIP_SQRT_MOD_BLS12_381_P + +# undef sqr_n_mul +# undef sqr +# undef mul +#endif + +static limb_t recip_sqrt_fp(vec384 out, const vec384 inp) +{ + vec384 t0, t1; + limb_t ret; + + recip_sqrt_fp_3mod4(t0, inp); + + mul_fp(t1, t0, inp); + sqr_fp(t1, t1); + ret = vec_is_equal(t1, inp, sizeof(t1)); + vec_copy(out, t0, sizeof(t0)); + + return ret; +} + +static limb_t sqrt_fp(vec384 out, const vec384 inp) +{ + vec384 t0, t1; + limb_t ret; + + recip_sqrt_fp_3mod4(t0, inp); + + mul_fp(t0, t0, inp); + sqr_fp(t1, t0); + ret = vec_is_equal(t1, inp, sizeof(t1)); + vec_copy(out, t0, sizeof(t0)); + + return ret; +} diff --git a/src/exp2.c b/src/exp2.c new file mode 100644 index 00000000..3f19b683 --- /dev/null +++ b/src/exp2.c @@ -0,0 +1,179 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "vect.h" +#include "fields.h" + +static void reciprocal_fp2(vec384x out, const vec384x inp) +{ + vec384 t0, t1; + + /* + * |out| = 1/(a + b*i) = a/(a^2+b^2) - b/(a^2+b^2)*i + */ + sqr_fp(t0, inp[0]); + sqr_fp(t1, inp[1]); + add_fp(t0, t0, t1); + reciprocal_fp(t1, t0); + mul_fp(out[0], inp[0], t1); + mul_fp(out[1], inp[1], t1); + neg_fp(out[1], out[1]); +} + +/* + * |out| = |inp|^|pow|, small footprint, public exponent + */ +static void exp_mont_384x(vec384x out, const vec384x inp, const limb_t *pow, + size_t pow_bits, const vec384 p, limb_t n0) +{ + vec384x ret; + + vec_copy(ret, inp, sizeof(ret)); /* |ret| = |inp|^1 */ + --pow_bits; /* most significant bit is accounted for, skip over */ + while (pow_bits--) { + sqr_mont_384x(ret, ret, p, n0); + if (is_bit_set(pow, pow_bits)) + mul_mont_384x(ret, ret, inp, p, n0); + } + vec_copy(out, ret, sizeof(ret)); /* |out| = |ret| */ +} + +#ifdef __OPTIMIZE_SIZE__ +static void recip_sqrt_fp2_9mod16(vec384x out, const vec384x inp) +{ + static const limb_t BLS_12_381_P_2_minus_9_div_16[] = { + TO_LIMB_T(0xb26aa00001c718e3), TO_LIMB_T(0xd7ced6b1d76382ea), + TO_LIMB_T(0x3162c338362113cf), TO_LIMB_T(0x966bf91ed3e71b74), + TO_LIMB_T(0xb292e85a87091a04), TO_LIMB_T(0x11d68619c86185c7), + TO_LIMB_T(0xef53149330978ef0), TO_LIMB_T(0x050a62cfd16ddca6), + TO_LIMB_T(0x466e59e49349e8bd), TO_LIMB_T(0x9e2dc90e50e7046b), + TO_LIMB_T(0x74bd278eaa22f25e), TO_LIMB_T(0x002a437a4b8c35fc) + }; + + exp_mont_384x(out, inp, BLS_12_381_P_2_minus_9_div_16, 758, + BLS12_381_P, p0); +} +#else +static void sqr_n_mul_fp2(vec384x out, const vec384x a, size_t count, + const vec384x b) +{ + while(count--) { + sqr_mont_382x(out, a, BLS12_381_P, p0); + a = out; + } + mul_mont_384x(out, out, b, BLS12_381_P, p0); +} + +# define sqr(ret,a) sqr_fp2(ret,a) +# define mul(ret,a,b) mul_fp2(ret,a,b) +# define 
sqr_n_mul(ret,a,n,b) sqr_n_mul_fp2(ret,a,n,b) + +# include "sqrt2-addchain.h" +static void recip_sqrt_fp2_9mod16(vec384x out, const vec384x inp) +{ + RECIP_SQRT_MOD_BLS12_381_P2(out, inp, vec384x); +} +# undef RECIP_SQRT_MOD_BLS12_381_P2 + +# undef sqr_n_mul +# undef sqr +# undef mul +#endif + +static limb_t sqrt_align_fp2(vec384x out, const vec384x ret, + const vec384x sqrt, const vec384x inp) +{ + static const vec384x sqrt_minus_1 = { { 0 }, { ONE_MONT_P } }; + static const vec384x sqrt_sqrt_minus_1 = { + /* + * "magic" number is ±2^((p-3)/4)%p, which is "1/sqrt(2)", + * in quotes because 2*"1/sqrt(2)"^2 == -1 mod p, not 1, + * but it pivots into "complex" plane nevertheless... + */ + { TO_LIMB_T(0x3e2f585da55c9ad1), TO_LIMB_T(0x4294213d86c18183), + TO_LIMB_T(0x382844c88b623732), TO_LIMB_T(0x92ad2afd19103e18), + TO_LIMB_T(0x1d794e4fac7cf0b9), TO_LIMB_T(0x0bd592fc7d825ec8) }, + { TO_LIMB_T(0x7bcfa7a25aa30fda), TO_LIMB_T(0xdc17dec12a927e7c), + TO_LIMB_T(0x2f088dd86b4ebef1), TO_LIMB_T(0xd1ca2087da74d4a7), + TO_LIMB_T(0x2da2596696cebc1d), TO_LIMB_T(0x0e2b7eedbbfd87d2) } + }; + static const vec384x sqrt_minus_sqrt_minus_1 = { + { TO_LIMB_T(0x7bcfa7a25aa30fda), TO_LIMB_T(0xdc17dec12a927e7c), + TO_LIMB_T(0x2f088dd86b4ebef1), TO_LIMB_T(0xd1ca2087da74d4a7), + TO_LIMB_T(0x2da2596696cebc1d), TO_LIMB_T(0x0e2b7eedbbfd87d2) }, + { TO_LIMB_T(0x7bcfa7a25aa30fda), TO_LIMB_T(0xdc17dec12a927e7c), + TO_LIMB_T(0x2f088dd86b4ebef1), TO_LIMB_T(0xd1ca2087da74d4a7), + TO_LIMB_T(0x2da2596696cebc1d), TO_LIMB_T(0x0e2b7eedbbfd87d2) } + }; + vec384x coeff, t0, t1; + limb_t is_sqrt, flag; + + /* + * Instead of multiple trial squarings we can perform just one + * and see if the result is "rotated by multiple of 90°" in + * relation to |inp|, and "rotate" |ret| accordingly. + */ + sqr_fp2(t0, sqrt); + /* "sqrt(|inp|)"^2 = (a + b*i)^2 = (a^2-b^2) + 2ab*i */ + + /* (a^2-b^2) + 2ab*i == |inp| ? |ret| is spot on */ + sub_fp2(t1, t0, inp); + is_sqrt = vec_is_zero(t1, sizeof(t1)); + vec_copy(coeff, BLS12_381_Rx.p2, sizeof(coeff)); + + /* -(a^2-b^2) - 2ab*i == |inp| ? "rotate |ret| by 90°" */ + add_fp2(t1, t0, inp); + vec_select(coeff, sqrt_minus_1, coeff, sizeof(coeff), + flag = vec_is_zero(t1, sizeof(t1))); + is_sqrt |= flag; + + /* 2ab - (a^2-b^2)*i == |inp| ? "rotate |ret| by 135°" */ + sub_fp(t1[0], t0[0], inp[1]); + add_fp(t1[1], t0[1], inp[0]); + vec_select(coeff, sqrt_sqrt_minus_1, coeff, sizeof(coeff), + flag = vec_is_zero(t1, sizeof(t1))); + is_sqrt |= flag; + + /* -2ab + (a^2-b^2)*i == |inp| ? "rotate |ret| by 45°" */ + add_fp(t1[0], t0[0], inp[1]); + sub_fp(t1[1], t0[1], inp[0]); + vec_select(coeff, sqrt_minus_sqrt_minus_1, coeff, sizeof(coeff), + flag = vec_is_zero(t1, sizeof(t1))); + is_sqrt |= flag; + + /* actual "rotation" */ + mul_fp2(out, ret, coeff); + + return is_sqrt; +} + +static limb_t recip_sqrt_fp2(vec384x out, const vec384x inp) +{ + vec384x ret, sqrt; + + recip_sqrt_fp2_9mod16(ret, inp); + mul_fp2(sqrt, ret, inp); + + /* + * Now see if |ret| is or can be made 1/sqrt(|inp|)... + */ + + return sqrt_align_fp2(out, ret, sqrt, inp); +} + +static limb_t sqrt_fp2(vec384x out, const vec384x inp) +{ + vec384x ret; + + recip_sqrt_fp2_9mod16(ret, inp); + mul_fp2(ret, ret, inp); + + /* + * Now see if |ret| is or can be made sqrt(|inp|)... 
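An explanatory note on the alignment step, added here: with p^2 ≡ 9 (mod 16), the candidate computed above satisfies

   c   = inp * inp^((p^2-9)/16) = inp^((p^2+7)/16)
   c^2 = inp * inp^((p^2-1)/8)

and inp^((p^2-1)/8) is an 8th root of unity, in fact a 4th root of unity ζ whenever |inp| is a square. The four comparisons in sqrt_align_fp2() identify ζ and "rotate" the candidate by a precomputed constant whose square is 1/ζ, which is what the sqrt_minus_1/sqrt_sqrt_minus_1 tables provide.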
+ */ + + return sqrt_align_fp2(out, ret, ret, inp); +} diff --git a/src/exports.c b/src/exports.c new file mode 100644 index 00000000..49647dc3 --- /dev/null +++ b/src/exports.c @@ -0,0 +1,452 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +/* + * Why this file? Overall goal is to ensure that all internal calls + * remain internal after linking application. This is to both + * + * a) minimize possibility of external name conflicts (since all + * non-blst-prefixed and [assembly subroutines] remain static); + * b) preclude possibility of unintentional internal reference + * overload in shared library context (one can achieve same + * effect with -Bsymbolic, but we don't want to rely on end-user + * to remember to use it); + */ + +#include "fields.h" + +/* + * BLS12-381-specifc Fr shortcuts to assembly. + */ +void blst_fr_add(vec256 ret, const vec256 a, const vec256 b) +{ add_mod_256(ret, a, b, BLS12_381_r); } + +void blst_fr_sub(vec256 ret, const vec256 a, const vec256 b) +{ sub_mod_256(ret, a, b, BLS12_381_r); } + +void blst_fr_mul_by_3(vec256 ret, const vec256 a) +{ mul_by_3_mod_256(ret, a, BLS12_381_r); } + +void blst_fr_lshift(vec256 ret, const vec256 a, size_t count) +{ lshift_mod_256(ret, a, count, BLS12_381_r); } + +void blst_fr_rshift(vec256 ret, const vec256 a, size_t count) +{ rshift_mod_256(ret, a, count, BLS12_381_r); } + +void blst_fr_mul(vec256 ret, const vec256 a, const vec256 b) +{ mul_mont_sparse_256(ret, a, b, BLS12_381_r, r0); } + +void blst_fr_sqr(vec256 ret, const vec256 a) +{ sqr_mont_sparse_256(ret, a, BLS12_381_r, r0); } + +void blst_fr_cneg(vec256 ret, const vec256 a, size_t flag) +{ cneg_mod_256(ret, a, flag, BLS12_381_r); } + +void blst_fr_to(vec256 ret, const vec256 a) +{ mul_mont_sparse_256(ret, a, BLS12_381_rRR, BLS12_381_r, r0); } + +void blst_fr_from(vec256 ret, const vec256 a) +{ from_mont_256(ret, a, BLS12_381_r, r0); } + +/* + * BLS12-381-specifc Fp shortcuts to assembly. + */ +void blst_fp_add(vec384 ret, const vec384 a, const vec384 b) +{ add_fp(ret, a, b); } + +void blst_fp_sub(vec384 ret, const vec384 a, const vec384 b) +{ sub_fp(ret, a, b); } + +void blst_fp_mul_by_3(vec384 ret, const vec384 a) +{ mul_by_3_fp(ret, a); } + +void blst_fp_mul_by_8(vec384 ret, const vec384 a) +{ mul_by_8_fp(ret, a); } + +void blst_fp_lshift(vec384 ret, const vec384 a, size_t count) +{ lshift_fp(ret, a, count); } + +void blst_fp_mul(vec384 ret, const vec384 a, const vec384 b) +{ mul_fp(ret, a, b); } + +void blst_fp_sqr(vec384 ret, const vec384 a) +{ sqr_fp(ret, a); } + +void blst_fp_cneg(vec384 ret, const vec384 a, size_t flag) +{ cneg_fp(ret, a, flag); } + +void blst_fp_eucl_inverse(vec384 ret, const vec384 a) +{ eucl_inverse_fp(ret, a); } + +void blst_fp_to(vec384 ret, const vec384 a) +{ mul_fp(ret, a, BLS12_381_RR); } + +void blst_fp_from(vec384 ret, const vec384 a) +{ from_fp(ret, a); } + +/* + * Fp serialization/deserialization. 
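A hypothetical caller sketch for the Fr wrappers above: the arithmetic operates on Montgomery representation, so ordinary values are bracketed by blst_fr_to() and blst_fr_from().

/* Illustration only: c = a*b mod r, with a and b given in ordinary form. */
void fr_mul_demo(vec256 c, const vec256 a, const vec256 b)
{
    vec256 am, bm;

    blst_fr_to(am, a);          /* a*R mod r   */
    blst_fr_to(bm, b);          /* b*R mod r   */
    blst_fr_mul(c, am, bm);     /* a*b*R mod r */
    blst_fr_from(c, c);         /* a*b mod r   */
}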
+ */ +void blst_fp_from_uint32(vec384 ret, const unsigned int a[12]) +{ + if (sizeof(limb_t) == 8) { + int i; + for (i = 0; i < 6; i++) + ret[i] = a[2*i] | ((limb_t)a[2*i+1] << 32); + a = (const unsigned int *)ret; + } + mul_fp(ret, (const limb_t *)a, BLS12_381_RR); +} + +void blst_uint32_from_fp(unsigned int ret[12], const vec384 a) +{ + if (sizeof(limb_t) == 4) { + from_fp((limb_t *)ret, a); + } else { + vec384 out; + int i; + + from_fp(out, a); + for (i = 0; i < 6; i++) { + limb_t limb = out[i]; + ret[2*i] = (unsigned int)limb; + ret[2*i+1] = (unsigned int)(limb >> 32); + } + } +} + +void blst_fp_from_uint64(vec384 ret, const unsigned long long a[6]) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + + if (sizeof(limb_t) == 4 && !is_endian.little) { + int i; + for (i = 0; i < 6; i++) { + unsigned long long limb = a[i]; + ret[2*i] = (limb_t)limb; + ret[2*i+1] = (limb_t)(limb >> 32); + } + a = (const unsigned long long *)ret; + } + mul_fp(ret, (const limb_t *)a, BLS12_381_RR); +} + +void blst_uint64_from_fp(unsigned long long ret[6], const vec384 a) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + + if (sizeof(limb_t) == 8 || is_endian.little) { + from_fp((limb_t *)ret, a); + } else { + vec384 out; + int i; + + from_fp(out, a); + for (i = 0; i < 6; i++) + ret[i] = out[2*i] | ((unsigned long long)out[2*i+1] << 32); + } +} + +void blst_fp_from_bendian(vec384 ret, const unsigned char a[48]) +{ + vec384 out; + + limbs_from_be_bytes(out, a, sizeof(vec384)); + mul_fp(ret, out, BLS12_381_RR); +} + +void blst_bendian_from_fp(unsigned char ret[48], const vec384 a) +{ + vec384 out; + + from_fp(out, a); + be_bytes_from_limbs(ret, out, sizeof(vec384)); +} + +void blst_fp_from_lendian(vec384 ret, const unsigned char a[48]) +{ + vec384 out; + + limbs_from_le_bytes(out, a, sizeof(vec384)); + mul_fp(ret, out, BLS12_381_RR); +} + +void blst_lendian_from_fp(unsigned char ret[48], const vec384 a) +{ + vec384 out; + + from_fp(out, a); + le_bytes_from_limbs(ret, out, sizeof(vec384)); +} + +/* + * BLS12-381-specifc Fp2 shortcuts to assembly. + */ +void blst_fp2_add(vec384x ret, const vec384x a, const vec384x b) +{ add_fp2(ret, a, b); } + +void blst_fp2_sub(vec384x ret, const vec384x a, const vec384x b) +{ sub_fp2(ret, a, b); } + +void blst_fp2_mul_by_3(vec384x ret, const vec384x a) +{ mul_by_3_fp2(ret, a); } + +void blst_fp2_mul_by_8(vec384x ret, const vec384x a) +{ mul_by_8_fp2(ret, a); } + +void blst_fp2_lshift(vec384x ret, const vec384x a, size_t count) +{ lshift_fp2(ret, a, count); } + +void blst_fp2_mul(vec384x ret, const vec384x a, const vec384x b) +{ mul_fp2(ret, a, b); } + +void blst_fp2_sqr(vec384x ret, const vec384x a) +{ sqr_fp2(ret, a); } + +void blst_fp2_cneg(vec384x ret, const vec384x a, size_t flag) +{ cneg_fp2(ret, a, flag); } + +/* + * BLS12-381-specifc point operations. 
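And a matching sketch for the Fp helpers above: the big-endian conversions already fold the Montgomery conversion in, so a round trip needs no explicit blst_fp_to()/blst_fp_from().

/* Illustration only: out = in^2 in Fp, both as 48-byte big-endian strings. */
void fp_sqr_demo(unsigned char out[48], const unsigned char in[48])
{
    vec384 t;

    blst_fp_from_bendian(t, in);    /* deserialize, convert to Montgomery */
    blst_fp_sqr(t, t);              /* square in the field */
    blst_bendian_from_fp(out, t);   /* convert back and serialize */
}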
+ */ +void blst_p1_add(POINTonE1 *out, const POINTonE1 *a, const POINTonE1 *b) +{ POINTonE1_add(out, a, b); } + +void blst_p1_add_or_double(POINTonE1 *out, const POINTonE1 *a, + const POINTonE1 *b) +{ POINTonE1_dadd(out, a, b, NULL); } + +void blst_p1_add_affine(POINTonE1 *out, const POINTonE1 *a, const POINTonE1 *b) +{ POINTonE1_add_affine(out, a, b); } + +void blst_p1_add_or_double_affine(POINTonE1 *out, const POINTonE1 *a, + const POINTonE1 *b) +{ POINTonE1_dadd_affine(out, a, b); } + +void blst_p1_double(POINTonE1 *out, const POINTonE1 *a) +{ POINTonE1_double(out, a); } + +void blst_p1_mult_w5(POINTonE1 *out, const POINTonE1 *a, + const limb_t *scalar, size_t nbits) +{ POINTonE1_mult_w5(out, a, scalar, nbits); } + +limb_t blst_p1_affine_is_equal(const POINTonE1_affine *a, + const POINTonE1_affine *b) +{ return vec_is_equal(a, b, sizeof(*a)); } + +void blst_p2_add(POINTonE2 *out, const POINTonE2 *a, const POINTonE2 *b) +{ POINTonE2_add(out, a, b); } + +void blst_p2_add_or_double(POINTonE2 *out, const POINTonE2 *a, + const POINTonE2 *b) +{ POINTonE2_dadd(out, a, b, NULL); } + +void blst_p2_add_affine(POINTonE2 *out, const POINTonE2 *a, const POINTonE2 *b) +{ POINTonE2_add_affine(out, a, b); } + +void blst_p2_add_or_double_affine(POINTonE2 *out, const POINTonE2 *a, + const POINTonE2 *b) +{ POINTonE2_dadd_affine(out, a, b); } + +void blst_p2_double(POINTonE2 *out, const POINTonE2 *a) +{ POINTonE2_double(out, a); } + +void blst_p2_mult_w5(POINTonE2 *out, const POINTonE2 *a, + const limb_t *scalar, size_t nbits) +{ POINTonE2_mult_w5(out, a, scalar, nbits); } + +limb_t blst_p2_affine_is_equal(const POINTonE2_affine *a, + const POINTonE2_affine *b) +{ return vec_is_equal(a, b, sizeof(*a)); } + +/* + * Scalar serialization/deseriazation + */ +#ifdef __UINTPTR_TYPE__ +typedef __UINTPTR_TYPE__ uptr_t; +#else +typedef const void *uptr_t; +#endif + +void blst_scalar_from_uint32(vec256 ret, const unsigned int a[8]) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + + if ((uptr_t)ret==(uptr_t)a && (sizeof(limb_t)==4 || is_endian.little)) + return; + + if (sizeof(limb_t)==4) { + vec_copy(ret, a, sizeof(vec256)); + } else { + int i; + for (i = 0; i < 4; i++) + ret[i] = a[2*i] | ((limb_t)a[2*i+1] << 32); + } +} + +void blst_uint32_from_scalar(unsigned int ret[8], const vec256 a) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + + if ((uptr_t)ret==(uptr_t)a && (sizeof(limb_t)==4 || is_endian.little)) + return; + + if (sizeof(limb_t)==4) { + vec_copy(ret, a, sizeof(vec256)); + } else { + int i; + for (i = 0; i < 4; i++) { + limb_t limb = a[i]; + ret[2*i] = (unsigned int)limb; + ret[2*i+1] = (unsigned int)(limb >> 32); + } + } +} + +void blst_scalar_from_uint64(vec256 ret, const unsigned long long a[4]) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + + if (sizeof(limb_t)==8 || is_endian.little) { + if ((uptr_t)ret != (uptr_t)a) + vec_copy(ret, a, sizeof(vec256)); + } else { + int i; + for (i = 0; i < 4; i++) { + unsigned long long limb = a[i]; + ret[2*i] = (limb_t)limb; + ret[2*i+1] = (limb_t)(limb >> 32); + } + } +} + +void blst_uint64_from_scalar(unsigned long long ret[4], const vec256 a) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + + if (sizeof(limb_t)==8 || is_endian.little) { + if ((uptr_t)ret != (uptr_t)a) + vec_copy(ret, a, sizeof(vec256)); + } else { + int i; + for (i = 0; i < 4; i++) + ret[i] = a[2*i] | ((unsigned long long)a[2*i+1] << 32); + } +} + +void blst_scalar_from_bendian(vec256 ret, const unsigned 
char a[32]) +{ + vec256 out; + limbs_from_be_bytes(out, a, sizeof(out)); + vec_copy(ret, out, sizeof(out)); +} + +void blst_bendian_from_scalar(unsigned char ret[32], const vec256 a) +{ + vec256 out; + vec_copy(out, a, sizeof(out)); + be_bytes_from_limbs(ret, out, sizeof(out)); +} + +void blst_scalar_from_lendian(vec256 ret, const unsigned char a[32]) +{ + vec256 out; + const union { + long one; + char little; + } is_endian = { 1 }; + + if ((uptr_t)ret==(uptr_t)a && is_endian.little) + return; + + limbs_from_le_bytes(out, a, sizeof(out)); + vec_copy(ret, out, sizeof(out)); +} + +void blst_lendian_from_scalar(unsigned char ret[32], const vec256 a) +{ + vec256 out; + const union { + long one; + char little; + } is_endian = { 1 }; + + if ((uptr_t)ret==(uptr_t)a && is_endian.little) + return; + + vec_copy(out, a, sizeof(out)); + le_bytes_from_limbs(ret, out, sizeof(out)); +} + +limb_t blst_scalar_fr_check(const vec256 a) +{ + vec256 zero = { 0 }; + + add_mod_256(zero, zero, a, BLS12_381_r); + return vec_is_equal(zero, a, sizeof(zero)); +} + +/* + * Test facilitator + */ +static unsigned char nibble(unsigned char c) +{ + if (c >= '0' && c <= '9') + return c - '0'; + else if (c >= 'a' && c <= 'f') + return 10 + c - 'a'; + else if (c >= 'A' && c <= 'F') + return 10 + c - 'A'; + else + return 16; +} + +static void limbs_from_hexascii(limb_t *ret, size_t sz, + const unsigned char *hex) +{ + size_t len; + limb_t limb = 0; + + if (hex[0]=='0' && (hex[1]=='x' || hex[1]=='X')) + hex += 2; + + for (len = 0; len<2*sz && nibble(hex[len])<16; len++) ; + + vec_zero(ret, sz); + + while(len--) { + limb <<= 4; + limb |= nibble(*hex++); + if (len % (2*sizeof(limb_t)) == 0) + ret[len / (2*sizeof(limb_t))] = limb; + } +} + +void blst_scalar_from_hexascii(vec256 ret, const unsigned char *hex) +{ limbs_from_hexascii(ret, sizeof(vec256), hex); } + +void blst_fp_from_hexascii(vec384 ret, const unsigned char *hex) +{ + limbs_from_hexascii(ret, sizeof(vec384), hex); + mul_fp(ret, ret, BLS12_381_RR); +} diff --git a/src/fields.h b/src/fields.h new file mode 100644 index 00000000..190bab58 --- /dev/null +++ b/src/fields.h @@ -0,0 +1,96 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLS12_381_ASM_FIELDS_H__ +#define __BLS12_381_ASM_FIELDS_H__ + +#include "vect.h" +#include "consts.h" + +/* + * BLS12-381-specifc Fp shortcuts to assembly. 
+ */ +static inline void add_fp(vec384 ret, const vec384 a, const vec384 b) +{ add_mod_384(ret, a, b, BLS12_381_P); } + +static inline void sub_fp(vec384 ret, const vec384 a, const vec384 b) +{ sub_mod_384(ret, a, b, BLS12_381_P); } + +static inline void mul_by_3_fp(vec384 ret, const vec384 a) +{ mul_by_3_mod_384(ret, a, BLS12_381_P); } + +static inline void mul_by_8_fp(vec384 ret, const vec384 a) +{ mul_by_8_mod_384(ret, a, BLS12_381_P); } + +static inline void lshift_fp(vec384 ret, const vec384 a, size_t count) +{ lshift_mod_384(ret, a, count, BLS12_381_P); } + +static inline void mul_fp(vec384 ret, const vec384 a, const vec384 b) +{ mul_mont_384(ret, a, b, BLS12_381_P, p0); } + +static inline void sqr_fp(vec384 ret, const vec384 a) +{ sqr_mont_384(ret, a, BLS12_381_P, p0); } + +static inline void cneg_fp(vec384 ret, const vec384 a, limb_t flag) +{ cneg_mod_384(ret, a, flag, BLS12_381_P); } + +#define neg_fp(r,a) cneg_fp((r),(a),1) + +static inline void eucl_inverse_fp(vec384 ret, const vec384 a) +{ eucl_inverse_mod_384(ret, a, BLS12_381_P, BLS12_381_RR); } + +static inline void from_fp(vec384 ret, const vec384 a) +{ from_mont_384(ret, a, BLS12_381_P, p0); } + +/* + * BLS12-381-specifc Fp2 shortcuts to assembly. + */ +static inline void add_fp2(vec384x ret, const vec384x a, const vec384x b) +{ add_mod_384x(ret, a, b, BLS12_381_P); } + +static inline void sub_fp2(vec384x ret, const vec384x a, const vec384x b) +{ sub_mod_384x(ret, a, b, BLS12_381_P); } + +static inline void mul_by_3_fp2(vec384x ret, const vec384x a) +{ mul_by_3_mod_384x(ret, a, BLS12_381_P); } + +static inline void mul_by_8_fp2(vec384x ret, const vec384x a) +{ mul_by_8_mod_384x(ret, a, BLS12_381_P); } + +static inline void lshift_fp2(vec384x ret, const vec384x a, size_t count) +{ + lshift_mod_384(ret[0], a[0], count, BLS12_381_P); + lshift_mod_384(ret[1], a[1], count, BLS12_381_P); +} + +static inline void mul_fp2(vec384x ret, const vec384x a, const vec384x b) +{ mul_mont_384x(ret, a, b, BLS12_381_P, p0); } + +static inline void sqr_fp2(vec384x ret, const vec384x a) +{ sqr_mont_384x(ret, a, BLS12_381_P, p0); } + +static inline void cneg_fp2(vec384x ret, const vec384x a, limb_t flag) +{ + cneg_mod_384(ret[0], a[0], flag, BLS12_381_P); + cneg_mod_384(ret[1], a[1], flag, BLS12_381_P); +} + +#define neg_fp2(r,a) cneg_fp2((r),(a),1) + +typedef vec384x vec384fp2; +typedef vec384fp2 vec384fp6[3]; +typedef vec384fp6 vec384fp12[2]; + +static void sqr_fp12(vec384fp12 ret, const vec384fp12 a); +static void cyclotomic_sqr_fp12(vec384fp12 ret, const vec384fp12 a); +static void mul_fp12(vec384fp12 ret, const vec384fp12 a, const vec384fp12 b); +static void mul_by_xy00z0_fp12(vec384fp12 ret, const vec384fp12 a, + const vec384fp6 xy00z0); +static void conjugate_fp12(vec384fp12 a); +static void inverse_fp12(vec384fp12 ret, const vec384fp12 a); +/* caveat lector! |n| has to be non-zero and not more than 3! */ +static void frobenius_map_fp12(vec384fp12 ret, const vec384fp12 a, size_t n); + +#endif /* __BLS12_381_ASM_FIELDS_H__ */ diff --git a/src/fp12_tower.c b/src/fp12_tower.c new file mode 100644 index 00000000..714e81e3 --- /dev/null +++ b/src/fp12_tower.c @@ -0,0 +1,785 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include "fields.h" + +/* + * Fp2 = Fp[u] / (u^2 + 1) + * Fp6 = Fp2[v] / (v^3 - u - 1) + * Fp12 = Fp6[w] / (w^2 - v) + */ + +static inline void mul_by_u_plus_1_fp2(vec384x ret, const vec384x a) +{ mul_by_1_plus_i_mod_384x(ret, a, BLS12_381_P); } + +#if 1 +#define __FP2x2__ +/* + * Fp2x2 is a "widened" version of Fp2, which allows to consolidate + * reductions from several multiplications. In other words instead of + * "mul_redc-mul_redc-add" we get "mul-mul-add-redc," where latter + * addition is double-width... To be more specific this gives ~7-10% + * faster pairing depending on platform... + */ +typedef vec768 vec768x[2]; + +static inline void add_fp2x2(vec768x ret, const vec768x a, const vec768x b) +{ + add_mod_384x384(ret[0], a[0], b[0], BLS12_381_P); + add_mod_384x384(ret[1], a[1], b[1], BLS12_381_P); +} + +static inline void sub_fp2x2(vec768x ret, const vec768x a, const vec768x b) +{ + sub_mod_384x384(ret[0], a[0], b[0], BLS12_381_P); + sub_mod_384x384(ret[1], a[1], b[1], BLS12_381_P); +} + +static inline void mul_by_u_plus_1_fp2x2(vec768x ret, const vec768x a) +{ + /* caveat lector! |ret| may not be same as |a| */ + sub_mod_384x384(ret[0], a[0], a[1], BLS12_381_P); + add_mod_384x384(ret[1], a[0], a[1], BLS12_381_P); +} + +static inline void redc_fp2x2(vec384x ret, const vec768x a) +{ + redc_mont_384(ret[0], a[0], BLS12_381_P, p0); + redc_mont_384(ret[1], a[1], BLS12_381_P, p0); +} + +static void mul_fp2x2(vec768x ret, const vec384x a, const vec384x b) +{ +#if 1 + mul_382x(ret, a, b, BLS12_381_P); /* +~6% in Miller loop */ +#else + union { vec384 x[2]; vec768 x2; } t; + + add_mod_384(t.x[0], a[0], a[1], BLS12_381_P); + add_mod_384(t.x[1], b[0], b[1], BLS12_381_P); + mul_384(ret[1], t.x[0], t.x[1]); + + mul_384(ret[0], a[0], b[0]); + mul_384(t.x2, a[1], b[1]); + + sub_mod_384x384(ret[1], ret[1], ret[0], BLS12_381_P); + sub_mod_384x384(ret[1], ret[1], t.x2, BLS12_381_P); + + sub_mod_384x384(ret[0], ret[0], t.x2, BLS12_381_P); +#endif +} + +static void sqr_fp2x2(vec768x ret, const vec384x a) +{ +#if 1 + sqr_382x(ret, a, BLS12_381_P); /* +~5% in final exponentiation */ +#else + vec384 t0, t1; + + add_mod_384(t0, a[0], a[1], BLS12_381_P); + sub_mod_384(t1, a[0], a[1], BLS12_381_P); + + mul_384(ret[1], a[0], a[1]); + add_mod_384x384(ret[1], ret[1], ret[1], BLS12_381_P); + + mul_384(ret[0], t0, t1); +#endif +} +#endif /* __FP2x2__ */ + +/* + * Fp6 extension + */ +#if defined(__FP2x2__) /* ~10-13% improvement for mul_fp12 and sqr_fp12 */ +typedef vec768x vec768fp6[3]; + +static inline void sub_fp6x2(vec768fp6 ret, const vec768fp6 a, + const vec768fp6 b) +{ + sub_fp2x2(ret[0], a[0], b[0]); + sub_fp2x2(ret[1], a[1], b[1]); + sub_fp2x2(ret[2], a[2], b[2]); +} + +static void mul_fp6x2(vec768fp6 ret, const vec384fp6 a, const vec384fp6 b) +{ + vec768x t0, t1, t2; + vec384x aa, bb; + + mul_fp2x2(t0, a[0], b[0]); + mul_fp2x2(t1, a[1], b[1]); + mul_fp2x2(t2, a[2], b[2]); + + /* ret[0] = ((a1 + a2)*(b1 + b2) - a1*b1 - a2*b2)*(u+1) + a0*b0 + = (a1*b2 + a2*b1)*(u+1) + a0*b0 */ + add_fp2(aa, a[1], a[2]); + add_fp2(bb, b[1], b[2]); + mul_fp2x2(ret[0], aa, bb); + sub_fp2x2(ret[0], ret[0], t1); + sub_fp2x2(ret[0], ret[0], t2); + mul_by_u_plus_1_fp2x2(ret[1], ret[0]); /* borrow ret[1] for a moment */ + add_fp2x2(ret[0], ret[1], t0); + + /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + a2*b2*(u+1) + = a0*b1 + a1*b0 + a2*b2*(u+1) */ + add_fp2(aa, a[0], a[1]); + add_fp2(bb, b[0], b[1]); + mul_fp2x2(ret[1], aa, bb); + sub_fp2x2(ret[1], ret[1], t0); + 
sub_fp2x2(ret[1], ret[1], t1); + mul_by_u_plus_1_fp2x2(ret[2], t2); /* borrow ret[2] for a moment */ + add_fp2x2(ret[1], ret[1], ret[2]); + + /* ret[2] = (a0 + a2)*(b0 + b2) - a0*b0 - a2*b2 + a1*b1 + = a0*b2 + a2*b0 + a1*b1 */ + add_fp2(aa, a[0], a[2]); + add_fp2(bb, b[0], b[2]); + mul_fp2x2(ret[2], aa, bb); + sub_fp2x2(ret[2], ret[2], t0); + sub_fp2x2(ret[2], ret[2], t2); + add_fp2x2(ret[2], ret[2], t1); +} + +static inline void redc_fp6x2(vec384fp6 ret, const vec768fp6 a) +{ + redc_fp2x2(ret[0], a[0]); + redc_fp2x2(ret[1], a[1]); + redc_fp2x2(ret[2], a[2]); +} + +static void mul_fp6(vec384fp6 ret, const vec384fp6 a, const vec384fp6 b) +{ + vec768fp6 r; + + mul_fp6x2(r, a, b); + redc_fp6x2(ret, r); /* narrow to normal width */ +} + +static void sqr_fp6(vec384fp6 ret, const vec384fp6 a) +{ + vec768x s0, m01, m12, s2, rx; + + sqr_fp2x2(s0, a[0]); + + mul_fp2x2(m01, a[0], a[1]); + add_fp2x2(m01, m01, m01); + + mul_fp2x2(m12, a[1], a[2]); + add_fp2x2(m12, m12, m12); + + sqr_fp2x2(s2, a[2]); + + /* ret[2] = (a0 + a1 + a2)^2 - a0^2 - a2^2 - 2*(a0*a1) - 2*(a1*a2) + = a1^2 + 2*(a0*a2) */ + add_fp2(ret[2], a[2], a[1]); + add_fp2(ret[2], ret[2], a[0]); + sqr_fp2x2(rx, ret[2]); + sub_fp2x2(rx, rx, s0); + sub_fp2x2(rx, rx, s2); + sub_fp2x2(rx, rx, m01); + sub_fp2x2(rx, rx, m12); + redc_fp2x2(ret[2], rx); + + /* ret[0] = a0^2 + 2*(a1*a2)*(u+1) */ + mul_by_u_plus_1_fp2x2(rx, m12); + add_fp2x2(rx, rx, s0); + redc_fp2x2(ret[0], rx); + + /* ret[1] = a2^2*(u+1) + 2*(a0*a1) */ + mul_by_u_plus_1_fp2x2(rx, s2); + add_fp2x2(rx, rx, m01); + redc_fp2x2(ret[1], rx); +} +#else +static void mul_fp6(vec384fp6 ret, const vec384fp6 a, const vec384fp6 b) +{ + vec384x t0, t1, t2, t3, t4, t5; + + mul_fp2(t0, a[0], b[0]); + mul_fp2(t1, a[1], b[1]); + mul_fp2(t2, a[2], b[2]); + + /* ret[0] = ((a1 + a2)*(b1 + b2) - a1*b1 - a2*b2)*(u+1) + a0*b0 + = (a1*b2 + a2*b1)*(u+1) + a0*b0 */ + add_fp2(t4, a[1], a[2]); + add_fp2(t5, b[1], b[2]); + mul_fp2(t3, t4, t5); + sub_fp2(t3, t3, t1); + sub_fp2(t3, t3, t2); + mul_by_u_plus_1_fp2(t3, t3); + /* add_fp2(ret[0], t3, t0); considering possible aliasing... */ + + /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + a2*b2*(u+1) + = a0*b1 + a1*b0 + a2*b2*(u+1) */ + add_fp2(t4, a[0], a[1]); + add_fp2(t5, b[0], b[1]); + mul_fp2(ret[1], t4, t5); + sub_fp2(ret[1], ret[1], t0); + sub_fp2(ret[1], ret[1], t1); + mul_by_u_plus_1_fp2(t4, t2); + add_fp2(ret[1], ret[1], t4); + + /* ret[2] = (a0 + a2)*(b0 + b2) - a0*b0 - a2*b2 + a1*b1 + = a0*b2 + a2*b0 + a1*b1 */ + add_fp2(t4, a[0], a[2]); + add_fp2(t5, b[0], b[2]); + mul_fp2(ret[2], t4, t5); + sub_fp2(ret[2], ret[2], t0); + sub_fp2(ret[2], ret[2], t2); + add_fp2(ret[2], ret[2], t1); + + add_fp2(ret[0], t3, t0); /* ... 
moved from above */ +} + +static void sqr_fp6(vec384fp6 ret, const vec384fp6 a) +{ + vec384x s0, m01, m12, s2; + + sqr_fp2(s0, a[0]); + + mul_fp2(m01, a[0], a[1]); + add_fp2(m01, m01, m01); + + mul_fp2(m12, a[1], a[2]); + add_fp2(m12, m12, m12); + + sqr_fp2(s2, a[2]); + + /* ret[2] = (a0 + a1 + a2)^2 - a0^2 - a2^2 - 2*(a0*a1) - 2*(a1*a2) + = a1^2 + 2*(a0*a2) */ + add_fp2(ret[2], a[2], a[1]); + add_fp2(ret[2], ret[2], a[0]); + sqr_fp2(ret[2], ret[2]); + sub_fp2(ret[2], ret[2], s0); + sub_fp2(ret[2], ret[2], s2); + sub_fp2(ret[2], ret[2], m01); + sub_fp2(ret[2], ret[2], m12); + + /* ret[0] = a0^2 + 2*(a1*a2)*(u+1) */ + mul_by_u_plus_1_fp2(ret[0], m12); + add_fp2(ret[0], ret[0], s0); + + /* ret[1] = a2^2*(u+1) + 2*(a0*a1) */ + mul_by_u_plus_1_fp2(ret[1], s2); + add_fp2(ret[1], ret[1], m01); +} +#endif + +static void add_fp6(vec384fp6 ret, const vec384fp6 a, const vec384fp6 b) +{ + add_fp2(ret[0], a[0], b[0]); + add_fp2(ret[1], a[1], b[1]); + add_fp2(ret[2], a[2], b[2]); +} + +static void sub_fp6(vec384fp6 ret, const vec384fp6 a, const vec384fp6 b) +{ + sub_fp2(ret[0], a[0], b[0]); + sub_fp2(ret[1], a[1], b[1]); + sub_fp2(ret[2], a[2], b[2]); +} + +static void neg_fp6(vec384fp6 ret, const vec384fp6 a) +{ + neg_fp2(ret[0], a[0]); + neg_fp2(ret[1], a[1]); + neg_fp2(ret[2], a[2]); +} + +#if 0 +#define mul_by_v_fp6 mul_by_v_fp6 +static void mul_by_v_fp6(vec384fp6 ret, const vec384fp6 a) +{ + vec384x t; + + mul_by_u_plus_1_fp2(t, a[2]); + vec_copy(ret[2], a[1], sizeof(a[1])); + vec_copy(ret[1], a[0], sizeof(a[0])); + vec_copy(ret[0], t, sizeof(t)); +} +#endif + +/* + * Fp12 extension + */ +#if defined(__FP2x2__) +static void mul_fp12(vec384fp12 ret, const vec384fp12 a, const vec384fp12 b) +{ + vec768fp6 t0, t1, rx; + vec384fp6 t2; + + mul_fp6x2(t0, a[0], b[0]); + mul_fp6x2(t1, a[1], b[1]); + + /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + = a0*b1 + a1*b0 */ + add_fp6(t2, a[0], a[1]); + add_fp6(ret[1], b[0], b[1]); + mul_fp6x2(rx, ret[1], t2); + sub_fp6x2(rx, rx, t0); + sub_fp6x2(rx, rx, t1); + redc_fp6x2(ret[1], rx); + + /* ret[0] = a0*b0 + a1*b1*v */ + mul_by_u_plus_1_fp2x2(rx[0], t1[2]); + add_fp2x2(rx[0], t0[0], rx[0]); + add_fp2x2(rx[1], t0[1], t1[0]); + add_fp2x2(rx[2], t0[2], t1[1]); + redc_fp6x2(ret[0], rx); +} + +static inline void mul_by_0y0_fp6x2(vec768fp6 ret, const vec384fp6 a, + const vec384fp2 b) +{ + mul_fp2x2(ret[1], a[2], b); /* borrow ret[1] for a moment */ + mul_by_u_plus_1_fp2x2(ret[0], ret[1]); + mul_fp2x2(ret[1], a[0], b); + mul_fp2x2(ret[2], a[1], b); +} + +static void mul_by_xy0_fp6x2(vec768fp6 ret, const vec384fp6 a, + const vec384fp6 b) +{ + vec768x t0, t1; + vec384x aa, bb; + + mul_fp2x2(t0, a[0], b[0]); + mul_fp2x2(t1, a[1], b[1]); + + /* ret[0] = ((a1 + a2)*(b1 + 0) - a1*b1 - a2*0)*(u+1) + a0*b0 + = (a1*0 + a2*b1)*(u+1) + a0*b0 */ + mul_fp2x2(ret[1], a[2], b[1]); /* borrow ret[1] for a moment */ + mul_by_u_plus_1_fp2x2(ret[0], ret[1]); + add_fp2x2(ret[0], ret[0], t0); + + /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + a2*0*(u+1) + = a0*b1 + a1*b0 + a2*0*(u+1) */ + add_fp2(aa, a[0], a[1]); + add_fp2(bb, b[0], b[1]); + mul_fp2x2(ret[1], aa, bb); + sub_fp2x2(ret[1], ret[1], t0); + sub_fp2x2(ret[1], ret[1], t1); + + /* ret[2] = (a0 + a2)*(b0 + 0) - a0*b0 - a2*0 + a1*b1 + = a0*0 + a2*b0 + a1*b1 */ + mul_fp2x2(ret[2], a[2], b[0]); + add_fp2x2(ret[2], ret[2], t1); +} + +static void mul_by_xy00z0_fp12(vec384fp12 ret, const vec384fp12 a, + const vec384fp6 xy00z0) +{ + vec768fp6 t0, t1, rr; + vec384fp6 t2; + + mul_by_xy0_fp6x2(t0, a[0], xy00z0); + mul_by_0y0_fp6x2(t1, 
a[1], xy00z0[2]); + + /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + = a0*b1 + a1*b0 */ + vec_copy(t2[0], xy00z0[0], sizeof(t2[0])); + add_fp2(t2[1], xy00z0[1], xy00z0[2]); + add_fp6(ret[1], a[0], a[1]); + mul_by_xy0_fp6x2(rr, ret[1], t2); + sub_fp6x2(rr, rr, t0); + sub_fp6x2(rr, rr, t1); + redc_fp6x2(ret[1], rr); + + /* ret[0] = a0*b0 + a1*b1*v */ + mul_by_u_plus_1_fp2x2(rr[0], t1[2]); + add_fp2x2(rr[0], t0[0], rr[0]); + add_fp2x2(rr[1], t0[1], t1[0]); + add_fp2x2(rr[2], t0[2], t1[1]); + redc_fp6x2(ret[0], rr); +} +#else +static void mul_fp12(vec384fp12 ret, const vec384fp12 a, const vec384fp12 b) +{ + vec384fp6 t0, t1, t2; + + mul_fp6(t0, a[0], b[0]); + mul_fp6(t1, a[1], b[1]); + + /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + = a0*b1 + a1*b0 */ + add_fp6(t2, a[0], a[1]); + add_fp6(ret[1], b[0], b[1]); + mul_fp6(ret[1], ret[1], t2); + sub_fp6(ret[1], ret[1], t0); + sub_fp6(ret[1], ret[1], t1); + + /* ret[0] = a0*b0 + a1*b1*v */ +#ifdef mul_by_v_fp6 + mul_by_v_fp6(t1, t1); + add_fp6(ret[0], t0, t1); +#else + mul_by_u_plus_1_fp2(t1[2], t1[2]); + add_fp2(ret[0][0], t0[0], t1[2]); + add_fp2(ret[0][1], t0[1], t1[0]); + add_fp2(ret[0][2], t0[2], t1[1]); +#endif +} + +static inline void mul_by_0y0_fp6(vec384fp6 ret, const vec384fp6 a, + const vec384fp2 b) +{ + vec384x t; + + mul_fp2(t, a[2], b); + mul_fp2(ret[2], a[1], b); + mul_fp2(ret[1], a[0], b); + mul_by_u_plus_1_fp2(ret[0], t); +} + +static void mul_by_xy0_fp6(vec384fp6 ret, const vec384fp6 a, const vec384fp6 b) +{ + vec384x t0, t1, /*t2,*/ t3, t4, t5; + + mul_fp2(t0, a[0], b[0]); + mul_fp2(t1, a[1], b[1]); + + /* ret[0] = ((a1 + a2)*(b1 + 0) - a1*b1 - a2*0)*(u+1) + a0*b0 + = (a1*0 + a2*b1)*(u+1) + a0*b0 */ + mul_fp2(t3, a[2], b[1]); + mul_by_u_plus_1_fp2(t3, t3); + /* add_fp2(ret[0], t3, t0); considering possible aliasing... */ + + /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + a2*0*(u+1) + = a0*b1 + a1*b0 + a2*0*(u+1) */ + add_fp2(t4, a[0], a[1]); + add_fp2(t5, b[0], b[1]); + mul_fp2(ret[1], t4, t5); + sub_fp2(ret[1], ret[1], t0); + sub_fp2(ret[1], ret[1], t1); + + /* ret[2] = (a0 + a2)*(b0 + 0) - a0*b0 - a2*0 + a1*b1 + = a0*0 + a2*b0 + a1*b1 */ + mul_fp2(ret[2], a[2], b[0]); + add_fp2(ret[2], ret[2], t1); + + add_fp2(ret[0], t3, t0); /* ... 
moved from above */ +} + +static void mul_by_xy00z0_fp12(vec384fp12 ret, const vec384fp12 a, + const vec384fp6 xy00z0) +{ + vec384fp6 t0, t1, t2; + + mul_by_xy0_fp6(t0, a[0], xy00z0); + mul_by_0y0_fp6(t1, a[1], xy00z0[2]); + + /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + = a0*b1 + a1*b0 */ + vec_copy(t2[0], xy00z0[0], sizeof(t2[0])); + add_fp2(t2[1], xy00z0[1], xy00z0[2]); + add_fp6(ret[1], a[0], a[1]); + mul_by_xy0_fp6(ret[1], ret[1], t2); + sub_fp6(ret[1], ret[1], t0); + sub_fp6(ret[1], ret[1], t1); + + /* ret[0] = a0*b0 + a1*b1*v */ +#ifdef mul_by_v_fp6 + mul_by_v_fp6(t1, t1); + add_fp6(ret[0], t0, t1); +#else + mul_by_u_plus_1_fp2(t1[2], t1[2]); + add_fp2(ret[0][0], t0[0], t1[2]); + add_fp2(ret[0][1], t0[1], t1[0]); + add_fp2(ret[0][2], t0[2], t1[1]); +#endif +} +#endif + +static void sqr_fp12(vec384fp12 ret, const vec384fp12 a) +{ + vec384fp6 t0, t1; + + add_fp6(t0, a[0], a[1]); +#ifdef mul_by_v_fp6 + mul_by_v_fp6(t1, a[1]); + add_fp6(t1, a[0], t1); +#else + mul_by_u_plus_1_fp2(t1[2], a[1][2]); + add_fp2(t1[0], a[0][0], t1[2]); + add_fp2(t1[1], a[0][1], a[1][0]); + add_fp2(t1[2], a[0][2], a[1][1]); +#endif + mul_fp6(t0, t0, t1); + mul_fp6(t1, a[0], a[1]); + + /* ret[1] = 2*(a0*a1) */ + add_fp6(ret[1], t1, t1); + + /* ret[0] = (a0 + a1)*(a0 + a1*v) - a0*a1 - a0*a1*v + = a0^2 + a1^2*v */ + sub_fp6(ret[0], t0, t1); +#ifdef mul_by_v_fp6 + mul_by_v_fp6(t1, t1); + sub_fp6(ret[0], ret[0], t1); +#else + mul_by_u_plus_1_fp2(t1[2], t1[2]); + sub_fp2(ret[0][0], ret[0][0], t1[2]); + sub_fp2(ret[0][1], ret[0][1], t1[0]); + sub_fp2(ret[0][2], ret[0][2], t1[1]); +#endif +} + +static void conjugate_fp12(vec384fp12 a) +{ neg_fp6(a[1], a[1]); } + +static void inverse_fp2(vec384x ret, const vec384x a) +{ + vec384 t0, t1; + + /* 1/(a0^2 + a1^2) */ + sqr_fp(t0, a[0]); + sqr_fp(t1, a[1]); + add_fp(t0, t0, t1); + /* It's assumed that "higher-dimension" operations are performed + * on public data, hence no requirement for constant-time-ness. 
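For reference, the identity being used here is the same one spelled out for reciprocal_fp2() in exp2.c: with u^2 = -1,

   1/(a0 + a1*u) = (a0 - a1*u) / (a0^2 + a1^2)

so a single Fp inversion of the norm a0^2 + a1^2 suffices, after which (a0, -a1) is scaled by the result.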
*/ + eucl_inverse_fp(t1, t0); + + mul_fp(ret[0], a[0], t1); + mul_fp(ret[1], a[1], t1); + neg_fp(ret[1], ret[1]); +} + +static void inverse_fp6(vec384fp6 ret, const vec384fp6 a) +{ + vec384x c0, c1, c2, t0, t1; + + /* c0 = a0^2 - (a1*a2)*(u+1) */ + sqr_fp2(c0, a[0]); + mul_fp2(t0, a[1], a[2]); + mul_by_u_plus_1_fp2(t0, t0); + sub_fp2(c0, c0, t0); + + /* c1 = a2^2*(u+1) - (a0*a1) */ + sqr_fp2(c1, a[2]); + mul_by_u_plus_1_fp2(c1, c1); + mul_fp2(t0, a[0], a[1]); + sub_fp2(c1, c1, t0); + + /* c2 = a1^2 - a0*a2 */ + sqr_fp2(c2, a[1]); + mul_fp2(t0, a[0], a[2]); + sub_fp2(c2, c2, t0); + + /* (a2*c1 + a1*c2)*(u+1) + a0*c0 */ + mul_fp2(t0, c1, a[2]); + mul_fp2(t1, c2, a[1]); + add_fp2(t0, t0, t1); + mul_by_u_plus_1_fp2(t0, t0); + mul_fp2(t1, c0, a[0]); + add_fp2(t0, t0, t1); + + inverse_fp2(t1, t0); + + mul_fp2(ret[0], c0, t1); + mul_fp2(ret[1], c1, t1); + mul_fp2(ret[2], c2, t1); +} + +static void inverse_fp12(vec384fp12 ret, const vec384fp12 a) +{ + vec384fp6 t0, t1; + + sqr_fp6(t0, a[0]); + sqr_fp6(t1, a[1]); +#ifdef mul_by_v_fp6 + mul_by_v_fp6(t1, t1); + sub_fp6(t0, t0, t1); +#else + mul_by_u_plus_1_fp2(t1[2], t1[2]); + sub_fp2(t0[0], t0[0], t1[2]); + sub_fp2(t0[1], t0[1], t1[0]); + sub_fp2(t0[2], t0[2], t1[1]); +#endif + + inverse_fp6(t1, t0); + + mul_fp6(ret[0], a[0], t1); + mul_fp6(ret[1], a[1], t1); + neg_fp6(ret[1], ret[1]); +} + +typedef vec384x vec384fp4[2]; + +#if defined(__FP2x2__) +static void sqr_fp4(vec384fp4 ret, const vec384x a0, const vec384x a1) +{ + vec768x t0, t1, t2; + + sqr_fp2x2(t0, a0); + sqr_fp2x2(t1, a1); + add_fp2(ret[1], a0, a1); + + mul_by_u_plus_1_fp2x2(t2, t1); + add_fp2x2(t2, t2, t0); + redc_fp2x2(ret[0], t2); + + sqr_fp2x2(t2, ret[1]); + sub_fp2x2(t2, t2, t0); + sub_fp2x2(t2, t2, t1); + redc_fp2x2(ret[1], t2); +} +#else +static void sqr_fp4(vec384fp4 ret, const vec384x a0, const vec384x a1) +{ + vec384x t0, t1; + + sqr_fp2(t0, a0); + sqr_fp2(t1, a1); + add_fp2(ret[1], a0, a1); + + mul_by_u_plus_1_fp2(ret[0], t1); + add_fp2(ret[0], ret[0], t0); + + sqr_fp2(ret[1], ret[1]); + sub_fp2(ret[1], ret[1], t0); + sub_fp2(ret[1], ret[1], t1); +} +#endif + +static void cyclotomic_sqr_fp12(vec384fp12 ret, const vec384fp12 a) +{ + vec384fp4 t0, t1, t2; + + sqr_fp4(t0, a[0][0], a[1][1]); + sqr_fp4(t1, a[1][0], a[0][2]); + sqr_fp4(t2, a[0][1], a[1][2]); + + sub_fp2(ret[0][0], t0[0], a[0][0]); + add_fp2(ret[0][0], ret[0][0], ret[0][0]); + add_fp2(ret[0][0], ret[0][0], t0[0]); + + sub_fp2(ret[0][1], t1[0], a[0][1]); + add_fp2(ret[0][1], ret[0][1], ret[0][1]); + add_fp2(ret[0][1], ret[0][1], t1[0]); + + sub_fp2(ret[0][2], t2[0], a[0][2]); + add_fp2(ret[0][2], ret[0][2], ret[0][2]); + add_fp2(ret[0][2], ret[0][2], t2[0]); + + mul_by_u_plus_1_fp2(t2[1], t2[1]); + add_fp2(ret[1][0], t2[1], a[1][0]); + add_fp2(ret[1][0], ret[1][0], ret[1][0]); + add_fp2(ret[1][0], ret[1][0], t2[1]); + + add_fp2(ret[1][1], t0[1], a[1][1]); + add_fp2(ret[1][1], ret[1][1], ret[1][1]); + add_fp2(ret[1][1], ret[1][1], t0[1]); + + add_fp2(ret[1][2], t1[1], a[1][2]); + add_fp2(ret[1][2], ret[1][2], ret[1][2]); + add_fp2(ret[1][2], ret[1][2], t1[1]); +} + +/* + * caveat lector! |n| has to be non-zero and not more than 3! 
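Some added context on why the map is this cheap: in Fp2 = Fp[u]/(u^2+1) with p ≡ 3 (mod 4),

   (a0 + a1*u)^p = a0^p + a1^p*u^p = a0 + a1*u^p        /* a0, a1 in Fp */
   u^p = (u^2)^((p-1)/2) * u = (-1)^((p-1)/2) * u = -u

so raising to the p-th power is plain conjugation. That is why frobenius_map_fp2() below only conditionally negates the second component, and the remaining work in the Fp6/Fp12 maps reduces to multiplication by the precomputed powers of (u+1) tabulated below.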
+ */ +static inline void frobenius_map_fp2(vec384x ret, const vec384x a, size_t n) +{ + vec_copy(ret[0], a[0], sizeof(ret[0])); + cneg_fp(ret[1], a[1], n & 1); +} + +static void frobenius_map_fp6(vec384fp6 ret, const vec384fp6 a, size_t n) +{ + static const vec384x coeffs1[] = { /* (u + 1)^((P^n - 1) / 3) */ + { { 0 }, + { TO_LIMB_T(0xcd03c9e48671f071), TO_LIMB_T(0x5dab22461fcda5d2), + TO_LIMB_T(0x587042afd3851b95), TO_LIMB_T(0x8eb60ebe01bacb9e), + TO_LIMB_T(0x03f97d6e83d050d2), TO_LIMB_T(0x18f0206554638741) } }, + { { TO_LIMB_T(0x30f1361b798a64e8), TO_LIMB_T(0xf3b8ddab7ece5a2a), + TO_LIMB_T(0x16a8ca3ac61577f7), TO_LIMB_T(0xc26a2ff874fd029b), + TO_LIMB_T(0x3636b76660701c6e), TO_LIMB_T(0x051ba4ab241b6160) } }, + { { 0 }, { ONE_MONT_P } } + }; + static const vec384 coeffs2[] = { /* (u + 1)^((2P^n - 2) / 3) */ + { TO_LIMB_T(0x890dc9e4867545c3), TO_LIMB_T(0x2af322533285a5d5), + TO_LIMB_T(0x50880866309b7e2c), TO_LIMB_T(0xa20d1b8c7e881024), + TO_LIMB_T(0x14e4f04fe2db9068), TO_LIMB_T(0x14e56d3f1564853a) }, + { TO_LIMB_T(0xcd03c9e48671f071), TO_LIMB_T(0x5dab22461fcda5d2), + TO_LIMB_T(0x587042afd3851b95), TO_LIMB_T(0x8eb60ebe01bacb9e), + TO_LIMB_T(0x03f97d6e83d050d2), TO_LIMB_T(0x18f0206554638741) }, + { TO_LIMB_T(0x43f5fffffffcaaae), TO_LIMB_T(0x32b7fff2ed47fffd), + TO_LIMB_T(0x07e83a49a2e99d69), TO_LIMB_T(0xeca8f3318332bb7a), + TO_LIMB_T(0xef148d1ea0f4c069), TO_LIMB_T(0x040ab3263eff0206) } + }; + + frobenius_map_fp2(ret[0], a[0], n); + frobenius_map_fp2(ret[1], a[1], n); + frobenius_map_fp2(ret[2], a[2], n); + --n; /* implied ONE_MONT_P at index 0 */ + mul_fp2(ret[1], ret[1], coeffs1[n]); + mul_fp(ret[2][0], ret[2][0], coeffs2[n]); + mul_fp(ret[2][1], ret[2][1], coeffs2[n]); +} + +static void frobenius_map_fp12(vec384fp12 ret, const vec384fp12 a, size_t n) +{ + static const vec384x coeffs[] = { /* (u + 1)^((P^n - 1) / 6) */ + { { TO_LIMB_T(0x07089552b319d465), TO_LIMB_T(0xc6695f92b50a8313), + TO_LIMB_T(0x97e83cccd117228f), TO_LIMB_T(0xa35baecab2dc29ee), + TO_LIMB_T(0x1ce393ea5daace4d), TO_LIMB_T(0x08f2220fb0fb66eb) }, + { TO_LIMB_T(0xb2f66aad4ce5d646), TO_LIMB_T(0x5842a06bfc497cec), + TO_LIMB_T(0xcf4895d42599d394), TO_LIMB_T(0xc11b9cba40a8e8d0), + TO_LIMB_T(0x2e3813cbe5a0de89), TO_LIMB_T(0x110eefda88847faf) } }, + { { TO_LIMB_T(0xecfb361b798dba3a), TO_LIMB_T(0xc100ddb891865a2c), + TO_LIMB_T(0x0ec08ff1232bda8e), TO_LIMB_T(0xd5c13cc6f1ca4721), + TO_LIMB_T(0x47222a47bf7b5c04), TO_LIMB_T(0x0110f184e51c5f59) } }, + { { TO_LIMB_T(0x3e2f585da55c9ad1), TO_LIMB_T(0x4294213d86c18183), + TO_LIMB_T(0x382844c88b623732), TO_LIMB_T(0x92ad2afd19103e18), + TO_LIMB_T(0x1d794e4fac7cf0b9), TO_LIMB_T(0x0bd592fc7d825ec8) }, + { TO_LIMB_T(0x7bcfa7a25aa30fda), TO_LIMB_T(0xdc17dec12a927e7c), + TO_LIMB_T(0x2f088dd86b4ebef1), TO_LIMB_T(0xd1ca2087da74d4a7), + TO_LIMB_T(0x2da2596696cebc1d), TO_LIMB_T(0x0e2b7eedbbfd87d2) } }, + }; + + frobenius_map_fp6(ret[0], a[0], n); + frobenius_map_fp6(ret[1], a[1], n); + --n; /* implied ONE_MONT_P at index 0 */ + mul_fp2(ret[1][0], ret[1][0], coeffs[n]); + mul_fp2(ret[1][1], ret[1][1], coeffs[n]); + mul_fp2(ret[1][2], ret[1][2], coeffs[n]); +} + + +/* + * BLS12-381-specifc Fp12 shortcuts. 
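+ *
+ * A minimal, purely illustrative sketch of how the wrappers compose
+ * (assuming some non-zero vec384fp12 |a|), not a statement of the
+ * public API:
+ *
+ *     vec384fp12 inv, prod;
+ *     blst_fp12_inverse(inv, a);
+ *     blst_fp12_mul(prod, a, inv);
+ *     // blst_fp12_is_one(prod) should now return a non-zero limb_t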
+ */ +void blst_fp12_sqr(vec384fp12 ret, const vec384fp12 a) +{ sqr_fp12(ret, a); } + +void blst_fp12_cyclotomic_sqr(vec384fp12 ret, const vec384fp12 a) +{ cyclotomic_sqr_fp12(ret, a); } + +void blst_fp12_mul(vec384fp12 ret, const vec384fp12 a, const vec384fp12 b) +{ mul_fp12(ret, a, b); } + +void blst_fp12_mul_by_xy00z0(vec384fp12 ret, const vec384fp12 a, + const vec384fp6 xy00z0) +{ mul_by_xy00z0_fp12(ret, a, xy00z0); } + +void blst_fp12_conjugate(vec384fp12 a) +{ conjugate_fp12(a); } + +void blst_fp12_inverse(vec384fp12 ret, const vec384fp12 a) +{ inverse_fp12(ret, a); } + +/* caveat lector! |n| has to be non-zero and not more than 3! */ +void blst_fp12_frobenius_map(vec384fp12 ret, const vec384fp12 a, size_t n) +{ frobenius_map_fp12(ret, a, n); } + +limb_t blst_fp12_is_equal(const vec384fp12 a, const vec384fp12 b) +{ return vec_is_equal(a, b, sizeof(vec384fp12)); } + +limb_t blst_fp12_is_one(const vec384fp12 a) +{ + return vec_is_equal(a[0][0], BLS12_381_Rx.p2, sizeof(a[0][0])) & + vec_is_zero(a[0][1], sizeof(vec384fp12) - sizeof(a[0][0])); +} diff --git a/src/hash_to_field.c b/src/hash_to_field.c new file mode 100644 index 00000000..aed1e688 --- /dev/null +++ b/src/hash_to_field.c @@ -0,0 +1,136 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "consts.h" +#include "sha256.h" + +static const vec384 BLS12_381_RRRR = { /* RR^2 */ + TO_LIMB_T(0xed48ac6bd94ca1e0), TO_LIMB_T(0x315f831e03a7adf8), + TO_LIMB_T(0x9a53352a615e29dd), TO_LIMB_T(0x34c04e5e921e1761), + TO_LIMB_T(0x2512d43565724728), TO_LIMB_T(0x0aa6346091755d4d) +}; + +static void sha256_init_Zpad(SHA256_CTX *ctx) +{ + ctx->h[0] = 0xda5698beU; + ctx->h[1] = 0x17b9b469U; + ctx->h[2] = 0x62335799U; + ctx->h[3] = 0x779fbecaU; + ctx->h[4] = 0x8ce5d491U; + ctx->h[5] = 0xc0d26243U; + ctx->h[6] = 0xbafef9eaU; + ctx->h[7] = 0x1837a9d8U; + ctx->N = 64; + vec_zero(ctx->buf, sizeof(ctx->buf)); + ctx->off = 0; +} + +static void vec_xor(void *restrict ret, const void *restrict a, + const void *restrict b, size_t num) +{ + limb_t *rp = (limb_t *)ret; + const limb_t *ap = (const limb_t *)a; + const limb_t *bp = (const limb_t *)b; + size_t i; + + num /= sizeof(limb_t); + + for (i = 0; i < num; i++) + rp[i] = ap[i] ^ bp[i]; +} + +static void expand_message_xmd(unsigned char *bytes, size_t len_in_bytes, + const unsigned char *aug, size_t aug_len, + const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len) +{ + union { limb_t align; unsigned char c[32]; } b_0; + union { limb_t align; unsigned char c[33+256+31]; } b_i; + unsigned char *p; + size_t i, b_i_bits, b_i_blocks; + SHA256_CTX ctx; + + /* + * compose template for 'strxor(b_0, b_(i-1)) || I2OSP(i, 1) || DST_prime' + */ + DST_len &= 0xff; /* just in case */ + b_i_blocks = ((33 + DST_len + 1 + 9) + 63) & -64; + vec_zero(b_i.c + b_i_blocks - 64, 64); + + p = b_i.c + 33; + for (i = 0; i < DST_len; i++) + p[i] = DST[i]; + p[i++] = (unsigned char)DST_len; + p[i++] = 0x80; + b_i_bits = (33 + DST_len + 1) * 8; + p = b_i.c + b_i_blocks; + p[-2] = (unsigned char)(b_i_bits >> 8); + p[-1] = (unsigned char)(b_i_bits); + + sha256_init_Zpad(&ctx); /* Z_pad | */ + sha256_update(&ctx, aug, aug_len); /* | aug | */ + sha256_update(&ctx, msg, msg_len); /* | msg | */ + /* | I2OSP(len_in_bytes, 2) || I2OSP(0, 1) || DST_prime */ + b_i.c[30] = (unsigned char)(len_in_bytes >> 8); + b_i.c[31] = (unsigned char)(len_in_bytes); + b_i.c[32] = 0; + sha256_update(&ctx, b_i.c + 
30, 3 + DST_len + 1); + sha256_final(b_0.c, &ctx); + + sha256_init_h(ctx.h); + vec_copy(b_i.c, b_0.c, 32); + ++b_i.c[32]; + sha256_block_data_order(ctx.h, b_i.c, b_i_blocks / 64); + sha256_emit(bytes, ctx.h); + + len_in_bytes /= 32; /* divisible by 64, remember? hence 32 works too */ + while (--len_in_bytes) { + sha256_init_h(ctx.h); + vec_xor(b_i.c, b_0.c, bytes, 32); + bytes += 32; + ++b_i.c[32]; + sha256_block_data_order(ctx.h, b_i.c, b_i_blocks / 64); + sha256_emit(bytes, ctx.h); + } +} + +/* + * |nelems| is 'count * m' from spec + */ +static void hash_to_field(vec384 elems[], size_t nelems, + const unsigned char *aug, size_t aug_len, + const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len) +{ + size_t L = sizeof(vec384) + 128/8; /* ceil((ceil(log2(p)) + k) / 8) */ + size_t len_in_bytes = L * nelems; /* divisible by 64, hurray! */ +#if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901 + limb_t *pseudo_random = alloca(len_in_bytes); +#else + limb_t pseudo_random[len_in_bytes/sizeof(limb_t)]; +#endif + unsigned char *bytes; + vec768 elem; + + aug_len = aug!=NULL ? aug_len : 0; + DST_len = DST!=NULL ? DST_len : 0; + + expand_message_xmd((unsigned char *)pseudo_random, len_in_bytes, + aug, aug_len, msg, msg_len, DST, DST_len); + + vec_zero(elem, sizeof(elem)); + bytes = (unsigned char *)pseudo_random; + while (nelems--) { + limbs_from_be_bytes(elem, bytes, L); + bytes += L; + /* + * L-bytes block % P, output is in Montgomery domain... + */ + redc_mont_384(elems[0], elem, BLS12_381_P, p0); + mul_mont_384(elems[0], elems[0], BLS12_381_RRRR, BLS12_381_P, p0); + elems++; + } +} diff --git a/src/keygen.c b/src/keygen.c new file mode 100644 index 00000000..28446770 --- /dev/null +++ b/src/keygen.c @@ -0,0 +1,161 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include "consts.h" +#include "sha256.h" + +typedef struct { + SHA256_CTX ctx; + unsigned int h_ipad[8]; + unsigned int h_opad[8]; + union { limb_t l[64/sizeof(limb_t)]; unsigned char c[64]; } tail; +} HMAC_SHA256_CTX; + +static void HMAC_init(HMAC_SHA256_CTX *ctx, const void *K, size_t K_len) +{ + size_t i; + + if (K == NULL) { /* reuse h_ipad and h_opad */ + sha256_hcopy(ctx->ctx.h, ctx->h_ipad); + ctx->ctx.N = 64; + vec_zero(ctx->ctx.buf, sizeof(ctx->ctx.buf)); + ctx->ctx.off = 0; + + return; + } + + vec_zero(ctx->tail.c, sizeof(ctx->tail)); + if (K_len > 64) { + sha256_init(&ctx->ctx); + sha256_update(&ctx->ctx, K, K_len); + sha256_final(ctx->tail.c, &ctx->ctx); + } else { + sha256_bcopy(ctx->tail.c, K, K_len); + } + + for (i = 0; i < 64/sizeof(limb_t); i++) + ctx->tail.l[i] ^= (limb_t)0x3636363636363636; + + sha256_init(&ctx->ctx); + sha256_update(&ctx->ctx, ctx->tail.c, 64); + sha256_hcopy(ctx->h_ipad, ctx->ctx.h); + + for (i = 0; i < 64/sizeof(limb_t); i++) + ctx->tail.l[i] ^= (limb_t)(0x3636363636363636 ^ 0x5c5c5c5c5c5c5c5c); + + sha256_init_h(ctx->h_opad); + sha256_block_data_order(ctx->h_opad, ctx->tail.c, 1); + + vec_zero(ctx->tail.c, sizeof(ctx->tail)); + ctx->tail.c[32] = 0x80; + ctx->tail.c[62] = 3; /* (64+32)*8 in big endian */ + ctx->tail.c[63] = 0; +} + +static void HMAC_update(HMAC_SHA256_CTX *ctx, const unsigned char *inp, + size_t len) +{ sha256_update(&ctx->ctx, inp, len); } + +static void HMAC_final(unsigned char md[32], HMAC_SHA256_CTX *ctx) +{ + sha256_final(ctx->tail.c, &ctx->ctx); + sha256_hcopy(ctx->ctx.h, ctx->h_opad); + sha256_block_data_order(ctx->ctx.h, ctx->tail.c, 1); + sha256_emit(md, ctx->ctx.h); +} + +static void HKDF_Extract(unsigned char PRK[32], + const void *salt, size_t salt_len, + const void *IKM, size_t IKM_len, + HMAC_SHA256_CTX *ctx) +{ + unsigned char zero[1] = { 0 }; + + HMAC_init(ctx, salt != NULL ? salt : zero, salt_len); + HMAC_update(ctx, IKM, IKM_len); +#ifndef HKDF_TESTMODE + /* Section 2.3 KeyGen in BLS-signature draft */ + HMAC_update(ctx, zero, 1); +#endif + HMAC_final(PRK, ctx); +} + +static void HKDF_Expand(unsigned char *OKM, size_t L, + const unsigned char PRK[32], + const void *info, size_t info_len, + HMAC_SHA256_CTX *ctx) +{ +#if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901 + unsigned char *info_prime = alloca(info_len + 2 + 1); +#else + unsigned char info_prime[info_len + 2 + 1]; +#endif + + HMAC_init(ctx, PRK, 32); + + if (info_len != 0) + sha256_bcopy(info_prime, info, info_len); +#ifndef HKDF_TESTMODE + /* Section 2.3 KeyGen in BLS-signature draft */ + info_prime[info_len + 0] = (unsigned char)(L >> 8); + info_prime[info_len + 1] = (unsigned char)(L); + info_len += 2; +#endif + info_prime[info_len] = 1; /* counter */ + HMAC_update(ctx, info_prime, info_len + 1); + HMAC_final(ctx->tail.c, ctx); + while (L > 32) { + sha256_hcopy((unsigned int *)OKM, (const unsigned int *)ctx->tail.c); + OKM += 32; L -= 32; + ++info_prime[info_len]; /* counter */ + HMAC_init(ctx, NULL, 0); + HMAC_update(ctx, ctx->tail.c, 32); + HMAC_update(ctx, info_prime, info_len + 1); + HMAC_final(ctx->tail.c, ctx); + } + sha256_bcopy(OKM, ctx->tail.c, L); +} + +#ifndef HKDF_TESTMODE +void blst_keygen(vec256 SK, const void *IKM, size_t IKM_len, + const void *info, size_t info_len) +{ + struct { + HMAC_SHA256_CTX ctx; + unsigned char PRK[32], OKM[48]; + vec512 key; + } scratch; + size_t i; + volatile limb_t *p; + + /* + * Vet |info| since some callers were caught to be sloppy, e.g. 
+ * SWIG-4.0-generated Python wrapper... + */ + info_len = info==NULL ? 0 : info_len; + + /* PRK = HKDF-Extract("BLS-SIG-KEYGEN-SALT-", IKM || I2OSP(0, 1)) */ + HKDF_Extract(scratch.PRK, "BLS-SIG-KEYGEN-SALT-", 20, + IKM, IKM_len, &scratch.ctx); + + /* OKM = HKDF-Expand(PRK, key_info || I2OSP(L, 2), L) */ + HKDF_Expand(scratch.OKM, sizeof(scratch.OKM), scratch.PRK, + info, info_len, &scratch.ctx); + + /* SK = OS2IP(OKM) mod r */ + vec_zero(scratch.key, sizeof(scratch.key)); + limbs_from_be_bytes(scratch.key, scratch.OKM, sizeof(scratch.OKM)); + redc_mont_256(SK, scratch.key, BLS12_381_r, r0); + mul_mont_sparse_256(SK, SK, BLS12_381_rRR, BLS12_381_r, r0); + + /* + * scrub the stack just in case next callee inadvertently flashes + * a fragment across application boundary... + */ + for(p = (limb_t *)&scratch, i = 0; i < sizeof(scratch)/sizeof(limb_t); i++) + p[i] = 0; +} +#endif diff --git a/src/map_to_g1.c b/src/map_to_g1.c new file mode 100644 index 00000000..43a2546c --- /dev/null +++ b/src/map_to_g1.c @@ -0,0 +1,533 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "point.h" +#include "fields.h" + +/* + * y^2 = x^3 + A'*x + B', isogenous one + */ +static const vec384 Aprime_E1 = { + /* (0x00144698a3b8e9433d693a02c96d4982b0ea985383ee66a8 + d8e8981aefd881ac98936f8da0e0f97f5cf428082d584c1d << 384) % P */ + TO_LIMB_T(0x2f65aa0e9af5aa51), TO_LIMB_T(0x86464c2d1e8416c3), + TO_LIMB_T(0xb85ce591b7bd31e2), TO_LIMB_T(0x27e11c91b5f24e7c), + TO_LIMB_T(0x28376eda6bfc1835), TO_LIMB_T(0x155455c3e5071d85) +}; +static const vec384 Bprime_E1 = { + /* (0x12e2908d11688030018b12e8753eee3b2016c1f0f24f4070 + a0b9c14fcef35ef55a23215a316ceaa5d1cc48e98e172be0 << 384) % P */ + TO_LIMB_T(0xfb996971fe22a1e0), TO_LIMB_T(0x9aa93eb35b742d6f), + TO_LIMB_T(0x8c476013de99c5c4), TO_LIMB_T(0x873e27c3a221e571), + TO_LIMB_T(0xca72b5e45a52d888), TO_LIMB_T(0x06824061418a386b) +}; + +static void map_fp_times_Zz(vec384 map[], const vec384 isogeny_map[], + const vec384 Zz_powers[], size_t n) +{ + while (n--) + mul_fp(map[n], isogeny_map[n], Zz_powers[n]); +} + +static void map_fp(vec384 acc, const vec384 x, const vec384 map[], size_t n) +{ + while (n--) { + mul_fp(acc, acc, x); + add_fp(acc, acc, map[n]); + } +} + +static void isogeny_map_to_E1(POINTonE1 *out, const POINTonE1 *p) +{ + /* + * x = x_num / x_den, where + * x_num = k_(1,11) * x'^11 + k_(1,10) * x'^10 + k_(1,9) * x'^9 + + * ... + k_(1,0) + * ... 
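+ *
+ * The input point lives on the isogenous curve produced by the
+ * simplified SWU map; this routine pushes it through the degree-11
+ * isogeny back onto E1. Since |p| is Jacobian (x' = X/Z^2,
+ * y' = Y/Z^3), the k coefficients are pre-multiplied by descending
+ * powers of Z^2 below, so the whole rational map is evaluated
+ * without any field inversion.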
+ */ + static const vec384 isogeny_map_x_num[] = { /* (k_(1,*)<<384) % P */ + { TO_LIMB_T(0x4d18b6f3af00131c), TO_LIMB_T(0x19fa219793fee28c), + TO_LIMB_T(0x3f2885f1467f19ae), TO_LIMB_T(0x23dcea34f2ffb304), + TO_LIMB_T(0xd15b58d2ffc00054), TO_LIMB_T(0x0913be200a20bef4) }, + { TO_LIMB_T(0x898985385cdbbd8b), TO_LIMB_T(0x3c79e43cc7d966aa), + TO_LIMB_T(0x1597e193f4cd233a), TO_LIMB_T(0x8637ef1e4d6623ad), + TO_LIMB_T(0x11b22deed20d827b), TO_LIMB_T(0x07097bc5998784ad) }, + { TO_LIMB_T(0xa542583a480b664b), TO_LIMB_T(0xfc7169c026e568c6), + TO_LIMB_T(0x5ba2ef314ed8b5a6), TO_LIMB_T(0x5b5491c05102f0e7), + TO_LIMB_T(0xdf6e99707d2a0079), TO_LIMB_T(0x0784151ed7605524) }, + { TO_LIMB_T(0x494e212870f72741), TO_LIMB_T(0xab9be52fbda43021), + TO_LIMB_T(0x26f5577994e34c3d), TO_LIMB_T(0x049dfee82aefbd60), + TO_LIMB_T(0x65dadd7828505289), TO_LIMB_T(0x0e93d431ea011aeb) }, + { TO_LIMB_T(0x90ee774bd6a74d45), TO_LIMB_T(0x7ada1c8a41bfb185), + TO_LIMB_T(0x0f1a8953b325f464), TO_LIMB_T(0x104c24211be4805c), + TO_LIMB_T(0x169139d319ea7a8f), TO_LIMB_T(0x09f20ead8e532bf6) }, + { TO_LIMB_T(0x6ddd93e2f43626b7), TO_LIMB_T(0xa5482c9aa1ccd7bd), + TO_LIMB_T(0x143245631883f4bd), TO_LIMB_T(0x2e0a94ccf77ec0db), + TO_LIMB_T(0xb0282d480e56489f), TO_LIMB_T(0x18f4bfcbb4368929) }, + { TO_LIMB_T(0x23c5f0c953402dfd), TO_LIMB_T(0x7a43ff6958ce4fe9), + TO_LIMB_T(0x2c390d3d2da5df63), TO_LIMB_T(0xd0df5c98e1f9d70f), + TO_LIMB_T(0xffd89869a572b297), TO_LIMB_T(0x1277ffc72f25e8fe) }, + { TO_LIMB_T(0x79f4f0490f06a8a6), TO_LIMB_T(0x85f894a88030fd81), + TO_LIMB_T(0x12da3054b18b6410), TO_LIMB_T(0xe2a57f6505880d65), + TO_LIMB_T(0xbba074f260e400f1), TO_LIMB_T(0x08b76279f621d028) }, + { TO_LIMB_T(0xe67245ba78d5b00b), TO_LIMB_T(0x8456ba9a1f186475), + TO_LIMB_T(0x7888bff6e6b33bb4), TO_LIMB_T(0xe21585b9a30f86cb), + TO_LIMB_T(0x05a69cdcef55feee), TO_LIMB_T(0x09e699dd9adfa5ac) }, + { TO_LIMB_T(0x0de5c357bff57107), TO_LIMB_T(0x0a0db4ae6b1a10b2), + TO_LIMB_T(0xe256bb67b3b3cd8d), TO_LIMB_T(0x8ad456574e9db24f), + TO_LIMB_T(0x0443915f50fd4179), TO_LIMB_T(0x098c4bf7de8b6375) }, + { TO_LIMB_T(0xe6b0617e7dd929c7), TO_LIMB_T(0xfe6e37d442537375), + TO_LIMB_T(0x1dafdeda137a489e), TO_LIMB_T(0xe4efd1ad3f767ceb), + TO_LIMB_T(0x4a51d8667f0fe1cf), TO_LIMB_T(0x054fdf4bbf1d821c) }, + { TO_LIMB_T(0x72db2a50658d767b), TO_LIMB_T(0x8abf91faa257b3d5), + TO_LIMB_T(0xe969d6833764ab47), TO_LIMB_T(0x464170142a1009eb), + TO_LIMB_T(0xb14f01aadb30be2f), TO_LIMB_T(0x18ae6a856f40715d) } + }; + /* ... + * x_den = x'^10 + k_(2,9) * x'^9 + k_(2,8) * x'^8 + ... 
+ k_(2,0) + */ + static const vec384 isogeny_map_x_den[] = { /* (k_(2,*)<<384) % P */ + { TO_LIMB_T(0xb962a077fdb0f945), TO_LIMB_T(0xa6a9740fefda13a0), + TO_LIMB_T(0xc14d568c3ed6c544), TO_LIMB_T(0xb43fc37b908b133e), + TO_LIMB_T(0x9c0b3ac929599016), TO_LIMB_T(0x0165aa6c93ad115f) }, + { TO_LIMB_T(0x23279a3ba506c1d9), TO_LIMB_T(0x92cfca0a9465176a), + TO_LIMB_T(0x3b294ab13755f0ff), TO_LIMB_T(0x116dda1c5070ae93), + TO_LIMB_T(0xed4530924cec2045), TO_LIMB_T(0x083383d6ed81f1ce) }, + { TO_LIMB_T(0x9885c2a6449fecfc), TO_LIMB_T(0x4a2b54ccd37733f0), + TO_LIMB_T(0x17da9ffd8738c142), TO_LIMB_T(0xa0fba72732b3fafd), + TO_LIMB_T(0xff364f36e54b6812), TO_LIMB_T(0x0f29c13c660523e2) }, + { TO_LIMB_T(0xe349cc118278f041), TO_LIMB_T(0xd487228f2f3204fb), + TO_LIMB_T(0xc9d325849ade5150), TO_LIMB_T(0x43a92bd69c15c2df), + TO_LIMB_T(0x1c2c7844bc417be4), TO_LIMB_T(0x12025184f407440c) }, + { TO_LIMB_T(0x587f65ae6acb057b), TO_LIMB_T(0x1444ef325140201f), + TO_LIMB_T(0xfbf995e71270da49), TO_LIMB_T(0xccda066072436a42), + TO_LIMB_T(0x7408904f0f186bb2), TO_LIMB_T(0x13b93c63edf6c015) }, + { TO_LIMB_T(0xfb918622cd141920), TO_LIMB_T(0x4a4c64423ecaddb4), + TO_LIMB_T(0x0beb232927f7fb26), TO_LIMB_T(0x30f94df6f83a3dc2), + TO_LIMB_T(0xaeedd424d780f388), TO_LIMB_T(0x06cc402dd594bbeb) }, + { TO_LIMB_T(0xd41f761151b23f8f), TO_LIMB_T(0x32a92465435719b3), + TO_LIMB_T(0x64f436e888c62cb9), TO_LIMB_T(0xdf70a9a1f757c6e4), + TO_LIMB_T(0x6933a38d5b594c81), TO_LIMB_T(0x0c6f7f7237b46606) }, + { TO_LIMB_T(0x693c08747876c8f7), TO_LIMB_T(0x22c9850bf9cf80f0), + TO_LIMB_T(0x8e9071dab950c124), TO_LIMB_T(0x89bc62d61c7baf23), + TO_LIMB_T(0xbc6be2d8dad57c23), TO_LIMB_T(0x17916987aa14a122) }, + { TO_LIMB_T(0x1be3ff439c1316fd), TO_LIMB_T(0x9965243a7571dfa7), + TO_LIMB_T(0xc7f7f62962f5cd81), TO_LIMB_T(0x32c6aa9af394361c), + TO_LIMB_T(0xbbc2ee18e1c227f4), TO_LIMB_T(0x0c102cbac531bb34) }, + { TO_LIMB_T(0x997614c97bacbf07), TO_LIMB_T(0x61f86372b99192c0), + TO_LIMB_T(0x5b8c95fc14353fc3), TO_LIMB_T(0xca2b066c2a87492f), + TO_LIMB_T(0x16178f5bbf698711), TO_LIMB_T(0x12a6dcd7f0f4e0e8) } + }; + /* + * y = y' * y_num / y_den, where + * y_num = k_(3,15) * x'^15 + k_(3,14) * x'^14 + k_(3,13) * x'^13 + + * ... + k_(3,0) + * ... 
+ */ + static const vec384 isogeny_map_y_num[] = { /* (k_(3,*)<<384) % P */ + { TO_LIMB_T(0x2b567ff3e2837267), TO_LIMB_T(0x1d4d9e57b958a767), + TO_LIMB_T(0xce028fea04bd7373), TO_LIMB_T(0xcc31a30a0b6cd3df), + TO_LIMB_T(0x7d7b18a682692693), TO_LIMB_T(0x0d300744d42a0310) }, + { TO_LIMB_T(0x99c2555fa542493f), TO_LIMB_T(0xfe7f53cc4874f878), + TO_LIMB_T(0x5df0608b8f97608a), TO_LIMB_T(0x14e03832052b49c8), + TO_LIMB_T(0x706326a6957dd5a4), TO_LIMB_T(0x0a8dadd9c2414555) }, + { TO_LIMB_T(0x13d942922a5cf63a), TO_LIMB_T(0x357e33e36e261e7d), + TO_LIMB_T(0xcf05a27c8456088d), TO_LIMB_T(0x0000bd1de7ba50f0), + TO_LIMB_T(0x83d0c7532f8c1fde), TO_LIMB_T(0x13f70bf38bbf2905) }, + { TO_LIMB_T(0x5c57fd95bfafbdbb), TO_LIMB_T(0x28a359a65e541707), + TO_LIMB_T(0x3983ceb4f6360b6d), TO_LIMB_T(0xafe19ff6f97e6d53), + TO_LIMB_T(0xb3468f4550192bf7), TO_LIMB_T(0x0bb6cde49d8ba257) }, + { TO_LIMB_T(0x590b62c7ff8a513f), TO_LIMB_T(0x314b4ce372cacefd), + TO_LIMB_T(0x6bef32ce94b8a800), TO_LIMB_T(0x6ddf84a095713d5f), + TO_LIMB_T(0x64eace4cb0982191), TO_LIMB_T(0x0386213c651b888d) }, + { TO_LIMB_T(0xa5310a31111bbcdd), TO_LIMB_T(0xa14ac0f5da148982), + TO_LIMB_T(0xf9ad9cc95423d2e9), TO_LIMB_T(0xaa6ec095283ee4a7), + TO_LIMB_T(0xcf5b1f022e1c9107), TO_LIMB_T(0x01fddf5aed881793) }, + { TO_LIMB_T(0x65a572b0d7a7d950), TO_LIMB_T(0xe25c2d8183473a19), + TO_LIMB_T(0xc2fcebe7cb877dbd), TO_LIMB_T(0x05b2d36c769a89b0), + TO_LIMB_T(0xba12961be86e9efb), TO_LIMB_T(0x07eb1b29c1dfde1f) }, + { TO_LIMB_T(0x93e09572f7c4cd24), TO_LIMB_T(0x364e929076795091), + TO_LIMB_T(0x8569467e68af51b5), TO_LIMB_T(0xa47da89439f5340f), + TO_LIMB_T(0xf4fa918082e44d64), TO_LIMB_T(0x0ad52ba3e6695a79) }, + { TO_LIMB_T(0x911429844e0d5f54), TO_LIMB_T(0xd03f51a3516bb233), + TO_LIMB_T(0x3d587e5640536e66), TO_LIMB_T(0xfa86d2a3a9a73482), + TO_LIMB_T(0xa90ed5adf1ed5537), TO_LIMB_T(0x149c9c326a5e7393) }, + { TO_LIMB_T(0x462bbeb03c12921a), TO_LIMB_T(0xdc9af5fa0a274a17), + TO_LIMB_T(0x9a558ebde836ebed), TO_LIMB_T(0x649ef8f11a4fae46), + TO_LIMB_T(0x8100e1652b3cdc62), TO_LIMB_T(0x1862bd62c291dacb) }, + { TO_LIMB_T(0x05c9b8ca89f12c26), TO_LIMB_T(0x0194160fa9b9ac4f), + TO_LIMB_T(0x6a643d5a6879fa2c), TO_LIMB_T(0x14665bdd8846e19d), + TO_LIMB_T(0xbb1d0d53af3ff6bf), TO_LIMB_T(0x12c7e1c3b28962e5) }, + { TO_LIMB_T(0xb55ebf900b8a3e17), TO_LIMB_T(0xfedc77ec1a9201c4), + TO_LIMB_T(0x1f07db10ea1a4df4), TO_LIMB_T(0x0dfbd15dc41a594d), + TO_LIMB_T(0x389547f2334a5391), TO_LIMB_T(0x02419f98165871a4) }, + { TO_LIMB_T(0xb416af000745fc20), TO_LIMB_T(0x8e563e9d1ea6d0f5), + TO_LIMB_T(0x7c763e17763a0652), TO_LIMB_T(0x01458ef0159ebbef), + TO_LIMB_T(0x8346fe421f96bb13), TO_LIMB_T(0x0d2d7b829ce324d2) }, + { TO_LIMB_T(0x93096bb538d64615), TO_LIMB_T(0x6f2a2619951d823a), + TO_LIMB_T(0x8f66b3ea59514fa4), TO_LIMB_T(0xf563e63704f7092f), + TO_LIMB_T(0x724b136c4cf2d9fa), TO_LIMB_T(0x046959cfcfd0bf49) }, + { TO_LIMB_T(0xea748d4b6e405346), TO_LIMB_T(0x91e9079c2c02d58f), + TO_LIMB_T(0x41064965946d9b59), TO_LIMB_T(0xa06731f1d2bbe1ee), + TO_LIMB_T(0x07f897e267a33f1b), TO_LIMB_T(0x1017290919210e5f) }, + { TO_LIMB_T(0x872aa6c17d985097), TO_LIMB_T(0xeecc53161264562a), + TO_LIMB_T(0x07afe37afff55002), TO_LIMB_T(0x54759078e5be6838), + TO_LIMB_T(0xc4b92d15db8acca8), TO_LIMB_T(0x106d87d1b51d13b9) } + }; + /* ... + * y_den = x'^15 + k_(4,14) * x'^14 + k_(4,13) * x'^13 + ... 
+ k_(4,0) + */ + static const vec384 isogeny_map_y_den[] = { /* (k_(4,*)<<384) % P */ + { TO_LIMB_T(0xeb6c359d47e52b1c), TO_LIMB_T(0x18ef5f8a10634d60), + TO_LIMB_T(0xddfa71a0889d5b7e), TO_LIMB_T(0x723e71dcc5fc1323), + TO_LIMB_T(0x52f45700b70d5c69), TO_LIMB_T(0x0a8b981ee47691f1) }, + { TO_LIMB_T(0x616a3c4f5535b9fb), TO_LIMB_T(0x6f5f037395dbd911), + TO_LIMB_T(0xf25f4cc5e35c65da), TO_LIMB_T(0x3e50dffea3c62658), + TO_LIMB_T(0x6a33dca523560776), TO_LIMB_T(0x0fadeff77b6bfe3e) }, + { TO_LIMB_T(0x2be9b66df470059c), TO_LIMB_T(0x24a2c159a3d36742), + TO_LIMB_T(0x115dbe7ad10c2a37), TO_LIMB_T(0xb6634a652ee5884d), + TO_LIMB_T(0x04fe8bb2b8d81af4), TO_LIMB_T(0x01c2a7a256fe9c41) }, + { TO_LIMB_T(0xf27bf8ef3b75a386), TO_LIMB_T(0x898b367476c9073f), + TO_LIMB_T(0x24482e6b8c2f4e5f), TO_LIMB_T(0xc8e0bbd6fe110806), + TO_LIMB_T(0x59b0c17f7631448a), TO_LIMB_T(0x11037cd58b3dbfbd) }, + { TO_LIMB_T(0x31c7912ea267eec6), TO_LIMB_T(0x1dbf6f1c5fcdb700), + TO_LIMB_T(0xd30d4fe3ba86fdb1), TO_LIMB_T(0x3cae528fbee9a2a4), + TO_LIMB_T(0xb1cce69b6aa9ad9a), TO_LIMB_T(0x044393bb632d94fb) }, + { TO_LIMB_T(0xc66ef6efeeb5c7e8), TO_LIMB_T(0x9824c289dd72bb55), + TO_LIMB_T(0x71b1a4d2f119981d), TO_LIMB_T(0x104fc1aafb0919cc), + TO_LIMB_T(0x0e49df01d942a628), TO_LIMB_T(0x096c3a09773272d4) }, + { TO_LIMB_T(0x9abc11eb5fadeff4), TO_LIMB_T(0x32dca50a885728f0), + TO_LIMB_T(0xfb1fa3721569734c), TO_LIMB_T(0xc4b76271ea6506b3), + TO_LIMB_T(0xd466a75599ce728e), TO_LIMB_T(0x0c81d4645f4cb6ed) }, + { TO_LIMB_T(0x4199f10e5b8be45b), TO_LIMB_T(0xda64e495b1e87930), + TO_LIMB_T(0xcb353efe9b33e4ff), TO_LIMB_T(0x9e9efb24aa6424c6), + TO_LIMB_T(0xf08d33680a237465), TO_LIMB_T(0x0d3378023e4c7406) }, + { TO_LIMB_T(0x7eb4ae92ec74d3a5), TO_LIMB_T(0xc341b4aa9fac3497), + TO_LIMB_T(0x5be603899e907687), TO_LIMB_T(0x03bfd9cca75cbdeb), + TO_LIMB_T(0x564c2935a96bfa93), TO_LIMB_T(0x0ef3c33371e2fdb5) }, + { TO_LIMB_T(0x7ee91fd449f6ac2e), TO_LIMB_T(0xe5d5bd5cb9357a30), + TO_LIMB_T(0x773a8ca5196b1380), TO_LIMB_T(0xd0fda172174ed023), + TO_LIMB_T(0x6cb95e0fa776aead), TO_LIMB_T(0x0d22d5a40cec7cff) }, + { TO_LIMB_T(0xf727e09285fd8519), TO_LIMB_T(0xdc9d55a83017897b), + TO_LIMB_T(0x7549d8bd057894ae), TO_LIMB_T(0x178419613d90d8f8), + TO_LIMB_T(0xfce95ebdeb5b490a), TO_LIMB_T(0x0467ffaef23fc49e) }, + { TO_LIMB_T(0xc1769e6a7c385f1b), TO_LIMB_T(0x79bc930deac01c03), + TO_LIMB_T(0x5461c75a23ede3b5), TO_LIMB_T(0x6e20829e5c230c45), + TO_LIMB_T(0x828e0f1e772a53cd), TO_LIMB_T(0x116aefa749127bff) }, + { TO_LIMB_T(0x101c10bf2744c10a), TO_LIMB_T(0xbbf18d053a6a3154), + TO_LIMB_T(0xa0ecf39ef026f602), TO_LIMB_T(0xfc009d4996dc5153), + TO_LIMB_T(0xb9000209d5bd08d3), TO_LIMB_T(0x189e5fe4470cd73c) }, + { TO_LIMB_T(0x7ebd546ca1575ed2), TO_LIMB_T(0xe47d5a981d081b55), + TO_LIMB_T(0x57b2b625b6d4ca21), TO_LIMB_T(0xb0a1ba04228520cc), + TO_LIMB_T(0x98738983c2107ff3), TO_LIMB_T(0x13dddbc4799d81d6) }, + { TO_LIMB_T(0x09319f2e39834935), TO_LIMB_T(0x039e952cbdb05c21), + TO_LIMB_T(0x55ba77a9a2f76493), TO_LIMB_T(0xfd04e3dfc6086467), + TO_LIMB_T(0xfb95832e7d78742e), TO_LIMB_T(0x0ef9c24eccaf5e0e) } + }; + vec384 Zz_powers[15], map[15], xn, xd, yn, yd; + + /* lay down Z^2 powers in descending order */ + sqr_fp(Zz_powers[14], p->Z); /* ZZ^1 */ +#ifdef __OPTIMIZE_SIZE__ + for (size_t i = 14; i > 0; i--) + mul_fp(Zz_powers[i-1], Zz_powers[i], Zz_powers[14]); +#else + sqr_fp(Zz_powers[13], Zz_powers[14]); /* ZZ^2 1+1 */ + mul_fp(Zz_powers[12], Zz_powers[14], Zz_powers[13]);/* ZZ^3 2+1 */ + sqr_fp(Zz_powers[11], Zz_powers[13]); /* ZZ^4 2+2 */ + mul_fp(Zz_powers[10], Zz_powers[13], Zz_powers[12]);/* ZZ^5 2+3 */ + 
sqr_fp(Zz_powers[9], Zz_powers[12]); /* ZZ^6 3+3 */ + mul_fp(Zz_powers[8], Zz_powers[12], Zz_powers[11]);/* ZZ^7 3+4 */ + sqr_fp(Zz_powers[7], Zz_powers[11]); /* ZZ^8 4+4 */ + mul_fp(Zz_powers[6], Zz_powers[11], Zz_powers[10]);/* ZZ^9 4+5 */ + sqr_fp(Zz_powers[5], Zz_powers[10]); /* ZZ^10 5+5 */ + mul_fp(Zz_powers[4], Zz_powers[10], Zz_powers[9]); /* ZZ^11 5+6 */ + sqr_fp(Zz_powers[3], Zz_powers[9]); /* ZZ^12 6+6 */ + mul_fp(Zz_powers[2], Zz_powers[9], Zz_powers[8]); /* ZZ^13 6+7 */ + sqr_fp(Zz_powers[1], Zz_powers[8]); /* ZZ^14 7+7 */ + mul_fp(Zz_powers[0], Zz_powers[8], Zz_powers[7]); /* ZZ^15 7+8 */ +#endif + + map_fp_times_Zz(map, isogeny_map_x_num, Zz_powers + 4, 11); + mul_fp(xn, p->X, isogeny_map_x_num[11]); + add_fp(xn, xn, map[10]); + map_fp(xn, p->X, map, 10); + + map_fp_times_Zz(map, isogeny_map_x_den, Zz_powers + 5, 10); + add_fp(xd, p->X, map[9]); + map_fp(xd, p->X, map, 9); + mul_fp(xd, xd, Zz_powers[14]); /* xd *= Z^2 */ + + map_fp_times_Zz(map, isogeny_map_y_num, Zz_powers, 15); + mul_fp(yn, p->X, isogeny_map_y_num[15]); + add_fp(yn, yn, map[14]); + map_fp(yn, p->X, map, 14); + mul_fp(yn, yn, p->Y); /* yn *= Y */ + + map_fp_times_Zz(map, isogeny_map_y_den, Zz_powers, 15); + add_fp(yd, p->X, map[14]); + map_fp(yd, p->X, map, 14); + mul_fp(Zz_powers[14], Zz_powers[14], p->Z); + mul_fp(yd, yd, Zz_powers[14]); /* yd *= Z^3 */ + + /* convert (xn, xd, yn, yd) to Jacobian coordinates */ + mul_fp(out->Z, xd, yd); /* Z = xd * yd */ + mul_fp(out->X, xn, yd); + mul_fp(out->X, out->X, out->Z); /* X = xn * xd * yd^2 */ + sqr_fp(out->Y, out->Z); + mul_fp(out->Y, out->Y, xd); + mul_fp(out->Y, out->Y, yn); /* Y = yn * xd^3 * yd^2 */ +} + +static void map_to_isogenous_E1(POINTonE1 *p, const vec384 u) +{ + static const vec384 minus_A = { /* P - A */ + TO_LIMB_T(0x8a9955f1650a005a), TO_LIMB_T(0x9865b3d192cfe93c), + TO_LIMB_T(0xaed3ed0f3ef3c441), TO_LIMB_T(0x3c962ef33d92c442), + TO_LIMB_T(0x22e438dbd74f94a2), TO_LIMB_T(0x04acbc265478c915) + }; + static const vec384 Z = { /* (11<<384) % P */ + TO_LIMB_T(0x886c00000023ffdc), TO_LIMB_T(0x0f70008d3090001d), + TO_LIMB_T(0x77672417ed5828c3), TO_LIMB_T(0x9dac23e943dc1740), + TO_LIMB_T(0x50553f1b9c131521), TO_LIMB_T(0x078c712fbe0ab6e8) + }; + static const vec384 sqrt_minus_ZZZ = { + TO_LIMB_T(0x43b571cad3215f1f), TO_LIMB_T(0xccb460ef1c702dc2), + TO_LIMB_T(0x742d884f4f97100b), TO_LIMB_T(0xdb2c3e3238a3382b), + TO_LIMB_T(0xe40f3fa13fce8f88), TO_LIMB_T(0x0073a2af9892a2ff) + }; + static const vec384 ZxA = { + TO_LIMB_T(0x7f674ea0a8915178), TO_LIMB_T(0xb0f945fc13b8fa65), + TO_LIMB_T(0x4b46759a38e87d76), TO_LIMB_T(0x2e7a929641bbb6a1), + TO_LIMB_T(0x1668ddfa462bf6b6), TO_LIMB_T(0x00960e2ed1cf294c) + }; + vec384 uu, tv2, x2n, gx1, gxd, y2; +#if 0 + vec384 xn, x1n, xd, y, y1, Zuu, tv4; +#else +# define xn p->X +# define y p->Y +# define xd p->Z +# define x1n xn +# define y1 y +# define Zuu x2n +# define tv4 y1 +#endif +#define sgn0_fp(a) (sgn0_pty_mont_384((a), BLS12_381_P, p0) & 1) + limb_t e1, e2; + + /* + * as per map_to_curve() from poc/sswu_opt.sage at + * https://github.com/cfrg/draft-irtf-cfrg-hash-to-curve + */ + /* x numerator variants */ + sqr_fp(uu, u); /* uu = u^2 */ + mul_fp(Zuu, Z, uu); /* Zuu = Z * uu */ + sqr_fp(tv2, Zuu); /* tv2 = Zuu^2 */ + add_fp(tv2, tv2, Zuu); /* tv2 = tv2 + Zuu */ + add_fp(x1n, tv2, BLS12_381_Rx.p); /* x1n = tv2 + 1 */ + mul_fp(x1n, x1n, Bprime_E1); /* x1n = x1n * B */ + mul_fp(x2n, Zuu, x1n); /* x2n = Zuu * x1n */ + + /* x denumenator */ + mul_fp(xd, minus_A, tv2); /* xd = -A * tv2 */ + e1 = vec_is_zero(xd, sizeof(xd)); 
/* e1 = xd == 0 */ + vec_select(xd, ZxA, xd, sizeof(xd), e1); /* # If xd == 0, set xd = Z*A */ + + /* y numerators variants */ + sqr_fp(tv2, xd); /* tv2 = xd^2 */ + mul_fp(gxd, xd, tv2); /* gxd = xd^3 */ + mul_fp(tv2, Aprime_E1, tv2); /* tv2 = A * tv2 */ + sqr_fp(gx1, x1n); /* gx1 = x1n^2 */ + add_fp(gx1, gx1, tv2); /* gx1 = gx1 + tv2 # x1n^2 + A*xd^2 */ + mul_fp(gx1, gx1, x1n); /* gx1 = gx1 * x1n # x1n^3 + A*x1n*xd^2 */ + mul_fp(tv2, Bprime_E1, gxd); /* tv2 = B * gxd */ + add_fp(gx1, gx1, tv2); /* gx1 = gx1 + tv2 # x1^3 + A*x1*xd^2 + B*xd^3 */ + sqr_fp(tv4, gxd); /* tv4 = gxd^2 */ + mul_fp(tv2, gx1, gxd); /* tv2 = gx1 * gxd */ + mul_fp(tv4, tv4, tv2); /* tv4 = tv4 * tv2 # gx1*gxd^3 */ + e2 = recip_sqrt_fp(y1, tv4); /* y1 = tv4^c1 # (gx1*gxd^3)^((p-3)/4) */ + mul_fp(y1, y1, tv2); /* y1 = y1 * tv2 # gx1*gxd*y1 */ + mul_fp(y2, y1, sqrt_minus_ZZZ); /* y2 = y1 * c2 # y2 = y1*sqrt(-Z^3) */ + mul_fp(y2, y2, uu); /* y2 = y2 * uu */ + mul_fp(y2, y2, u); /* y2 = y2 * u */ + + /* choose numerators */ + vec_select(xn, x1n, x2n, sizeof(xn), e2); /* xn = e2 ? x1n : x2n */ + vec_select(y, y1, y2, sizeof(y), e2); /* y = e2 ? y1 : y2 */ + + e1 = sgn0_fp(u); + e2 = sgn0_fp(y); + cneg_fp(y, y, e1^e2); /* fix sign of y */ + /* return (xn, xd, y, 1) */ + + /* convert (xn, xd, y, 1) to Jacobian projective coordinates */ + mul_fp(p->X, xn, xd); /* X = xn * xd */ + mul_fp(p->Y, y, gxd); /* Y = y * xd^3 */ +#ifndef xd + vec_copy(p->Z, xd, sizeof(xd)); /* Z = xd */ +#else +# undef xn +# undef y +# undef xd +# undef x1n +# undef y1 +# undef Zuu +# undef tv4 +#endif +#undef sgn0_fp +} + +static void POINTonE1_dbl_n_add(POINTonE1 *out, size_t n, const POINTonE1 *p) +{ + while(n--) + POINTonE1_double(out, out); + POINTonE1_add(out, out, p); +} + +/* + * |u|, |v| are expected to be in Montgomery representation + */ +static void map_to_g1(POINTonE1 *out, const vec384 u, const vec384 v) +{ + POINTonE1 p; + + map_to_isogenous_E1(&p, u); + + if (v != NULL) { + map_to_isogenous_E1(out, v); /* borrow |out| */ + POINTonE1_dadd(&p, &p, out, Aprime_E1); + } + + isogeny_map_to_E1(&p, &p); /* sprinkle isogenous powder */ + + /* clear the cofactor by multiplying |p| by 1-z, 0xd201000000010001 */ + POINTonE1_double(out, &p); /* 0x2 */ + POINTonE1_add(out, out, &p); /* 0x3 */ + POINTonE1_dbl_n_add(out, 2, &p); /* 0xd */ + POINTonE1_dbl_n_add(out, 3, &p); /* 0x69 */ + POINTonE1_dbl_n_add(out, 9, &p); /* 0xd201 */ + POINTonE1_dbl_n_add(out, 32, &p); /* 0xd20100000001 */ + POINTonE1_dbl_n_add(out, 16, &p); /* 0xd201000000010001 */ +} + +void blst_map_to_g1(POINTonE1 *out, const vec384 u, const vec384 v) +{ map_to_g1(out, u, v); } + +static void Encode_to_G1(POINTonE1 *p, const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len, + const unsigned char *aug, size_t aug_len) +{ + vec384 u[1]; + + hash_to_field(u, 1, aug, aug_len, msg, msg_len, DST, DST_len); + map_to_g1(p, u[0], NULL); +} + +void blst_encode_to_g1(POINTonE1 *p, const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len, + const unsigned char *aug, size_t aug_len) +{ Encode_to_G1(p, msg, msg_len, DST, DST_len, aug, aug_len); } + +static void Hash_to_G1(POINTonE1 *p, const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len, + const unsigned char *aug, size_t aug_len) +{ + vec384 u[2]; + + hash_to_field(u, 2, aug, aug_len, msg, msg_len, DST, DST_len); + map_to_g1(p, u[0], u[1]); +} + +void blst_hash_to_g1(POINTonE1 *p, const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t 
DST_len, + const unsigned char *aug, size_t aug_len) +{ Hash_to_G1(p, msg, msg_len, DST, DST_len, aug, aug_len); } + +#ifdef __OPTIMIZE_SIZE__ +static void POINTonE1_times_zz_minus_1_div_by_3(POINTonE1 *out, + const POINTonE1 *in) +{ + static const limb_t zz_minus_1_div_by_3[] = { + TO_LIMB_T(0x0000000055555555), TO_LIMB_T(0x396c8c005555e156) + }; + size_t n = 126-1; + const POINTonE1 *dblin = in; + + while(n--) { + POINTonE1_double(out, dblin); dblin = out; + if (is_bit_set(zz_minus_1_div_by_3, n)) + POINTonE1_add(out, out, in); + } +} +#else +static void POINTonE1_times_zz_minus_1_div_by_3(POINTonE1 *out, + const POINTonE1 *in) +{ + POINTonE1 t3, t5, t7, t11, t85; + + POINTonE1_double(&t7, in); /* 2P */ + POINTonE1_add(&t3, &t7, in); /* 3P */ + POINTonE1_add(&t5, &t3, &t7); /* 5P */ + POINTonE1_add(&t7, &t5, &t7); /* 7P */ + POINTonE1_double(&t85, &t5); /* 10P */ + POINTonE1_add(&t11, &t85, in); /* 11P */ + POINTonE1_dbl_n_add(&t85, 3, &t5); /* 0x55P */ + /* (-0xd201000000010000^2 - 1) / 3 */ + POINTonE1_double(out, &t7); /* 0xe */ + POINTonE1_dbl_n_add(out, 5, &t11); /* 0x1cb */ + POINTonE1_dbl_n_add(out, 3, &t3); /* 0xe5b */ + POINTonE1_dbl_n_add(out, 3, in); /* 0x72d9 */ + POINTonE1_dbl_n_add(out, 5, &t3); /* 0xe5b23 */ + POINTonE1_dbl_n_add(out, 18, &t85); /* 0x396c8c0055 */ + POINTonE1_dbl_n_add(out, 8, &t85); /* 0x396c8c005555 */ + POINTonE1_dbl_n_add(out, 3, &t7); /* 0x1cb646002aaaf */ + POINTonE1_dbl_n_add(out, 7, &t5); /* 0xe5b23001555785 */ + POINTonE1_dbl_n_add(out, 5, &t11); /* 0x1cb646002aaaf0ab */ + POINTonE1_dbl_n_add(out, 41, &t85); /* 0x396c8c005555e1560000000055 */ + POINTonE1_dbl_n_add(out, 8, &t85); /* 0x396c8c005555e156000000005555 */ + POINTonE1_dbl_n_add(out, 8, &t85); /* 0x396c8c005555e15600000000555555 */ + POINTonE1_dbl_n_add(out, 8, &t85); /* 0x396c8c005555e1560000000055555555 */ +} +#endif + +static void sigma(POINTonE1 *out, const POINTonE1 *in) +{ + static const vec384 beta = { /* such that beta^3 - 1 = 0 */ + /* -1/2 * (1 + sqrt(-3)) = ((P-2)^(P-2)) * (1 + (P-3)^((P+1)/4)) */ + /* (0x1a0111ea397fe699ec02408663d4de85aa0d857d89759ad4 + 897d29650fb85f9b409427eb4f49fffd8bfd00000000aaac << 384) % P */ + TO_LIMB_T(0xcd03c9e48671f071), TO_LIMB_T(0x5dab22461fcda5d2), + TO_LIMB_T(0x587042afd3851b95), TO_LIMB_T(0x8eb60ebe01bacb9e), + TO_LIMB_T(0x03f97d6e83d050d2), TO_LIMB_T(0x18f0206554638741) + }; + + mul_fp(out->X, in->X, beta); + vec_copy(out->Y, in->Y, sizeof(out->Y)); + vec_copy(out->Z, BLS12_381_Rx.p, sizeof(out->Z)); +} + +static limb_t POINTonE1_in_G1(const POINTonE1 *p) +{ + POINTonE1 t0, t1, t2; + + /* Bowe, S., "Faster subgroup checks for BLS12-381" */ + sigma(&t0, p); /* σ(P) */ + sigma(&t1, &t0); /* σ²(P) */ + + POINTonE1_double(&t0, &t0); /* 2σ(P) */ + POINTonE1_add_affine(&t2, &t1, p); /* P + σ²(P) */ + POINTonE1_cneg(&t2, 1); /* - P - σ²(P) */ + POINTonE1_add(&t2, &t2, &t0); /* 2σ(P) - P - σ²(P) */ + POINTonE1_times_zz_minus_1_div_by_3(&t0, &t2); + POINTonE1_cneg(&t1, 1); + POINTonE1_add(&t0, &t0, &t1); /* [(z²-1)/3](2σ(P) - P - σ²(P)) */ + /* - σ²(P) */ + return vec_is_zero(t0.Z, sizeof(t0.Z)); +} + +limb_t blst_p1_affine_in_g1(const POINTonE1_affine *p) +{ return POINTonE1_in_G1((const POINTonE1 *)p); } diff --git a/src/map_to_g2.c b/src/map_to_g2.c new file mode 100644 index 00000000..ab06d408 --- /dev/null +++ b/src/map_to_g2.c @@ -0,0 +1,496 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include "point.h" +#include "fields.h" + +/* + * y^2 = x^3 + A'*x + B', isogenous one + */ +static const vec384x Aprime_E2 = { /* 240*i */ + { 0 }, + { TO_LIMB_T(0xe53a000003135242), TO_LIMB_T(0x01080c0fdef80285), + TO_LIMB_T(0xe7889edbe340f6bd), TO_LIMB_T(0x0b51375126310601), + TO_LIMB_T(0x02d6985717c744ab), TO_LIMB_T(0x1220b4e979ea5467) } +}; +static const vec384x Bprime_E2 = { /* 1012 + 1012*i */ + { TO_LIMB_T(0x22ea00000cf89db2), TO_LIMB_T(0x6ec832df71380aa4), + TO_LIMB_T(0x6e1b94403db5a66e), TO_LIMB_T(0x75bf3c53a79473ba), + TO_LIMB_T(0x3dd3a569412c0a34), TO_LIMB_T(0x125cdb5e74dc4fd1) }, + { TO_LIMB_T(0x22ea00000cf89db2), TO_LIMB_T(0x6ec832df71380aa4), + TO_LIMB_T(0x6e1b94403db5a66e), TO_LIMB_T(0x75bf3c53a79473ba), + TO_LIMB_T(0x3dd3a569412c0a34), TO_LIMB_T(0x125cdb5e74dc4fd1) } +}; + +static void map_fp2_times_Zz(vec384x map[], const vec384x isogeny_map[], + const vec384x Zz_powers[], size_t n) +{ + while (n--) + mul_fp2(map[n], isogeny_map[n], Zz_powers[n]); +} + +static void map_fp2(vec384x acc, const vec384x x, const vec384x map[], size_t n) +{ + while (n--) { + mul_fp2(acc, acc, x); + add_fp2(acc, acc, map[n]); + } +} + +static void isogeny_map_to_E2(POINTonE2 *out, const POINTonE2 *p) +{ + /* + * x = x_num / x_den, where + * x_num = k_(1,3) * x'^3 + k_(1,2) * x'^2 + k_(1,1) * x' + k_(1,0) + * ... + */ + static const vec384x isogeny_map_x_num[] = { /* (k_(1,*)<<384) % P */ + {{ TO_LIMB_T(0x47f671c71ce05e62), TO_LIMB_T(0x06dd57071206393e), + TO_LIMB_T(0x7c80cd2af3fd71a2), TO_LIMB_T(0x048103ea9e6cd062), + TO_LIMB_T(0xc54516acc8d037f6), TO_LIMB_T(0x13808f550920ea41) }, + { TO_LIMB_T(0x47f671c71ce05e62), TO_LIMB_T(0x06dd57071206393e), + TO_LIMB_T(0x7c80cd2af3fd71a2), TO_LIMB_T(0x048103ea9e6cd062), + TO_LIMB_T(0xc54516acc8d037f6), TO_LIMB_T(0x13808f550920ea41) }}, + {{ 0 }, + { TO_LIMB_T(0x5fe55555554c71d0), TO_LIMB_T(0x873fffdd236aaaa3), + TO_LIMB_T(0x6a6b4619b26ef918), TO_LIMB_T(0x21c2888408874945), + TO_LIMB_T(0x2836cda7028cabc5), TO_LIMB_T(0x0ac73310a7fd5abd) }}, + {{ TO_LIMB_T(0x0a0c5555555971c3), TO_LIMB_T(0xdb0c00101f9eaaae), + TO_LIMB_T(0xb1fb2f941d797997), TO_LIMB_T(0xd3960742ef416e1c), + TO_LIMB_T(0xb70040e2c20556f4), TO_LIMB_T(0x149d7861e581393b) }, + { TO_LIMB_T(0xaff2aaaaaaa638e8), TO_LIMB_T(0x439fffee91b55551), + TO_LIMB_T(0xb535a30cd9377c8c), TO_LIMB_T(0x90e144420443a4a2), + TO_LIMB_T(0x941b66d3814655e2), TO_LIMB_T(0x0563998853fead5e) }}, + {{ TO_LIMB_T(0x40aac71c71c725ed), TO_LIMB_T(0x190955557a84e38e), + TO_LIMB_T(0xd817050a8f41abc3), TO_LIMB_T(0xd86485d4c87f6fb1), + TO_LIMB_T(0x696eb479f885d059), TO_LIMB_T(0x198e1a74328002d2) }, + { 0 }} + }; + /* ... + * x_den = x'^2 + k_(2,1) * x' + k_(2,0) + */ + static const vec384x isogeny_map_x_den[] = { /* (k_(2,*)<<384) % P */ + {{ 0 }, + { TO_LIMB_T(0x1f3affffff13ab97), TO_LIMB_T(0xf25bfc611da3ff3e), + TO_LIMB_T(0xca3757cb3819b208), TO_LIMB_T(0x3e6427366f8cec18), + TO_LIMB_T(0x03977bc86095b089), TO_LIMB_T(0x04f69db13f39a952) }}, + {{ TO_LIMB_T(0x447600000027552e), TO_LIMB_T(0xdcb8009a43480020), + TO_LIMB_T(0x6f7ee9ce4a6e8b59), TO_LIMB_T(0xb10330b7c0a95bc6), + TO_LIMB_T(0x6140b1fcfb1e54b7), TO_LIMB_T(0x0381be097f0bb4e1) }, + { TO_LIMB_T(0x7588ffffffd8557d), TO_LIMB_T(0x41f3ff646e0bffdf), + TO_LIMB_T(0xf7b1e8d2ac426aca), TO_LIMB_T(0xb3741acd32dbb6f8), + TO_LIMB_T(0xe9daf5b9482d581f), TO_LIMB_T(0x167f53e0ba7431b8) }} + }; + /* + * y = y' * y_num / y_den, where + * y_num = k_(3,3) * x'^3 + k_(3,2) * x'^2 + k_(3,1) * x' + k_(3,0) + * ... 
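+ * This is the degree-3 counterpart of the G1 isogeny above, evaluated
+ * in the same projective fashion with just three Z^2 powers.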
+ */ + static const vec384x isogeny_map_y_num[] = { /* (k_(3,*)<<384) % P */ + {{ TO_LIMB_T(0x96d8f684bdfc77be), TO_LIMB_T(0xb530e4f43b66d0e2), + TO_LIMB_T(0x184a88ff379652fd), TO_LIMB_T(0x57cb23ecfae804e1), + TO_LIMB_T(0x0fd2e39eada3eba9), TO_LIMB_T(0x08c8055e31c5d5c3) }, + { TO_LIMB_T(0x96d8f684bdfc77be), TO_LIMB_T(0xb530e4f43b66d0e2), + TO_LIMB_T(0x184a88ff379652fd), TO_LIMB_T(0x57cb23ecfae804e1), + TO_LIMB_T(0x0fd2e39eada3eba9), TO_LIMB_T(0x08c8055e31c5d5c3) }}, + {{ 0 }, + { TO_LIMB_T(0xbf0a71c71c91b406), TO_LIMB_T(0x4d6d55d28b7638fd), + TO_LIMB_T(0x9d82f98e5f205aee), TO_LIMB_T(0xa27aa27b1d1a18d5), + TO_LIMB_T(0x02c3b2b2d2938e86), TO_LIMB_T(0x0c7d13420b09807f) }}, + {{ TO_LIMB_T(0xd7f9555555531c74), TO_LIMB_T(0x21cffff748daaaa8), + TO_LIMB_T(0x5a9ad1866c9bbe46), TO_LIMB_T(0x4870a2210221d251), + TO_LIMB_T(0x4a0db369c0a32af1), TO_LIMB_T(0x02b1ccc429ff56af) }, + { TO_LIMB_T(0xe205aaaaaaac8e37), TO_LIMB_T(0xfcdc000768795556), + TO_LIMB_T(0x0c96011a8a1537dd), TO_LIMB_T(0x1c06a963f163406e), + TO_LIMB_T(0x010df44c82a881e6), TO_LIMB_T(0x174f45260f808feb) }}, + {{ TO_LIMB_T(0xa470bda12f67f35c), TO_LIMB_T(0xc0fe38e23327b425), + TO_LIMB_T(0xc9d3d0f2c6f0678d), TO_LIMB_T(0x1c55c9935b5a982e), + TO_LIMB_T(0x27f6c0e2f0746764), TO_LIMB_T(0x117c5e6e28aa9054) }, + { 0 }} + }; + /* ... + * y_den = x'^3 + k_(4,2) * x'^2 + k_(4,1) * x' + k_(4,0) + */ + static const vec384x isogeny_map_y_den[] = { /* (k_(4,*)<<384) % P */ + {{ TO_LIMB_T(0x0162fffffa765adf), TO_LIMB_T(0x8f7bea480083fb75), + TO_LIMB_T(0x561b3c2259e93611), TO_LIMB_T(0x11e19fc1a9c875d5), + TO_LIMB_T(0xca713efc00367660), TO_LIMB_T(0x03c6a03d41da1151) }, + { TO_LIMB_T(0x0162fffffa765adf), TO_LIMB_T(0x8f7bea480083fb75), + TO_LIMB_T(0x561b3c2259e93611), TO_LIMB_T(0x11e19fc1a9c875d5), + TO_LIMB_T(0xca713efc00367660), TO_LIMB_T(0x03c6a03d41da1151) }}, + {{ 0 }, + { TO_LIMB_T(0x5db0fffffd3b02c5), TO_LIMB_T(0xd713f52358ebfdba), + TO_LIMB_T(0x5ea60761a84d161a), TO_LIMB_T(0xbb2c75a34ea6c44a), + TO_LIMB_T(0x0ac6735921c1119b), TO_LIMB_T(0x0ee3d913bdacfbf6) }}, + {{ TO_LIMB_T(0x66b10000003affc5), TO_LIMB_T(0xcb1400e764ec0030), + TO_LIMB_T(0xa73e5eb56fa5d106), TO_LIMB_T(0x8984c913a0fe09a9), + TO_LIMB_T(0x11e10afb78ad7f13), TO_LIMB_T(0x05429d0e3e918f52) }, + { TO_LIMB_T(0x534dffffffc4aae6), TO_LIMB_T(0x5397ff174c67ffcf), + TO_LIMB_T(0xbff273eb870b251d), TO_LIMB_T(0xdaf2827152870915), + TO_LIMB_T(0x393a9cbaca9e2dc3), TO_LIMB_T(0x14be74dbfaee5748) }} + }; + vec384x Zz_powers[3], map[3], xn, xd, yn, yd; + + /* lay down Z^2 powers in descending order */ + sqr_fp2(Zz_powers[2], p->Z); /* ZZ^1 */ + sqr_fp2(Zz_powers[1], Zz_powers[2]); /* ZZ^2 1+1 */ + mul_fp2(Zz_powers[0], Zz_powers[2], Zz_powers[1]); /* ZZ^3 2+1 */ + + map_fp2_times_Zz(map, isogeny_map_x_num, Zz_powers, 3); + mul_fp2(xn, p->X, isogeny_map_x_num[3]); + add_fp2(xn, xn, map[2]); + map_fp2(xn, p->X, map, 2); + + map_fp2_times_Zz(map, isogeny_map_x_den, Zz_powers + 1, 2); + add_fp2(xd, p->X, map[1]); + map_fp2(xd, p->X, map, 1); + mul_fp2(xd, xd, Zz_powers[2]); /* xd *= Z^2 */ + + map_fp2_times_Zz(map, isogeny_map_y_num, Zz_powers, 3); + mul_fp2(yn, p->X, isogeny_map_y_num[3]); + add_fp2(yn, yn, map[2]); + map_fp2(yn, p->X, map, 2); + mul_fp2(yn, yn, p->Y); /* yn *= Y */ + + map_fp2_times_Zz(map, isogeny_map_y_den, Zz_powers, 3); + add_fp2(yd, p->X, map[2]); + map_fp2(yd, p->X, map, 2); + mul_fp2(Zz_powers[2], Zz_powers[2], p->Z); + mul_fp2(yd, yd, Zz_powers[2]); /* yd *= Z^3 */ + + /* convert (xn, xd, yn, yd) to Jacobian coordinates */ + mul_fp2(out->Z, xd, yd); /* Z = xd * yd */ + mul_fp2(out->X, xn, 
yd); + mul_fp2(out->X, out->X, out->Z); /* X = xn * xd * yd^2 */ + sqr_fp2(out->Y, out->Z); + mul_fp2(out->Y, out->Y, xd); + mul_fp2(out->Y, out->Y, yn); /* Y = yn * xd^3 * yd^2 */ +} + +static void map_to_isogenous_E2(POINTonE2 *p, const vec384x u) +{ + static const vec384x minus_A = { + { 0 }, + { TO_LIMB_T(0xd4c4fffffcec5869), TO_LIMB_T(0x1da3f3eed25bfd79), + TO_LIMB_T(0x7fa833c5136fff67), TO_LIMB_T(0x59261433cd540cbd), + TO_LIMB_T(0x48450f5f2b84682c), TO_LIMB_T(0x07e05d00bf959233) } + }; + static const vec384x Z = { /* -2 - i */ + { TO_LIMB_T(0x87ebfffffff9555c), TO_LIMB_T(0x656fffe5da8ffffa), + TO_LIMB_T(0x0fd0749345d33ad2), TO_LIMB_T(0xd951e663066576f4), + TO_LIMB_T(0xde291a3d41e980d3), TO_LIMB_T(0x0815664c7dfe040d) }, + { TO_LIMB_T(0x43f5fffffffcaaae), TO_LIMB_T(0x32b7fff2ed47fffd), + TO_LIMB_T(0x07e83a49a2e99d69), TO_LIMB_T(0xeca8f3318332bb7a), + TO_LIMB_T(0xef148d1ea0f4c069), TO_LIMB_T(0x040ab3263eff0206) } + }; + static const vec384x sqrt_ZZZ = { /* (Z^3)^((P^2+7)/16) */ + { TO_LIMB_T(0x019af5f980a3680c), TO_LIMB_T(0x4ed7da0e66063afa), + TO_LIMB_T(0x600354723b5d9972), TO_LIMB_T(0x8b2f958b20d09d72), + TO_LIMB_T(0x0474938f02d461db), TO_LIMB_T(0x0dcf8b9e0684ab1c) }, + { TO_LIMB_T(0x486f252db11dd19c), TO_LIMB_T(0x791ffda2c3d18950), + TO_LIMB_T(0x5af6c27debf95eb4), TO_LIMB_T(0x73b1fd8f2a929cde), + TO_LIMB_T(0xfc59602a1a90b871), TO_LIMB_T(0x08d7daafa8baddb3) } + }; + static const vec384x recip_ZZZ = { /* 1/(Z^3) */ + { TO_LIMB_T(0x65018f5c28f598eb), TO_LIMB_T(0xe6020417f022d916), + TO_LIMB_T(0xd6327313288369c7), TO_LIMB_T(0x622ded8eb447156f), + TO_LIMB_T(0xe52a2aee72c2a01f), TO_LIMB_T(0x089812fb8481ffe4) }, + { TO_LIMB_T(0x2574eb851eb8619f), TO_LIMB_T(0xdba2e97912925604), + TO_LIMB_T(0x67e495a909e7a18e), TO_LIMB_T(0xdf2da23b8145b8f7), + TO_LIMB_T(0xcf5d3728310ebf6d), TO_LIMB_T(0x11be446236f4c116) } + }; + static const vec384x ZxA = { /* 240 - 480*i */ + { TO_LIMB_T(0xe53a000003135242), TO_LIMB_T(0x01080c0fdef80285), + TO_LIMB_T(0xe7889edbe340f6bd), TO_LIMB_T(0x0b51375126310601), + TO_LIMB_T(0x02d6985717c744ab), TO_LIMB_T(0x1220b4e979ea5467) }, + { TO_LIMB_T(0xa989fffff9d8b0d2), TO_LIMB_T(0x3b47e7dda4b7faf3), + TO_LIMB_T(0xff50678a26dffece), TO_LIMB_T(0xb24c28679aa8197a), + TO_LIMB_T(0x908a1ebe5708d058), TO_LIMB_T(0x0fc0ba017f2b2466) } + }; + vec384x uu, tv2, tv3, tv4, x2n, gx1, gxd, y2; +#if 0 + vec384x xn, x1n, xd, y, y1, Zuu; +#else +# define xn p->X +# define y p->Y +# define xd p->Z +# define x1n xn +# define y1 y +# define Zuu x2n +#endif +#define sgn0_fp2(a) (sgn0_pty_mont_384x((a), BLS12_381_P, p0) & 1) + limb_t e1, e2; + + /* + * as per map_to_curve() from poc/sswu_opt.sage at + * https://github.com/cfrg/draft-irtf-cfrg-hash-to-curve + * with 9mod16 twists... 
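+ *
+ * Roughly: since P^2 = 9 (mod 16), the square-root candidate is
+ * obtained by raising to ((P^2-9)/16) and then aligned by multiplying
+ * with a suitable root of unity in sqrt_align_fp2(); |e2| records
+ * whether gx1 turned out to be a square and drives the constant-time
+ * selection between the (x1, y1) and (x2, y2) SSWU candidates below.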
+ */ + /* x numerator variants */ + sqr_fp2(uu, u); /* uu = u^2 */ + mul_fp2(Zuu, Z, uu); /* Zuu = Z * uu */ + sqr_fp2(tv2, Zuu); /* tv2 = Zuu^2 */ + add_fp2(tv2, tv2, Zuu); /* tv2 = tv2 + Zuu */ + add_fp2(x1n, tv2, BLS12_381_Rx.p2); /* x1n = tv2 + 1 */ + mul_fp2(x1n, x1n, Bprime_E2); /* x1n = x1n * B */ + mul_fp2(x2n, Zuu, x1n); /* x2n = Zuu * x1n */ + + /* x denumenator */ + mul_fp2(xd, minus_A, tv2); /* xd = -A * tv2 */ + e1 = vec_is_zero(xd, sizeof(xd)); /* e1 = xd == 0 */ + vec_select(xd, ZxA, xd, sizeof(xd), e1); /* # If xd == 0, set xd = Z*A */ + + /* y numerators variants */ + sqr_fp2(tv2, xd); /* tv2 = xd^2 */ + mul_fp2(gxd, xd, tv2); /* gxd = xd^3 */ + mul_fp2(tv2, Aprime_E2, tv2); /* tv2 = A * tv2 */ + sqr_fp2(gx1, x1n); /* gx1 = x1n^2 */ + add_fp2(gx1, gx1, tv2); /* gx1 = gx1 + tv2 # x1n^2 + A*xd^2 */ + mul_fp2(gx1, gx1, x1n); /* gx1 = gx1 * x1n # x1n^3 + A*x1n*xd^2 */ + mul_fp2(tv2, Bprime_E2, gxd); /* tv2 = B * gxd */ + add_fp2(gx1, gx1, tv2); /* gx1 = gx1 + tv2 # x1^3 + A*x1*xd^2 + B*xd^3 */ + sqr_fp2(tv4, gxd); /* tv4 = gxd^2 */ + mul_fp2(tv2, gx1, gxd); /* tv2 = gx1 * gxd */ + mul_fp2(tv4, tv4, tv2); /* tv4 = tv4 * tv2 # gx1*gxd^3 */ + e2 = recip_sqrt_fp2(y1, tv4); /* y1 = tv4^c1 # (gx1*gxd^3)^((p^2-9)/16) */ + mul_fp2(y2, y1, sqrt_ZZZ); /* y2 = y1 * c2 # y2 = y1*sqrt(Z^3) */ + mul_fp2(tv4, tv4, recip_ZZZ); + mul_fp2(tv3, y2, tv4); + (void)sqrt_align_fp2(y2, y2, tv3, tv4); + mul_fp2(y1, y1, tv2); /* y1 = y1 * tv2 # gx1*gxd*y1 */ + mul_fp2(y2, y2, tv2); /* y2 = y2 * tv2 # gx1*gxd*y2 */ + mul_fp2(y2, y2, uu); /* y2 = y2 * uu */ + mul_fp2(y2, y2, u); /* y2 = y2 * u */ + + /* choose numerators */ + vec_select(xn, x1n, x2n, sizeof(xn), e2); /* xn = e2 ? x1n : x2n */ + vec_select(y, y1, y2, sizeof(y), e2); /* y = e2 ? y1 : y2 */ + + e1 = sgn0_fp2(u); + e2 = sgn0_fp2(y); + cneg_fp2(y, y, e1^e2); /* fix sign of y */ + /* return (xn, xd, y, 1) */ + + /* convert (xn, xd, y, 1) to Jacobian projective coordinates */ + mul_fp2(p->X, xn, xd); /* X = xn * xd */ + mul_fp2(p->Y, y, gxd); /* Y = y * xd^3 */ +#ifndef xd + vec_copy(p->Z, xd, sizeof(xd)); /* Z = xd */ +#else +# undef xn +# undef y +# undef xd +# undef x1n +# undef y1 +# undef Zuu +# undef tv4 +#endif +#undef sgn0_fp2 +} + +#if 0 +static const limb_t h_eff[] = { + TO_LIMB_T(0xe8020005aaa95551), TO_LIMB_T(0x59894c0adebbf6b4), + TO_LIMB_T(0xe954cbc06689f6a3), TO_LIMB_T(0x2ec0ec69d7477c1a), + TO_LIMB_T(0x6d82bf015d1212b0), TO_LIMB_T(0x329c2f178731db95), + TO_LIMB_T(0x9986ff031508ffe1), TO_LIMB_T(0x88e2a8e9145ad768), + TO_LIMB_T(0x584c6a0ea91b3528), TO_LIMB_T(0x0bc69f08f2ee75b3) +}; + +static void clear_cofactor(POINTonE2 *out, const POINTonE2 *p) +{ POINTonE2_mult_w5(out, p, h_eff, 636); } +#else +/* + * As per suggestions in "7. 
Clearing the cofactor" at + * https://tools.ietf.org/html/draft-irtf-cfrg-hash-to-curve-06 + */ +static const vec384x iwsc = { /* 1/(1 + i) = 1/2 - 1/2*i */ + { TO_LIMB_T(0x1804000000015554), TO_LIMB_T(0x855000053ab00001), + TO_LIMB_T(0x633cb57c253c276f), TO_LIMB_T(0x6e22d1ec31ebb502), + TO_LIMB_T(0xd3916126f2d14ca2), TO_LIMB_T(0x17fbb8571a006596) }, + { TO_LIMB_T(0xa1fafffffffe5557), TO_LIMB_T(0x995bfff976a3fffe), + TO_LIMB_T(0x03f41d24d174ceb4), TO_LIMB_T(0xf6547998c1995dbd), + TO_LIMB_T(0x778a468f507a6034), TO_LIMB_T(0x020559931f7f8103) } +}; + +static const vec384x k_cx = { /* iwsc^((P-1)/3) */ + { 0 }, + { /* (0x1a0111ea397fe699ec02408663d4de85aa0d857d89759ad4 + 897d29650fb85f9b409427eb4f49fffd8bfd00000000aaad << 384) % P */ + TO_LIMB_T(0x890dc9e4867545c3), TO_LIMB_T(0x2af322533285a5d5), + TO_LIMB_T(0x50880866309b7e2c), TO_LIMB_T(0xa20d1b8c7e881024), + TO_LIMB_T(0x14e4f04fe2db9068), TO_LIMB_T(0x14e56d3f1564853a) } +}; + +static const vec384x k_cy = { /* iwsc^((P-1)/2) */ + { /* (0x135203e60180a68ee2e9c448d77a2cd91c3dedd930b1cf60 + ef396489f61eb45e304466cf3e67fa0af1ee7b04121bdea2 << 384) % P */ + TO_LIMB_T(0x3e2f585da55c9ad1), TO_LIMB_T(0x4294213d86c18183), + TO_LIMB_T(0x382844c88b623732), TO_LIMB_T(0x92ad2afd19103e18), + TO_LIMB_T(0x1d794e4fac7cf0b9), TO_LIMB_T(0x0bd592fc7d825ec8) }, + { /* (0x06af0e0437ff400b6831e36d6bd17ffe48395dabc2d3435e + 77f76e17009241c5ee67992f72ec05f4c81084fbede3cc09 << 384) % P */ + TO_LIMB_T(0x7bcfa7a25aa30fda), TO_LIMB_T(0xdc17dec12a927e7c), + TO_LIMB_T(0x2f088dd86b4ebef1), TO_LIMB_T(0xd1ca2087da74d4a7), + TO_LIMB_T(0x2da2596696cebc1d), TO_LIMB_T(0x0e2b7eedbbfd87d2) }, +}; + +static void qi_x_iwsc(vec384x out, const vec384x in) +{ + mul_fp2(out, in, iwsc); + mul_fp(out[0], out[0], k_cx[1]); + mul_fp(out[1], out[1], k_cx[1]); + neg_fp(out[1], out[1]); +} + +static void qi_y_iwsc(vec384x out, const vec384x in) +{ + vec384x t; + + mul_fp2(t, in, iwsc); + add_fp(out[0], t[0], t[1]); + sub_fp(out[1], t[0], t[1]); + mul_fp(out[0], out[0], k_cy[1]); + mul_fp(out[1], out[1], k_cy[1]); +} + +static void psi(POINTonE2 *out, const POINTonE2 *in) +{ + vec384x xn, xd, yn, yd; + + sqr_fp2(xd, in->Z); /* xd = Z^2 */ + mul_fp2(yd, xd, in->Z); /* yd = Z^3 */ + + qi_x_iwsc(xn, in->X); mul_fp2(xn, xn, k_cx); + qi_x_iwsc(xd, xd); + + qi_y_iwsc(yn, in->Y); mul_fp2(yn, yn, k_cy); + qi_y_iwsc(yd, yd); + + /* convert (xn, xd, yn, yd) to Jacobian coordinates */ + mul_fp2(out->Z, xd, yd); /* Z = xd * yd */ + mul_fp2(out->X, xn, yd); + mul_fp2(out->X, out->X, out->Z); /* X = xn * xd * yd^2 */ + sqr_fp2(out->Y, out->Z); + mul_fp2(out->Y, out->Y, xd); + mul_fp2(out->Y, out->Y, yn); /* Y = yn * xd^3 * yd^2 */ +} + +static void POINTonE2_add_n_dbl(POINTonE2 *out, const POINTonE2 *p, size_t n) +{ + POINTonE2_add(out, out, p); + while(n--) + POINTonE2_double(out, out); +} + +static void POINTonE2_times_minus_z(POINTonE2 *out, const POINTonE2 *in) +{ + POINTonE2_double(out, in); /* 1: 0x2 */ + POINTonE2_add_n_dbl(out, in, 2); /* 2..4: 0x3..0xc */ + POINTonE2_add_n_dbl(out, in, 3); /* 5..8: 0xd..0x68 */ + POINTonE2_add_n_dbl(out, in, 9); /* 9..18: 0x69..0xd200 */ + POINTonE2_add_n_dbl(out, in, 32); /* 19..51: ..0xd20100000000 */ + POINTonE2_add_n_dbl(out, in, 16); /* 52..68: ..0xd201000000010000 */ +} + +static void clear_cofactor(POINTonE2 *out, const POINTonE2 *p) +{ + POINTonE2 t0, t1; + + /* A.Budroni, F.Pintore, "Efficient hash maps to G2 on BLS curves" */ + POINTonE2_double(out, p); /* out = 2P */ + psi(out, out); /* out = Ψ(2P) */ + psi(out, out); /* out = Ψ²(2P) */ + + vec_copy(&t0, 
p, sizeof(t0)); + POINTonE2_cneg(&t0, 1); /* t0 = -P */ + psi(&t1, &t0); /* t1 = -Ψ(P) */ + POINTonE2_add(out, out, &t0); /* out = Ψ²(2P) - P */ + POINTonE2_add(out, out, &t1); /* out = Ψ²(2P) - P - Ψ(P) */ + + POINTonE2_times_minus_z(&t0, p); /* t0 = [-z]P */ + POINTonE2_add(&t0, &t0, p); /* t0 = [-z + 1]P */ + POINTonE2_add(&t0, &t0, &t1); /* t0 = [-z + 1]P - Ψ(P) */ + POINTonE2_times_minus_z(&t1, &t0); /* t1 = [z² - z]P + [z]Ψ(P) */ + POINTonE2_add(out, out, &t1); /* out = [z² - z - 1]P */ + /* + [z - 1]Ψ(P) */ + /* + Ψ²(2P) */ +} +#endif + +/* + * |u|, |v| are expected to be in Montgomery representation + */ +static void map_to_g2(POINTonE2 *out, const vec384x u, const vec384x v) +{ + POINTonE2 p; + + map_to_isogenous_E2(&p, u); + + if (v != NULL) { + map_to_isogenous_E2(out, v); /* borrow |out| */ + POINTonE2_dadd(&p, &p, out, Aprime_E2); + } + + isogeny_map_to_E2(&p, &p); /* sprinkle isogenous powder */ + clear_cofactor(out, &p); +} + +void blst_map_to_g2(POINTonE2 *out, const vec384x u, const vec384x v) +{ map_to_g2(out, u, v); } + +static void Encode_to_G2(POINTonE2 *p, const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len, + const unsigned char *aug, size_t aug_len) +{ + vec384x u[1]; + + hash_to_field(u[0], 2, aug, aug_len, msg, msg_len, DST, DST_len); + map_to_g2(p, u[0], NULL); +} + +void blst_encode_to_g2(POINTonE2 *p, const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len, + const unsigned char *aug, size_t aug_len) +{ Encode_to_G2(p, msg, msg_len, DST, DST_len, aug, aug_len); } + +static void Hash_to_G2(POINTonE2 *p, const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len, + const unsigned char *aug, size_t aug_len) +{ + vec384x u[2]; + + hash_to_field(u[0], 4, aug, aug_len, msg, msg_len, DST, DST_len); + map_to_g2(p, u[0], u[1]); +} + +void blst_hash_to_g2(POINTonE2 *p, const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len, + const unsigned char *aug, size_t aug_len) +{ Hash_to_G2(p, msg, msg_len, DST, DST_len, aug, aug_len); } + +static limb_t POINTonE2_in_G2(const POINTonE2 *p) +{ + POINTonE2 t0, t1, t2; + + vec_copy(t0.X, p->X, 2*sizeof(t0.X)); + vec_copy(t0.Z, BLS12_381_Rx.p2, sizeof(t0.Z)); + + /* Bowe, S., "Faster subgroup checks for BLS12-381" */ + psi(&t0, &t0); /* Ψ(P) */ + psi(&t0, &t0); /* Ψ²(P) */ + psi(&t1, &t0); /* Ψ³(P) */ + + POINTonE2_times_minus_z(&t2, &t1); + POINTonE2_add(&t0, &t0, &t2); + POINTonE2_cneg(&t0, 1); + POINTonE2_add_affine(&t0, &t0, p); /* [z]Ψ³(P) - Ψ²(P) + P */ + + return vec_is_zero(t0.Z, sizeof(t0.Z)); +} + +limb_t blst_p2_affine_in_g2(const POINTonE2_affine *p) +{ return POINTonE2_in_G2((const POINTonE2 *)p); } diff --git a/src/pairing.c b/src/pairing.c new file mode 100644 index 00000000..a7652a88 --- /dev/null +++ b/src/pairing.c @@ -0,0 +1,407 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "point.h" +#include "fields.h" + +/* + * Line evaluations from https://eprint.iacr.org/2010/354.pdf + * with a twist moving common expression to line_by_Px2. + */ +static void line_add(vec384fp6 line, POINTonE2 *T, const POINTonE2 *R, + const POINTonE2_affine *Q) +{ + vec384x Z1Z1, U2, S2, H, HH, I, J, V; +#if 1 +# define r line[1] +#else + vec384x r; +#endif + + /* + * https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-madd-2007-bl + * with XYZ3 being |T|, XYZ1 - |R|, XY2 - |Q|, i.e. 
Q is affine + */ + sqr_fp2(Z1Z1, R->Z); /* Z1Z1 = Z1^2 */ + mul_fp2(U2, Q->X, Z1Z1); /* U2 = X2*Z1Z1 */ + + mul_fp2(S2, Q->Y, R->Z); + mul_fp2(S2, S2, Z1Z1); /* S2 = Y2*Z1*Z1Z1 */ + + sub_fp2(H, U2, R->X); /* H = U2-X1 */ + + sqr_fp2(HH, H); /* HH = H^2 */ + add_fp2(I, HH, HH); + add_fp2(I, I, I); /* I = 4*HH */ + + mul_fp2(J, H, I); /* J = H*I */ + + sub_fp2(r, S2, R->Y); + add_fp2(r, r, r); /* r = 2*(S2-Y1) */ + + mul_fp2(V, R->X, I); /* V = X1*I */ + + sqr_fp2(T->X, r); + sub_fp2(T->X, T->X, J); + sub_fp2(T->X, T->X, V); + sub_fp2(T->X, T->X, V); /* X3 = r^2-J-2*V */ + + mul_fp2(J, J, R->Y); + sub_fp2(T->Y, V, T->X); + mul_fp2(T->Y, T->Y, r); + sub_fp2(T->Y, T->Y, J); + sub_fp2(T->Y, T->Y, J); /* Y3 = r*(V-X3)-2*Y1*J */ + + add_fp2(T->Z, R->Z, H); + sqr_fp2(T->Z, T->Z); + sub_fp2(T->Z, T->Z, Z1Z1); + sub_fp2(T->Z, T->Z, HH); /* Z3 = (Z1+H)^2-Z1Z1-HH */ + + /* + * line evaluation + */ + mul_fp2(I, r, Q->X); + mul_fp2(J, Q->Y, T->Z); + sub_fp2(I, I, J); + add_fp2(line[0], I, I); /* 2*(r*X2 - Y2*Z3) */ +#ifdef r +# undef r +#else + vec_copy(line[1], r, sizeof(r)); +#endif + vec_copy(line[2], T->Z, sizeof(T->Z)); +} + +static void line_dbl(vec384fp6 line, POINTonE2 *T, const POINTonE2 *Q) +{ + vec384x ZZ, A, B, C, D, E, F; + + /* + * https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-alnr + */ + sqr_fp2(A, Q->X); /* A = X1^2 */ + sqr_fp2(B, Q->Y); /* B = Y1^2 */ + sqr_fp2(ZZ, Q->Z); /* ZZ = Z1^2 */ + sqr_fp2(C, B); /* C = B^2 */ + + add_fp2(D, Q->X, B); /* X1+B */ + sqr_fp2(D, D); /* (X1+B)^2 */ + sub_fp2(D, D, A); /* (X1+B)^2-A */ + sub_fp2(D, D, C); /* (X1+B)^2-A-C */ + add_fp2(D, D, D); /* D = 2*((X1+B)^2-A-C) */ + + mul_by_3_fp2(E, A); /* E = 3*A */ + sqr_fp2(F, E); /* F = E^2 */ + + add_fp2(line[0], E, Q->X); /* 3*A+X1 for line evaluation */ + + sub_fp2(T->X, F, D); + sub_fp2(T->X, T->X, D); /* X3 = F-2*D */ + + add_fp2(T->Z, Q->Y, Q->Z); + sqr_fp2(T->Z, T->Z); + sub_fp2(T->Z, T->Z, B); + sub_fp2(T->Z, T->Z, ZZ); /* Z3 = (Y1+Z1)^2-B-ZZ */ + + mul_by_8_fp2(C, C); /* 8*C */ + sub_fp2(T->Y, D, T->X); /* D-X3 */ + mul_fp2(T->Y, T->Y, E); /* E*(D-X3) */ + sub_fp2(T->Y, T->Y, C); /* Y3 = E*(D-X3)-8*C */ + + /* + * line evaluation + */ + sqr_fp2(line[0], line[0]); + sub_fp2(line[0], line[0], A); + sub_fp2(line[0], line[0], F); /* (3*A+X1)^2 - X1^2 - 9*A^2 */ + lshift_fp2(B, B, 2); + sub_fp2(line[0], line[0], B); /* 6*X1^3 - 4*Y1^2 */ + + mul_fp2(line[1], E, ZZ); /* 3*X1^2 * Z1^2 */ + + mul_fp2(line[2], T->Z, ZZ); /* Z3 * Z1^2 */ +} + +static void line_by_Px2(vec384fp6 line, const POINTonE1_affine *Px2) +{ + mul_fp(line[1][0], line[1][0], Px2->X); /* "b01" *= -2*P->X */ + mul_fp(line[1][1], line[1][1], Px2->X); + + mul_fp(line[2][0], line[2][0], Px2->Y); /* "b11" *= 2*P->Y */ + mul_fp(line[2][1], line[2][1], Px2->Y); +} + +#if 0 +static void add_n_dbl(vec384fp12 ret, POINTonE2 *T, const POINTonE2_affine *Q, + const POINTonE1_affine *Px2, vec384fp6 line, size_t n) +{ + line_add(line, T, T, Q); line_by_Px2(line, Px2); + mul_by_xy00z0_fp12(ret, ret, line); + while (n--) { + sqr_fp12(ret, ret); + line_dbl(line, T, T); line_by_Px2(line, Px2); + mul_by_xy00z0_fp12(ret, ret, line); + } +} + +static void miller_loop(vec384fp12 ret, const POINTonE2 *Q, const POINTonE1 *P) +{ +#define Q ((const POINTonE2_affine *)Q) + POINTonE2 T[1]; + POINTonE1_affine Px2[1]; + vec384fp6 line; /* it's not actual fp6, but 3 packed fp2, "xy00z0" */ + + /* Move common expression from line evaluation to line_by_Px2. 
*/ + add_fp(Px2->X, P->X, P->X); + neg_fp(Px2->X, Px2->X); + add_fp(Px2->Y, P->Y, P->Y); + + vec_copy(T->X, Q->X, 2*sizeof(T->X)); + vec_copy(T->Z, BLS12_381_Rx.p2, sizeof(T->Z)); + + /* first step is ret = 1^2*line, which is replaced with ret = line */ + line_dbl(line, T, T); /* 0x2 */ + line_by_Px2(line, Px2); + vec_zero(ret, sizeof(vec384fp12)); + vec_copy(ret[0][0], line[0], 2*sizeof(vec384fp2)); + vec_copy(ret[1][1], line[2], sizeof(vec384fp2)); + add_n_dbl(ret, T, Q, Px2, line, 2); /* ..0xc */ + add_n_dbl(ret, T, Q, Px2, line, 3); /* ..0x68 */ + add_n_dbl(ret, T, Q, Px2, line, 9); /* ..0xd200 */ + add_n_dbl(ret, T, Q, Px2, line, 32); /* ..0xd20100000000 */ + add_n_dbl(ret, T, Q, Px2, line, 16); /* ..0xd201000000010000 */ + conjugate_fp12(ret); /* account for z being negative */ +#undef Q +} +#endif + +static void start_dbl_n(vec384fp12 ret, POINTonE2 T[], + const POINTonE1_affine Px2[], size_t n) +{ + size_t i; + vec384fp6 line; /* it's not actual fp6, but 3 packed fp2, "xy00z0" */ + + /* first step is ret = 1^2*line, which is replaced with ret = line */ + line_dbl(line, T+0, T+0); line_by_Px2(line, Px2+0); + vec_zero(ret, sizeof(vec384fp12)); + vec_copy(ret[0][0], line[0], 2*sizeof(vec384fp2)); + vec_copy(ret[1][1], line[2], sizeof(vec384fp2)); + + for (i = 1; i < n; i++) { + line_dbl(line, T+i, T+i); line_by_Px2(line, Px2+i); + mul_by_xy00z0_fp12(ret, ret, line); + } +} + +static void add_n_dbl_n(vec384fp12 ret, POINTonE2 T[], + const POINTonE2_affine Q[], + const POINTonE1_affine Px2[], + size_t n, size_t k) +{ + size_t i; + vec384fp6 line; /* it's not actual fp6, but 3 packed fp2, "xy00z0" */ + + for (i = 0; i < n; i++) { + line_add(line, T+i, T+i, Q+i); line_by_Px2(line, Px2+i); + mul_by_xy00z0_fp12(ret, ret, line); + } + while (k--) { + sqr_fp12(ret, ret); + for (i = 0; i < n; i++) { + line_dbl(line, T+i, T+i); line_by_Px2(line, Px2+i); + mul_by_xy00z0_fp12(ret, ret, line); + } + } +} + +static void miller_loop_n(vec384fp12 ret, const POINTonE2_affine Q[], + const POINTonE1_affine P[], size_t n) +{ +#if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901 + POINTonE2 *T = alloca(n*sizeof(POINTonE2)); + POINTonE1_affine *Px2 = alloca(n*sizeof(POINTonE1_affine)); +#else + POINTonE2 T[n]; + POINTonE1_affine Px2[n]; +#endif + size_t i; + + for (i = 0; i < n; i++) { + /* Move common expression from line evaluation to line_by_Px2. 
*/ + add_fp(Px2[i].X, P[i].X, P[i].X); + neg_fp(Px2[i].X, Px2[i].X); + add_fp(Px2[i].Y, P[i].Y, P[i].Y); + + vec_copy(T[i].X, Q[i].X, 2*sizeof(T[i].X)); + vec_copy(T[i].Z, BLS12_381_Rx.p2, sizeof(T[i].Z)); + } + + /* first step is ret = 1^2*line, which is replaced with ret = line */ + start_dbl_n(ret, T, Px2, n); /* 0x2 */ + add_n_dbl_n(ret, T, Q, Px2, n, 2); /* ..0xc */ + add_n_dbl_n(ret, T, Q, Px2, n, 3); /* ..0x68 */ + add_n_dbl_n(ret, T, Q, Px2, n, 9); /* ..0xd200 */ + add_n_dbl_n(ret, T, Q, Px2, n, 32); /* ..0xd20100000000 */ + add_n_dbl_n(ret, T, Q, Px2, n, 16); /* ..0xd201000000010000 */ + conjugate_fp12(ret); /* account for z being negative */ +} + +static void pre_add_n_dbl(vec384fp6 lines[], POINTonE2 *T, + const POINTonE2_affine *Q, + size_t n) +{ + line_add(lines++[0], T, T, Q); + while (n--) + line_dbl(lines++[0], T, T); +} + +static void precompute_lines(vec384fp6 Qlines[68], const POINTonE2_affine *Q) +{ + POINTonE2 T[1]; + + vec_copy(T->X, Q->X, 2*sizeof(T->X)); + vec_copy(T->Z, BLS12_381_Rx.p2, sizeof(T->Z)); + + line_dbl(Qlines[0], T, T); /* 0x2 */ + pre_add_n_dbl(Qlines + 1, T, Q, 2); /* ..0xc */ + pre_add_n_dbl(Qlines + 4, T, Q, 3); /* ..0x68 */ + pre_add_n_dbl(Qlines + 8, T, Q, 9); /* ..0xd200 */ + pre_add_n_dbl(Qlines + 18, T, Q, 32); /* ..0xd20100000000 */ + pre_add_n_dbl(Qlines + 51, T, Q, 16); /* ..0xd201000000010000 */ +} + +static void post_line_by_Px2(vec384fp6 out, const vec384fp6 in, + const POINTonE1_affine *Px2) +{ + vec_copy(out[0], in[0], sizeof(out[0])); + + mul_fp(out[1][0], in[1][0], Px2->X); /* "b01" *= -2*P->X */ + mul_fp(out[1][1], in[1][1], Px2->X); + + mul_fp(out[2][0], in[2][0], Px2->Y); /* "b11" *= 2*P->Y */ + mul_fp(out[2][1], in[2][1], Px2->Y); +} + +static void post_add_n_dbl(vec384fp12 ret, const vec384fp6 lines[], + const POINTonE1_affine *Px2, size_t n) +{ + vec384fp6 line; + + post_line_by_Px2(line, lines++[0], Px2); + mul_by_xy00z0_fp12(ret, ret, line); + while (n--) { + sqr_fp12(ret, ret); + post_line_by_Px2(line, lines++[0], Px2); + mul_by_xy00z0_fp12(ret, ret, line); + } +} + +static void miller_loop_lines(vec384fp12 ret, const vec384fp6 Qlines[68], + const POINTonE1_affine *P) +{ + POINTonE1_affine Px2[1]; + vec384fp6 line; /* it's not actual fp6, but 3 packed fp2, "xy00z0" */ + + /* Move common expression from line evaluation to line_by_Px2. 
*/ + add_fp(Px2->X, P->X, P->X); + neg_fp(Px2->X, Px2->X); + add_fp(Px2->Y, P->Y, P->Y); + + /* first step is ret = 1^2*line, which is replaced with ret = line */ + post_line_by_Px2(line, Qlines[0], Px2); /* 0x2 */ + vec_zero(ret, sizeof(vec384fp12)); + vec_copy(ret[0][0], line[0], 2*sizeof(vec384fp2)); + vec_copy(ret[1][1], line[2], sizeof(vec384fp2)); + post_add_n_dbl(ret, Qlines + 1, Px2, 2); /* ..0xc */ + post_add_n_dbl(ret, Qlines + 4, Px2, 3); /* ..0x68 */ + post_add_n_dbl(ret, Qlines + 8, Px2, 9); /* ..0xd200 */ + post_add_n_dbl(ret, Qlines + 18, Px2, 32); /* ..0xd20100000000 */ + post_add_n_dbl(ret, Qlines + 51, Px2, 16); /* ..0xd201000000010000 */ + conjugate_fp12(ret); /* account for z being negative */ +} + +#ifdef INTERNAL_TESTMODE +static void miller_loop_alt(vec384fp12 ret, const POINTonE2_affine *Q, + const POINTonE1_affine *P) +{ + vec384fp6 lines[68]; + + precompute_lines(lines, Q); + miller_loop_lines(ret, lines, P); +} +#endif + +static void mul_n_sqr(vec384fp12 ret, const vec384fp12 a, size_t n) +{ + mul_fp12(ret, ret, a); + while (n--) + cyclotomic_sqr_fp12(ret, ret); +} + +static void raise_to_z_div_by_2(vec384fp12 ret, const vec384fp12 a) +{ + cyclotomic_sqr_fp12(ret, a); /* 0x2 */ + mul_n_sqr(ret, a, 2); /* ..0xc */ + mul_n_sqr(ret, a, 3); /* ..0x68 */ + mul_n_sqr(ret, a, 9); /* ..0xd200 */ + mul_n_sqr(ret, a, 32); /* ..0xd20100000000 */ + mul_n_sqr(ret, a, 16-1); /* ..0x6900800000008000 */ + conjugate_fp12(ret); /* account for z being negative */ +} + +#define raise_to_z(a, b) (raise_to_z_div_by_2(a, b), cyclotomic_sqr_fp12(a, a)) + +/* + * Adaptation from /pairing/src/bls12_381/mod.rs + */ +static void final_exp(vec384fp12 ret, const vec384fp12 f) +{ + vec384fp12 y0, y1, y2, y3; + + vec_copy(y1, f, sizeof(y1)); + conjugate_fp12(y1); + inverse_fp12(y2, f); + mul_fp12(ret, y1, y2); + frobenius_map_fp12(y2, ret, 2); + mul_fp12(ret, ret, y2); + + cyclotomic_sqr_fp12(y0, ret); + raise_to_z(y1, y0); + raise_to_z_div_by_2(y2, y1); + vec_copy(y3, ret, sizeof(y3)); + conjugate_fp12(y3); + mul_fp12(y1, y1, y3); + conjugate_fp12(y1); + mul_fp12(y1, y1, y2); + raise_to_z(y2, y1); + raise_to_z(y3, y2); + conjugate_fp12(y1); + mul_fp12(y3, y3, y1); + conjugate_fp12(y1); + frobenius_map_fp12(y1, y1, 3); + frobenius_map_fp12(y2, y2, 2); + mul_fp12(y1, y1, y2); + raise_to_z(y2, y3); + mul_fp12(y2, y2, y0); + mul_fp12(y2, y2, ret); + mul_fp12(y1, y1, y2); + frobenius_map_fp12(y2, y3, 1); + mul_fp12(ret, y1, y2); +} + +void blst_miller_loop(vec384fp12 ret, const POINTonE2_affine *Q, + const POINTonE1_affine *P) +{ miller_loop_n(ret, Q, P, 1); } + +void blst_final_exp(vec384fp12 ret, const vec384fp12 f) +{ final_exp(ret, f); } + +void blst_precompute_lines(vec384fp6 Qlines[68], const POINTonE2_affine *Q) +{ precompute_lines(Qlines, Q); } + +void blst_miller_loop_lines(vec384fp12 ret, const vec384fp6 Qlines[68], + const POINTonE1_affine *P) +{ miller_loop_lines(ret, Qlines, P); } diff --git a/src/point.h b/src/point.h new file mode 100644 index 00000000..863e8be1 --- /dev/null +++ b/src/point.h @@ -0,0 +1,66 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLS12_381_ASM_POINT_H__ +#define __BLS12_381_ASM_POINT_H__ + +#include "vect.h" + +#define DECLARE_POINT(ptype, bits) \ +typedef struct { vec##bits X,Y,Z; } ptype; \ +typedef struct { vec##bits X,Y; } ptype##_affine; \ +\ +static void ptype##_dadd(ptype *out, const ptype *p1, const ptype *p2, \ + const vec##bits a4); \ +static void ptype##_dadd_affine(ptype *out, const ptype *p1, \ + const ptype *p2); \ +static void ptype##_add(ptype *out, const ptype *p1, const ptype *p2); \ +static void ptype##_add_affine(ptype *out, const ptype *p1, \ + const ptype *p2); \ +static void ptype##_double(ptype *out, const ptype *p1); \ +static void ptype##_mult_w5(ptype *out, const ptype *point, \ + const limb_t *scalar, size_t nbits); \ +static void ptype##_mult_ladder(ptype *out, const ptype *point, \ + const limb_t *scalar, size_t nbits); \ +static void ptype##_affine_mult_ladder(ptype *out, const ptype *p_aff, \ + const limb_t *scalar, \ + size_t nbits); \ +static void ptype##_cneg(ptype *p, limb_t cbit); \ +static void ptype##_to_affine(ptype##_affine *out, const ptype *in); \ +static void ptype##_from_Jacobian(ptype *out, const ptype *in); \ +\ +static inline void ptype##_cswap(ptype *restrict a, \ + ptype *restrict b, limb_t cbit) { \ + vec_cswap(a, b, sizeof(ptype), cbit); \ +} \ +static inline void ptype##_ccopy(ptype *restrict a, \ + const ptype *restrict b, limb_t cbit) {\ + vec_select(a, b, a, sizeof(ptype), cbit); \ +} + +#define DECLARE_PRIVATE_POINTXZ(ptype, bits) \ +typedef struct { vec##bits X,Z; } ptype##xz; \ +\ +static void ptype##xz_ladder_pre(ptype##xz *out, const ptype *in); \ +static void ptype##xz_ladder_step(ptype##xz *r, ptype##xz *s, \ + const ptype##xz *p); \ +static void ptype##xz_ladder_post(ptype *ret, \ + const ptype##xz *r, const ptype##xz *s, \ + const ptype##xz *p, const vec##bits Y1);\ +\ +static inline void ptype##xz_cswap(ptype##xz *restrict a, \ + ptype##xz *restrict b, limb_t cbit) {\ + vec_cswap(a, b, sizeof(ptype##xz), cbit); \ +} + +DECLARE_POINT(POINTonE1, 384) + +DECLARE_POINT(POINTonE2, 384x) + +#ifdef __GNUC__ +# pragma GCC diagnostic ignored "-Wunused-function" +#endif + +#endif diff --git a/src/recip-addchain.h b/src/recip-addchain.h new file mode 100644 index 00000000..e4e436a3 --- /dev/null +++ b/src/recip-addchain.h @@ -0,0 +1,489 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +/* + * The "magic" number is BLS12_381_P-2. Exponentiation to which yields + * reciprocal to input base. 
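+ * (By Fermat's little theorem a^(BLS12_381_P-2) == a^(-1) mod BLS12_381_P
+ * for any non-zero a, BLS12_381_P being prime.)  The including file is
+ * expected to define mul(), sqr() and sqr_n_mul() (square n times, then
+ * multiply once) for |ptype| prior to expanding the macro below.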
+ * + * Generated with 'addchain 4002409555221667393417789825735904156556882819939007885332058136124031650490837864442687629129015664037894272559785' + * https://github.com/kwantam/addchain + * + * # Bos-Coster (win=4) : 461 (16) <<< + * # Bos-Coster (win=3) : 464 ( 9) + * # Bos-Coster (win=8) : 469 (35) + * # Bos-Coster (win=5) : 463 (28) + * # Bos-Coster (win=9) : 467 (32) + * # Bos-Coster (win=7) : 462 (27) + * # Yacobi : 481 (31) + * # Bos-Coster (win=10) : 475 (30) + * # Bos-Coster (win=6) : 463 (32) + * # Bos-Coster (win=2) : 489 ( 5) + * # Bergeron-Berstel-Brlek-Duboc : 498 ( 5) + */ + +#define RECIPROCAL_MOD_BLS12_381_P(out, inp, ptype) do { \ +ptype t[16]; \ +vec_copy(t[1], inp, sizeof(ptype)); /* 0: 1 */\ +sqr(t[0], t[1]); /* 1: 2 */\ +mul(t[9], t[0], t[1]); /* 2: 3 */\ +sqr(t[5], t[0]); /* 3: 4 */\ +mul(t[2], t[9], t[0]); /* 4: 5 */\ +mul(t[7], t[5], t[9]); /* 5: 7 */\ +mul(t[10], t[2], t[5]); /* 6: 9 */\ +mul(t[13], t[7], t[5]); /* 7: b */\ +mul(t[4], t[10], t[5]); /* 8: d */\ +mul(t[8], t[13], t[5]); /* 9: f */\ +mul(t[15], t[4], t[5]); /* 10: 11 */\ +mul(t[11], t[8], t[5]); /* 11: 13 */\ +mul(t[3], t[15], t[5]); /* 12: 15 */\ +mul(t[12], t[11], t[5]); /* 13: 17 */\ +sqr(t[0], t[4]); /* 14: 1a */\ +mul(t[14], t[12], t[5]); /* 15: 1b */\ +mul(t[6], t[0], t[9]); /* 16: 1d */\ +mul(t[5], t[0], t[2]); /* 17: 1f */\ +/* sqr(t[0], t[0]); */ /* 18: 34 */\ +/* sqr(t[0], t[0]); */ /* 19: 68 */\ +/* sqr(t[0], t[0]); */ /* 20: d0 */\ +/* sqr(t[0], t[0]); */ /* 21: 1a0 */\ +/* sqr(t[0], t[0]); */ /* 22: 340 */\ +/* sqr(t[0], t[0]); */ /* 23: 680 */\ +/* sqr(t[0], t[0]); */ /* 24: d00 */\ +/* sqr(t[0], t[0]); */ /* 25: 1a00 */\ +/* sqr(t[0], t[0]); */ /* 26: 3400 */\ +/* sqr(t[0], t[0]); */ /* 27: 6800 */\ +/* sqr(t[0], t[0]); */ /* 28: d000 */\ +/* sqr(t[0], t[0]); */ /* 29: 1a000 */\ +sqr_n_mul(t[0], t[0], 12, t[15]); /* 30: 1a011 */\ +/* sqr(t[0], t[0]); */ /* 31: 34022 */\ +/* sqr(t[0], t[0]); */ /* 32: 68044 */\ +/* sqr(t[0], t[0]); */ /* 33: d0088 */\ +/* sqr(t[0], t[0]); */ /* 34: 1a0110 */\ +/* sqr(t[0], t[0]); */ /* 35: 340220 */\ +/* sqr(t[0], t[0]); */ /* 36: 680440 */\ +/* sqr(t[0], t[0]); */ /* 37: d00880 */\ +sqr_n_mul(t[0], t[0], 7, t[8]); /* 38: d0088f */\ +/* sqr(t[0], t[0]); */ /* 39: 1a0111e */\ +/* sqr(t[0], t[0]); */ /* 40: 340223c */\ +/* sqr(t[0], t[0]); */ /* 41: 6804478 */\ +/* sqr(t[0], t[0]); */ /* 42: d0088f0 */\ +sqr_n_mul(t[0], t[0], 4, t[2]); /* 43: d0088f5 */\ +/* sqr(t[0], t[0]); */ /* 44: 1a0111ea */\ +/* sqr(t[0], t[0]); */ /* 45: 340223d4 */\ +/* sqr(t[0], t[0]); */ /* 46: 680447a8 */\ +/* sqr(t[0], t[0]); */ /* 47: d0088f50 */\ +/* sqr(t[0], t[0]); */ /* 48: 1a0111ea0 */\ +/* sqr(t[0], t[0]); */ /* 49: 340223d40 */\ +sqr_n_mul(t[0], t[0], 6, t[7]); /* 50: 340223d47 */\ +/* sqr(t[0], t[0]); */ /* 51: 680447a8e */\ +/* sqr(t[0], t[0]); */ /* 52: d0088f51c */\ +/* sqr(t[0], t[0]); */ /* 53: 1a0111ea38 */\ +/* sqr(t[0], t[0]); */ /* 54: 340223d470 */\ +/* sqr(t[0], t[0]); */ /* 55: 680447a8e0 */\ +/* sqr(t[0], t[0]); */ /* 56: d0088f51c0 */\ +/* sqr(t[0], t[0]); */ /* 57: 1a0111ea380 */\ +sqr_n_mul(t[0], t[0], 7, t[12]); /* 58: 1a0111ea397 */\ +/* sqr(t[0], t[0]); */ /* 59: 340223d472e */\ +/* sqr(t[0], t[0]); */ /* 60: 680447a8e5c */\ +/* sqr(t[0], t[0]); */ /* 61: d0088f51cb8 */\ +/* sqr(t[0], t[0]); */ /* 62: 1a0111ea3970 */\ +/* sqr(t[0], t[0]); */ /* 63: 340223d472e0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 64: 340223d472ff */\ +/* sqr(t[0], t[0]); */ /* 65: 680447a8e5fe */\ +/* sqr(t[0], t[0]); */ /* 66: d0088f51cbfc */\ +sqr_n_mul(t[0], t[0], 2, 
t[9]); /* 67: d0088f51cbff */\ +/* sqr(t[0], t[0]); */ /* 68: 1a0111ea397fe */\ +/* sqr(t[0], t[0]); */ /* 69: 340223d472ffc */\ +/* sqr(t[0], t[0]); */ /* 70: 680447a8e5ff8 */\ +/* sqr(t[0], t[0]); */ /* 71: d0088f51cbff0 */\ +/* sqr(t[0], t[0]); */ /* 72: 1a0111ea397fe0 */\ +/* sqr(t[0], t[0]); */ /* 73: 340223d472ffc0 */\ +sqr_n_mul(t[0], t[0], 6, t[4]); /* 74: 340223d472ffcd */\ +/* sqr(t[0], t[0]); */ /* 75: 680447a8e5ff9a */\ +/* sqr(t[0], t[0]); */ /* 76: d0088f51cbff34 */\ +/* sqr(t[0], t[0]); */ /* 77: 1a0111ea397fe68 */\ +/* sqr(t[0], t[0]); */ /* 78: 340223d472ffcd0 */\ +/* sqr(t[0], t[0]); */ /* 79: 680447a8e5ff9a0 */\ +/* sqr(t[0], t[0]); */ /* 80: d0088f51cbff340 */\ +sqr_n_mul(t[0], t[0], 6, t[4]); /* 81: d0088f51cbff34d */\ +/* sqr(t[0], t[0]); */ /* 82: 1a0111ea397fe69a */\ +/* sqr(t[0], t[0]); */ /* 83: 340223d472ffcd34 */\ +/* sqr(t[0], t[0]); */ /* 84: 680447a8e5ff9a68 */\ +/* sqr(t[0], t[0]); */ /* 85: d0088f51cbff34d0 */\ +/* sqr(t[0], t[0]); */ /* 86: 1a0111ea397fe69a0 */\ +/* sqr(t[0], t[0]); */ /* 87: 340223d472ffcd340 */\ +sqr_n_mul(t[0], t[0], 6, t[10]); /* 88: 340223d472ffcd349 */\ +/* sqr(t[0], t[0]); */ /* 89: 680447a8e5ff9a692 */\ +/* sqr(t[0], t[0]); */ /* 90: d0088f51cbff34d24 */\ +/* sqr(t[0], t[0]); */ /* 91: 1a0111ea397fe69a48 */\ +sqr_n_mul(t[0], t[0], 3, t[9]); /* 92: 1a0111ea397fe69a4b */\ +/* sqr(t[0], t[0]); */ /* 93: 340223d472ffcd3496 */\ +/* sqr(t[0], t[0]); */ /* 94: 680447a8e5ff9a692c */\ +/* sqr(t[0], t[0]); */ /* 95: d0088f51cbff34d258 */\ +/* sqr(t[0], t[0]); */ /* 96: 1a0111ea397fe69a4b0 */\ +/* sqr(t[0], t[0]); */ /* 97: 340223d472ffcd34960 */\ +/* sqr(t[0], t[0]); */ /* 98: 680447a8e5ff9a692c0 */\ +/* sqr(t[0], t[0]); */ /* 99: d0088f51cbff34d2580 */\ +sqr_n_mul(t[0], t[0], 7, t[4]); /* 100: d0088f51cbff34d258d */\ +/* sqr(t[0], t[0]); */ /* 101: 1a0111ea397fe69a4b1a */\ +/* sqr(t[0], t[0]); */ /* 102: 340223d472ffcd349634 */\ +/* sqr(t[0], t[0]); */ /* 103: 680447a8e5ff9a692c68 */\ +/* sqr(t[0], t[0]); */ /* 104: d0088f51cbff34d258d0 */\ +sqr_n_mul(t[0], t[0], 4, t[4]); /* 105: d0088f51cbff34d258dd */\ +/* sqr(t[0], t[0]); */ /* 106: 1a0111ea397fe69a4b1ba */\ +/* sqr(t[0], t[0]); */ /* 107: 340223d472ffcd3496374 */\ +/* sqr(t[0], t[0]); */ /* 108: 680447a8e5ff9a692c6e8 */\ +/* sqr(t[0], t[0]); */ /* 109: d0088f51cbff34d258dd0 */\ +/* sqr(t[0], t[0]); */ /* 110: 1a0111ea397fe69a4b1ba0 */\ +/* sqr(t[0], t[0]); */ /* 111: 340223d472ffcd34963740 */\ +sqr_n_mul(t[0], t[0], 6, t[8]); /* 112: 340223d472ffcd3496374f */\ +/* sqr(t[0], t[0]); */ /* 113: 680447a8e5ff9a692c6e9e */\ +/* sqr(t[0], t[0]); */ /* 114: d0088f51cbff34d258dd3c */\ +/* sqr(t[0], t[0]); */ /* 115: 1a0111ea397fe69a4b1ba78 */\ +/* sqr(t[0], t[0]); */ /* 116: 340223d472ffcd3496374f0 */\ +/* sqr(t[0], t[0]); */ /* 117: 680447a8e5ff9a692c6e9e0 */\ +/* sqr(t[0], t[0]); */ /* 118: d0088f51cbff34d258dd3c0 */\ +sqr_n_mul(t[0], t[0], 6, t[14]); /* 119: d0088f51cbff34d258dd3db */\ +/* sqr(t[0], t[0]); */ /* 120: 1a0111ea397fe69a4b1ba7b6 */\ +/* sqr(t[0], t[0]); */ /* 121: 340223d472ffcd3496374f6c */\ +/* sqr(t[0], t[0]); */ /* 122: 680447a8e5ff9a692c6e9ed8 */\ +sqr_n_mul(t[0], t[0], 3, t[1]); /* 123: 680447a8e5ff9a692c6e9ed9 */\ +/* sqr(t[0], t[0]); */ /* 124: d0088f51cbff34d258dd3db2 */\ +/* sqr(t[0], t[0]); */ /* 125: 1a0111ea397fe69a4b1ba7b64 */\ +/* sqr(t[0], t[0]); */ /* 126: 340223d472ffcd3496374f6c8 */\ +/* sqr(t[0], t[0]); */ /* 127: 680447a8e5ff9a692c6e9ed90 */\ +/* sqr(t[0], t[0]); */ /* 128: d0088f51cbff34d258dd3db20 */\ +/* sqr(t[0], t[0]); */ /* 129: 
1a0111ea397fe69a4b1ba7b640 */\ +/* sqr(t[0], t[0]); */ /* 130: 340223d472ffcd3496374f6c80 */\ +/* sqr(t[0], t[0]); */ /* 131: 680447a8e5ff9a692c6e9ed900 */\ +sqr_n_mul(t[0], t[0], 8, t[4]); /* 132: 680447a8e5ff9a692c6e9ed90d */\ +/* sqr(t[0], t[0]); */ /* 133: d0088f51cbff34d258dd3db21a */\ +/* sqr(t[0], t[0]); */ /* 134: 1a0111ea397fe69a4b1ba7b6434 */\ +/* sqr(t[0], t[0]); */ /* 135: 340223d472ffcd3496374f6c868 */\ +/* sqr(t[0], t[0]); */ /* 136: 680447a8e5ff9a692c6e9ed90d0 */\ +/* sqr(t[0], t[0]); */ /* 137: d0088f51cbff34d258dd3db21a0 */\ +/* sqr(t[0], t[0]); */ /* 138: 1a0111ea397fe69a4b1ba7b64340 */\ +/* sqr(t[0], t[0]); */ /* 139: 340223d472ffcd3496374f6c8680 */\ +sqr_n_mul(t[0], t[0], 7, t[12]); /* 140: 340223d472ffcd3496374f6c8697 */\ +/* sqr(t[0], t[0]); */ /* 141: 680447a8e5ff9a692c6e9ed90d2e */\ +/* sqr(t[0], t[0]); */ /* 142: d0088f51cbff34d258dd3db21a5c */\ +/* sqr(t[0], t[0]); */ /* 143: 1a0111ea397fe69a4b1ba7b6434b8 */\ +/* sqr(t[0], t[0]); */ /* 144: 340223d472ffcd3496374f6c86970 */\ +/* sqr(t[0], t[0]); */ /* 145: 680447a8e5ff9a692c6e9ed90d2e0 */\ +sqr_n_mul(t[0], t[0], 5, t[13]); /* 146: 680447a8e5ff9a692c6e9ed90d2eb */\ +/* sqr(t[0], t[0]); */ /* 147: d0088f51cbff34d258dd3db21a5d6 */\ +/* sqr(t[0], t[0]); */ /* 148: 1a0111ea397fe69a4b1ba7b6434bac */\ +/* sqr(t[0], t[0]); */ /* 149: 340223d472ffcd3496374f6c869758 */\ +/* sqr(t[0], t[0]); */ /* 150: 680447a8e5ff9a692c6e9ed90d2eb0 */\ +/* sqr(t[0], t[0]); */ /* 151: d0088f51cbff34d258dd3db21a5d60 */\ +/* sqr(t[0], t[0]); */ /* 152: 1a0111ea397fe69a4b1ba7b6434bac0 */\ +sqr_n_mul(t[0], t[0], 6, t[4]); /* 153: 1a0111ea397fe69a4b1ba7b6434bacd */\ +/* sqr(t[0], t[0]); */ /* 154: 340223d472ffcd3496374f6c869759a */\ +/* sqr(t[0], t[0]); */ /* 155: 680447a8e5ff9a692c6e9ed90d2eb34 */\ +/* sqr(t[0], t[0]); */ /* 156: d0088f51cbff34d258dd3db21a5d668 */\ +/* sqr(t[0], t[0]); */ /* 157: 1a0111ea397fe69a4b1ba7b6434bacd0 */\ +/* sqr(t[0], t[0]); */ /* 158: 340223d472ffcd3496374f6c869759a0 */\ +/* sqr(t[0], t[0]); */ /* 159: 680447a8e5ff9a692c6e9ed90d2eb340 */\ +sqr_n_mul(t[0], t[0], 6, t[6]); /* 160: 680447a8e5ff9a692c6e9ed90d2eb35d */\ +/* sqr(t[0], t[0]); */ /* 161: d0088f51cbff34d258dd3db21a5d66ba */\ +/* sqr(t[0], t[0]); */ /* 162: 1a0111ea397fe69a4b1ba7b6434bacd74 */\ +/* sqr(t[0], t[0]); */ /* 163: 340223d472ffcd3496374f6c869759ae8 */\ +/* sqr(t[0], t[0]); */ /* 164: 680447a8e5ff9a692c6e9ed90d2eb35d0 */\ +sqr_n_mul(t[0], t[0], 4, t[10]); /* 165: 680447a8e5ff9a692c6e9ed90d2eb35d9 */\ +/* sqr(t[0], t[0]); */ /* 166: d0088f51cbff34d258dd3db21a5d66bb2 */\ +/* sqr(t[0], t[0]); */ /* 167: 1a0111ea397fe69a4b1ba7b6434bacd764 */\ +/* sqr(t[0], t[0]); */ /* 168: 340223d472ffcd3496374f6c869759aec8 */\ +/* sqr(t[0], t[0]); */ /* 169: 680447a8e5ff9a692c6e9ed90d2eb35d90 */\ +/* sqr(t[0], t[0]); */ /* 170: d0088f51cbff34d258dd3db21a5d66bb20 */\ +/* sqr(t[0], t[0]); */ /* 171: 1a0111ea397fe69a4b1ba7b6434bacd7640 */\ +/* sqr(t[0], t[0]); */ /* 172: 340223d472ffcd3496374f6c869759aec80 */\ +/* sqr(t[0], t[0]); */ /* 173: 680447a8e5ff9a692c6e9ed90d2eb35d900 */\ +sqr_n_mul(t[0], t[0], 8, t[6]); /* 174: 680447a8e5ff9a692c6e9ed90d2eb35d91d */\ +/* sqr(t[0], t[0]); */ /* 175: d0088f51cbff34d258dd3db21a5d66bb23a */\ +/* sqr(t[0], t[0]); */ /* 176: 1a0111ea397fe69a4b1ba7b6434bacd76474 */\ +/* sqr(t[0], t[0]); */ /* 177: 340223d472ffcd3496374f6c869759aec8e8 */\ +/* sqr(t[0], t[0]); */ /* 178: 680447a8e5ff9a692c6e9ed90d2eb35d91d0 */\ +sqr_n_mul(t[0], t[0], 4, t[4]); /* 179: 680447a8e5ff9a692c6e9ed90d2eb35d91dd */\ +/* sqr(t[0], t[0]); */ /* 180: 
d0088f51cbff34d258dd3db21a5d66bb23ba */\ +/* sqr(t[0], t[0]); */ /* 181: 1a0111ea397fe69a4b1ba7b6434bacd764774 */\ +/* sqr(t[0], t[0]); */ /* 182: 340223d472ffcd3496374f6c869759aec8ee8 */\ +/* sqr(t[0], t[0]); */ /* 183: 680447a8e5ff9a692c6e9ed90d2eb35d91dd0 */\ +/* sqr(t[0], t[0]); */ /* 184: d0088f51cbff34d258dd3db21a5d66bb23ba0 */\ +/* sqr(t[0], t[0]); */ /* 185: 1a0111ea397fe69a4b1ba7b6434bacd7647740 */\ +/* sqr(t[0], t[0]); */ /* 186: 340223d472ffcd3496374f6c869759aec8ee80 */\ +sqr_n_mul(t[0], t[0], 7, t[12]); /* 187: 340223d472ffcd3496374f6c869759aec8ee97 */\ +/* sqr(t[0], t[0]); */ /* 188: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e */\ +/* sqr(t[0], t[0]); */ /* 189: d0088f51cbff34d258dd3db21a5d66bb23ba5c */\ +/* sqr(t[0], t[0]); */ /* 190: 1a0111ea397fe69a4b1ba7b6434bacd764774b8 */\ +/* sqr(t[0], t[0]); */ /* 191: 340223d472ffcd3496374f6c869759aec8ee970 */\ +/* sqr(t[0], t[0]); */ /* 192: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e0 */\ +/* sqr(t[0], t[0]); */ /* 193: d0088f51cbff34d258dd3db21a5d66bb23ba5c0 */\ +/* sqr(t[0], t[0]); */ /* 194: 1a0111ea397fe69a4b1ba7b6434bacd764774b80 */\ +/* sqr(t[0], t[0]); */ /* 195: 340223d472ffcd3496374f6c869759aec8ee9700 */\ +/* sqr(t[0], t[0]); */ /* 196: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e00 */\ +sqr_n_mul(t[0], t[0], 9, t[11]); /* 197: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13 */\ +/* sqr(t[0], t[0]); */ /* 198: d0088f51cbff34d258dd3db21a5d66bb23ba5c26 */\ +/* sqr(t[0], t[0]); */ /* 199: 1a0111ea397fe69a4b1ba7b6434bacd764774b84c */\ +sqr_n_mul(t[0], t[0], 2, t[9]); /* 200: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f */\ +/* sqr(t[0], t[0]); */ /* 201: 340223d472ffcd3496374f6c869759aec8ee9709e */\ +/* sqr(t[0], t[0]); */ /* 202: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13c */\ +/* sqr(t[0], t[0]); */ /* 203: d0088f51cbff34d258dd3db21a5d66bb23ba5c278 */\ +/* sqr(t[0], t[0]); */ /* 204: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f0 */\ +/* sqr(t[0], t[0]); */ /* 205: 340223d472ffcd3496374f6c869759aec8ee9709e0 */\ +sqr_n_mul(t[0], t[0], 5, t[7]); /* 206: 340223d472ffcd3496374f6c869759aec8ee9709e7 */\ +/* sqr(t[0], t[0]); */ /* 207: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce */\ +/* sqr(t[0], t[0]); */ /* 208: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c */\ +/* sqr(t[0], t[0]); */ /* 209: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38 */\ +/* sqr(t[0], t[0]); */ /* 210: 340223d472ffcd3496374f6c869759aec8ee9709e70 */\ +/* sqr(t[0], t[0]); */ /* 211: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce0 */\ +/* sqr(t[0], t[0]); */ /* 212: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c0 */\ +/* sqr(t[0], t[0]); */ /* 213: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f380 */\ +sqr_n_mul(t[0], t[0], 7, t[2]); /* 214: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f385 */\ +/* sqr(t[0], t[0]); */ /* 215: 340223d472ffcd3496374f6c869759aec8ee9709e70a */\ +/* sqr(t[0], t[0]); */ /* 216: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce14 */\ +/* sqr(t[0], t[0]); */ /* 217: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c28 */\ +/* sqr(t[0], t[0]); */ /* 218: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f3850 */\ +/* sqr(t[0], t[0]); */ /* 219: 340223d472ffcd3496374f6c869759aec8ee9709e70a0 */\ +/* sqr(t[0], t[0]); */ /* 220: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce140 */\ +/* sqr(t[0], t[0]); */ /* 221: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c280 */\ +sqr_n_mul(t[0], t[0], 7, t[10]); /* 222: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c289 */\ +/* sqr(t[0], t[0]); */ /* 223: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512 */\ +/* sqr(t[0], t[0]); */ /* 224: 340223d472ffcd3496374f6c869759aec8ee9709e70a24 */\ +/* 
sqr(t[0], t[0]); */ /* 225: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce1448 */\ +/* sqr(t[0], t[0]); */ /* 226: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2890 */\ +/* sqr(t[0], t[0]); */ /* 227: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f385120 */\ +/* sqr(t[0], t[0]); */ /* 228: 340223d472ffcd3496374f6c869759aec8ee9709e70a240 */\ +sqr_n_mul(t[0], t[0], 6, t[12]); /* 229: 340223d472ffcd3496374f6c869759aec8ee9709e70a257 */\ +/* sqr(t[0], t[0]); */ /* 230: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144ae */\ +/* sqr(t[0], t[0]); */ /* 231: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895c */\ +/* sqr(t[0], t[0]); */ /* 232: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512b8 */\ +/* sqr(t[0], t[0]); */ /* 233: 340223d472ffcd3496374f6c869759aec8ee9709e70a2570 */\ +/* sqr(t[0], t[0]); */ /* 234: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144ae0 */\ +sqr_n_mul(t[0], t[0], 5, t[6]); /* 235: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd */\ +/* sqr(t[0], t[0]); */ /* 236: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fa */\ +/* sqr(t[0], t[0]); */ /* 237: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf4 */\ +/* sqr(t[0], t[0]); */ /* 238: 340223d472ffcd3496374f6c869759aec8ee9709e70a257e8 */\ +/* sqr(t[0], t[0]); */ /* 239: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd0 */\ +/* sqr(t[0], t[0]); */ /* 240: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fa0 */\ +sqr_n_mul(t[0], t[0], 5, t[11]); /* 241: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3 */\ +/* sqr(t[0], t[0]); */ /* 242: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf66 */\ +/* sqr(t[0], t[0]); */ /* 243: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ecc */\ +/* sqr(t[0], t[0]); */ /* 244: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd98 */\ +/* sqr(t[0], t[0]); */ /* 245: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb30 */\ +/* sqr(t[0], t[0]); */ /* 246: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf660 */\ +sqr_n_mul(t[0], t[0], 5, t[11]); /* 247: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf673 */\ +/* sqr(t[0], t[0]); */ /* 248: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece6 */\ +/* sqr(t[0], t[0]); */ /* 249: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc */\ +/* sqr(t[0], t[0]); */ /* 250: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398 */\ +/* sqr(t[0], t[0]); */ /* 251: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730 */\ +/* sqr(t[0], t[0]); */ /* 252: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece60 */\ +/* sqr(t[0], t[0]); */ /* 253: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc0 */\ +/* sqr(t[0], t[0]); */ /* 254: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3980 */\ +/* sqr(t[0], t[0]); */ /* 255: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf67300 */\ +sqr_n_mul(t[0], t[0], 8, t[4]); /* 256: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d */\ +/* sqr(t[0], t[0]); */ /* 257: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a */\ +/* sqr(t[0], t[0]); */ /* 258: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34 */\ +/* sqr(t[0], t[0]); */ /* 259: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39868 */\ +/* sqr(t[0], t[0]); */ /* 260: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d0 */\ +/* sqr(t[0], t[0]); */ /* 261: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a0 */\ +/* sqr(t[0], t[0]); */ /* 262: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc340 */\ +/* sqr(t[0], t[0]); */ /* 263: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398680 */\ +sqr_n_mul(t[0], t[0], 7, t[3]); /* 264: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398695 */\ +/* 
sqr(t[0], t[0]); */ /* 265: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a */\ +/* sqr(t[0], t[0]); */ /* 266: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a54 */\ +/* sqr(t[0], t[0]); */ /* 267: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a8 */\ +/* sqr(t[0], t[0]); */ /* 268: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3986950 */\ +/* sqr(t[0], t[0]); */ /* 269: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0 */\ +/* sqr(t[0], t[0]); */ /* 270: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a540 */\ +/* sqr(t[0], t[0]); */ /* 271: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a80 */\ +/* sqr(t[0], t[0]); */ /* 272: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869500 */\ +/* sqr(t[0], t[0]); */ /* 273: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a00 */\ +sqr_n_mul(t[0], t[0], 9, t[8]); /* 274: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f */\ +/* sqr(t[0], t[0]); */ /* 275: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541e */\ +/* sqr(t[0], t[0]); */ /* 276: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83c */\ +/* sqr(t[0], t[0]); */ /* 277: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398695078 */\ +/* sqr(t[0], t[0]); */ /* 278: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f0 */\ +/* sqr(t[0], t[0]); */ /* 279: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541e0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 280: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed */\ +/* sqr(t[0], t[0]); */ /* 281: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83da */\ +/* sqr(t[0], t[0]); */ /* 282: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b4 */\ +/* sqr(t[0], t[0]); */ /* 283: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f68 */\ +sqr_n_mul(t[0], t[0], 3, t[9]); /* 284: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b */\ +/* sqr(t[0], t[0]); */ /* 285: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed6 */\ +/* sqr(t[0], t[0]); */ /* 286: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac */\ +/* sqr(t[0], t[0]); */ /* 287: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b58 */\ +/* sqr(t[0], t[0]); */ /* 288: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0 */\ +/* sqr(t[0], t[0]); */ /* 289: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed60 */\ +/* sqr(t[0], t[0]); */ /* 290: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac0 */\ +/* sqr(t[0], t[0]); */ /* 291: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b580 */\ +/* sqr(t[0], t[0]); */ /* 292: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b00 */\ +sqr_n_mul(t[0], t[0], 8, t[8]); /* 293: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f */\ +/* sqr(t[0], t[0]); */ /* 294: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61e */\ +/* sqr(t[0], t[0]); */ /* 295: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3c */\ +/* sqr(t[0], t[0]); */ /* 296: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b5878 */\ +sqr_n_mul(t[0], t[0], 3, t[9]); /* 297: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b */\ +/* sqr(t[0], t[0]); */ /* 298: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6 */\ +/* sqr(t[0], t[0]); */ /* 299: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec */\ +/* sqr(t[0], t[0]); */ /* 300: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8 */\ +/* sqr(t[0], t[0]); */ /* 301: 
d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b0 */\ +/* sqr(t[0], t[0]); */ /* 302: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f60 */\ +/* sqr(t[0], t[0]); */ /* 303: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec0 */\ +/* sqr(t[0], t[0]); */ /* 304: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d80 */\ +sqr_n_mul(t[0], t[0], 7, t[10]); /* 305: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d89 */\ +/* sqr(t[0], t[0]); */ /* 306: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b12 */\ +/* sqr(t[0], t[0]); */ /* 307: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f624 */\ +/* sqr(t[0], t[0]); */ /* 308: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec48 */\ +/* sqr(t[0], t[0]); */ /* 309: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d890 */\ +/* sqr(t[0], t[0]); */ /* 310: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120 */\ +/* sqr(t[0], t[0]); */ /* 311: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6240 */\ +/* sqr(t[0], t[0]); */ /* 312: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec480 */\ +/* sqr(t[0], t[0]); */ /* 313: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8900 */\ +/* sqr(t[0], t[0]); */ /* 314: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b1200 */\ +sqr_n_mul(t[0], t[0], 9, t[8]); /* 315: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f */\ +/* sqr(t[0], t[0]); */ /* 316: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241e */\ +/* sqr(t[0], t[0]); */ /* 317: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483c */\ +/* sqr(t[0], t[0]); */ /* 318: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d89078 */\ +/* sqr(t[0], t[0]); */ /* 319: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f0 */\ +/* sqr(t[0], t[0]); */ /* 320: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241e0 */\ +/* sqr(t[0], t[0]); */ /* 321: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483c0 */\ +sqr_n_mul(t[0], t[0], 6, t[3]); /* 322: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d5 */\ +/* sqr(t[0], t[0]); */ /* 323: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aa */\ +/* sqr(t[0], t[0]); */ /* 324: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f54 */\ +/* sqr(t[0], t[0]); */ /* 325: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241ea8 */\ +/* sqr(t[0], t[0]); */ /* 326: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d50 */\ +/* sqr(t[0], t[0]); */ /* 327: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aa0 */\ +/* sqr(t[0], t[0]); */ /* 328: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f540 */\ +sqr_n_mul(t[0], t[0], 6, t[5]); /* 329: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55f */\ +/* sqr(t[0], t[0]); */ /* 330: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabe */\ +/* sqr(t[0], t[0]); */ /* 331: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57c */\ +/* sqr(t[0], t[0]); */ /* 332: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaf8 */\ +/* sqr(t[0], t[0]); */ /* 333: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55f0 */\ +/* sqr(t[0], t[0]); */ /* 334: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabe0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 335: 
1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabff */\ +/* sqr(t[0], t[0]); */ /* 336: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fe */\ +/* sqr(t[0], t[0]); */ /* 337: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffc */\ +/* sqr(t[0], t[0]); */ /* 338: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ff8 */\ +/* sqr(t[0], t[0]); */ /* 339: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabff0 */\ +/* sqr(t[0], t[0]); */ /* 340: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fe0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 341: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fff */\ +/* sqr(t[0], t[0]); */ /* 342: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aafffe */\ +/* sqr(t[0], t[0]); */ /* 343: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55fffc */\ +/* sqr(t[0], t[0]); */ /* 344: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfff8 */\ +/* sqr(t[0], t[0]); */ /* 345: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fff0 */\ +sqr_n_mul(t[0], t[0], 4, t[4]); /* 346: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd */\ +/* sqr(t[0], t[0]); */ /* 347: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffa */\ +/* sqr(t[0], t[0]); */ /* 348: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff4 */\ +/* sqr(t[0], t[0]); */ /* 349: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffe8 */\ +sqr_n_mul(t[0], t[0], 3, t[9]); /* 350: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb */\ +/* sqr(t[0], t[0]); */ /* 351: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd6 */\ +/* sqr(t[0], t[0]); */ /* 352: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac */\ +/* sqr(t[0], t[0]); */ /* 353: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58 */\ +/* sqr(t[0], t[0]); */ /* 354: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb0 */\ +/* sqr(t[0], t[0]); */ /* 355: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd60 */\ +/* sqr(t[0], t[0]); */ /* 356: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac0 */\ +/* sqr(t[0], t[0]); */ /* 357: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff580 */\ +/* sqr(t[0], t[0]); */ /* 358: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb00 */\ +sqr_n_mul(t[0], t[0], 8, t[3]); /* 359: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb15 */\ +/* sqr(t[0], t[0]); */ /* 360: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a */\ +/* sqr(t[0], t[0]); */ /* 361: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54 */\ +/* sqr(t[0], t[0]); */ /* 362: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a8 */\ +/* sqr(t[0], t[0]); */ /* 363: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb150 */\ +/* sqr(t[0], t[0]); */ /* 364: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a0 */\ +/* sqr(t[0], t[0]); */ /* 365: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac540 */\ +/* sqr(t[0], t[0]); */ /* 366: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a80 */\ +sqr_n_mul(t[0], t[0], 7, 
t[5]); /* 367: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9f */\ +/* sqr(t[0], t[0]); */ /* 368: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153e */\ +/* sqr(t[0], t[0]); */ /* 369: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7c */\ +/* sqr(t[0], t[0]); */ /* 370: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54f8 */\ +/* sqr(t[0], t[0]); */ /* 371: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9f0 */\ +/* sqr(t[0], t[0]); */ /* 372: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153e0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 373: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ff */\ +/* sqr(t[0], t[0]); */ /* 374: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fe */\ +/* sqr(t[0], t[0]); */ /* 375: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffc */\ +/* sqr(t[0], t[0]); */ /* 376: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ff8 */\ +/* sqr(t[0], t[0]); */ /* 377: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ff0 */\ +/* sqr(t[0], t[0]); */ /* 378: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fe0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 379: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fff */\ +/* sqr(t[0], t[0]); */ /* 380: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54fffe */\ +/* sqr(t[0], t[0]); */ /* 381: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9fffc */\ +/* sqr(t[0], t[0]); */ /* 382: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153fff8 */\ +/* sqr(t[0], t[0]); */ /* 383: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fff0 */\ +sqr_n_mul(t[0], t[0], 4, t[8]); /* 384: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff */\ +/* sqr(t[0], t[0]); */ /* 385: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffe */\ +/* sqr(t[0], t[0]); */ /* 386: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffc */\ +/* sqr(t[0], t[0]); */ /* 387: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffff8 */\ +/* sqr(t[0], t[0]); */ /* 388: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff0 */\ +sqr_n_mul(t[0], t[0], 4, t[7]); /* 389: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff7 */\ +/* sqr(t[0], t[0]); */ /* 390: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee */\ +/* sqr(t[0], t[0]); */ /* 391: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdc */\ +/* sqr(t[0], t[0]); */ /* 392: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb8 */\ +/* sqr(t[0], t[0]); */ /* 393: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff70 */\ +/* sqr(t[0], t[0]); */ /* 394: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee0 */\ +/* sqr(t[0], t[0]); */ /* 395: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdc0 */\ +/* sqr(t[0], t[0]); */ /* 396: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb80 */\ +sqr_n_mul(t[0], t[0], 7, t[5]); /* 397: 
1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9f */\ +/* sqr(t[0], t[0]); */ /* 398: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73e */\ +/* sqr(t[0], t[0]); */ /* 399: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7c */\ +/* sqr(t[0], t[0]); */ /* 400: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcf8 */\ +/* sqr(t[0], t[0]); */ /* 401: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9f0 */\ +/* sqr(t[0], t[0]); */ /* 402: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73e0 */\ +sqr_n_mul(t[0], t[0], 5, t[6]); /* 403: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fd */\ +/* sqr(t[0], t[0]); */ /* 404: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fa */\ +/* sqr(t[0], t[0]); */ /* 405: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff4 */\ +/* sqr(t[0], t[0]); */ /* 406: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fe8 */\ +/* sqr(t[0], t[0]); */ /* 407: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fd0 */\ +/* sqr(t[0], t[0]); */ /* 408: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fa0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 409: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbf */\ +/* sqr(t[0], t[0]); */ /* 410: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7e */\ +/* sqr(t[0], t[0]); */ /* 411: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefc */\ +/* sqr(t[0], t[0]); */ /* 412: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdf8 */\ +/* sqr(t[0], t[0]); */ /* 413: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbf0 */\ +/* sqr(t[0], t[0]); */ /* 414: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7e0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 415: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ff */\ +/* sqr(t[0], t[0]); */ /* 416: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffe */\ +/* sqr(t[0], t[0]); */ /* 417: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffc */\ +/* sqr(t[0], t[0]); */ /* 418: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbff8 */\ +/* sqr(t[0], t[0]); */ /* 419: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ff0 */\ +/* sqr(t[0], t[0]); */ /* 420: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffe0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 421: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffff */\ +/* sqr(t[0], t[0]); */ /* 422: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffe */\ +/* sqr(t[0], t[0]); */ /* 423: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffc */\ +/* sqr(t[0], t[0]); */ /* 424: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fff8 */\ +/* sqr(t[0], t[0]); */ /* 425: 
1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffff0 */\ +/* sqr(t[0], t[0]); */ /* 426: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffe0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 427: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffff */\ +/* sqr(t[0], t[0]); */ /* 428: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffe */\ +/* sqr(t[0], t[0]); */ /* 429: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ffffc */\ +/* sqr(t[0], t[0]); */ /* 430: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefffff8 */\ +/* sqr(t[0], t[0]); */ /* 431: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffff0 */\ +/* sqr(t[0], t[0]); */ /* 432: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffe0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 433: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffff */\ +/* sqr(t[0], t[0]); */ /* 434: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffe */\ +/* sqr(t[0], t[0]); */ /* 435: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffc */\ +/* sqr(t[0], t[0]); */ /* 436: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffff8 */\ +/* sqr(t[0], t[0]); */ /* 437: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffff0 */\ +/* sqr(t[0], t[0]); */ /* 438: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffe0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 439: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffff */\ +/* sqr(t[0], t[0]); */ /* 440: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefffffffe */\ +/* sqr(t[0], t[0]); */ /* 441: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffffffc */\ +/* sqr(t[0], t[0]); */ /* 442: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffff8 */\ +/* sqr(t[0], t[0]); */ /* 443: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffff0 */\ +sqr_n_mul(t[0], t[0], 4, t[4]); /* 444: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd */\ +/* sqr(t[0], t[0]); */ /* 445: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffa */\ +/* sqr(t[0], t[0]); */ /* 446: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff4 */\ +/* sqr(t[0], t[0]); */ /* 447: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffe8 */\ +/* sqr(t[0], t[0]); */ /* 448: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd0 */\ +/* sqr(t[0], t[0]); */ /* 449: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffa0 */\ +/* sqr(t[0], t[0]); */ /* 450: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff40 */\ +sqr_n_mul(t[0], t[0], 6, t[3]); /* 451: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff55 */\ +/* sqr(t[0], t[0]); */ /* 452: 
680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffeaa */\ +/* sqr(t[0], t[0]); */ /* 453: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd54 */\ +/* sqr(t[0], t[0]); */ /* 454: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaa8 */\ +/* sqr(t[0], t[0]); */ /* 455: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff550 */\ +sqr_n_mul(t[0], t[0], 4, t[2]); /* 456: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff555 */\ +/* sqr(t[0], t[0]); */ /* 457: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffeaaa */\ +/* sqr(t[0], t[0]); */ /* 458: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd554 */\ +/* sqr(t[0], t[0]); */ /* 459: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaa8 */\ +sqr_n_mul(out, t[0], 3, t[1]); /* 460: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaa9 */\ +} while(0) diff --git a/src/server.c b/src/server.c new file mode 100644 index 00000000..73938dc7 --- /dev/null +++ b/src/server.c @@ -0,0 +1,20 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "keygen.c" +#include "hash_to_field.c" +#include "e1.c" +#include "exp.c" +#include "map_to_g1.c" +#include "e2.c" +#include "exp2.c" +#include "map_to_g2.c" +#include "fp12_tower.c" +#include "pairing.c" +#include "aggregate.c" +#include "consts.c" +#include "vect.c" +#include "exports.c" diff --git a/src/sha256.h b/src/sha256.h new file mode 100644 index 00000000..5db30f80 --- /dev/null +++ b/src/sha256.h @@ -0,0 +1,130 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLS12_381_ASM_SHA256_H__ +#define __BLS12_381_ASM_SHA256_H__ + +#include + +void sha256_hcopy(unsigned int dst[8], const unsigned int src[8]); +void sha256_bcopy(void *dst, const void *src, size_t len); + +/* + * If SHA256_CTX conflicts with something, just redefine it to alternative + * custom name prior including this header. 
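+ * (For example, "#define SHA256_CTX my_sha256_ctx" before the #include;
+ * the replacement name here is only illustrative.)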
+ */ +typedef struct { + unsigned int h[8]; + unsigned long long N; + unsigned char buf[64]; + size_t off; +} SHA256_CTX; + +#ifdef __SHA__ /* -msha */ +# define sha256_block_data_order sha256_block_data_order_shaext +#endif +void sha256_block_data_order(unsigned int *h, const void *inp, size_t blocks); + +static void sha256_init_h(unsigned int h[8]) +{ + h[0] = 0x6a09e667U; + h[1] = 0xbb67ae85U; + h[2] = 0x3c6ef372U; + h[3] = 0xa54ff53aU; + h[4] = 0x510e527fU; + h[5] = 0x9b05688cU; + h[6] = 0x1f83d9abU; + h[7] = 0x5be0cd19U; +} + +static void sha256_init(SHA256_CTX *ctx) +{ + sha256_init_h(ctx->h); + ctx->N = 0; + vec_zero(ctx->buf, sizeof(ctx->buf)); + ctx->off = 0; +} + +static void sha256_update(SHA256_CTX *ctx, const void *_inp, size_t len) +{ + size_t n; + const unsigned char *inp = _inp; + + ctx->N += len; + + if ((n = ctx->off)) { + size_t rem = sizeof(ctx->buf) - n; + + if (rem > len) { + sha256_bcopy(ctx->buf + n, inp, len); + ctx->off += len; + return; + } else { + sha256_bcopy(ctx->buf + n, inp, rem); + inp += rem; + len -= rem; + sha256_block_data_order(ctx->h, ctx->buf, 1); + vec_zero(ctx->buf, sizeof(ctx->buf)); + ctx->off = 0; + } + } + + n = len / sizeof(ctx->buf); + if (n > 0) { + sha256_block_data_order(ctx->h, inp, n); + n *= sizeof(ctx->buf); + inp += n; + len -= n; + } + + if (len) + sha256_bcopy(ctx->buf, inp, ctx->off = len); +} + +#define __TOBE32(ptr, val) ((ptr)[0] = (unsigned char)((val)>>24), \ + (ptr)[1] = (unsigned char)((val)>>16), \ + (ptr)[2] = (unsigned char)((val)>>8), \ + (ptr)[3] = (unsigned char)(val)) + +#if 1 +void sha256_emit(unsigned char md[32], const unsigned int h[8]); +#else +static void sha256_emit(unsigned char md[32], const unsigned int h[8]) +{ + unsigned int h_i; + + h_i = h[0]; __TOBE32(md + 0, h_i); + h_i = h[1]; __TOBE32(md + 4, h_i); + h_i = h[2]; __TOBE32(md + 8, h_i); + h_i = h[3]; __TOBE32(md + 12, h_i); + h_i = h[4]; __TOBE32(md + 16, h_i); + h_i = h[5]; __TOBE32(md + 20, h_i); + h_i = h[6]; __TOBE32(md + 24, h_i); + h_i = h[7]; __TOBE32(md + 28, h_i); +} +#endif + +static void sha256_final(unsigned char md[32], SHA256_CTX *ctx) +{ + unsigned long long bits = ctx->N * 8; + size_t n = ctx->off; + unsigned char *tail; + + ctx->buf[n++] = 0x80; + + if (n > (sizeof(ctx->buf) - 8)) { + sha256_block_data_order(ctx->h, ctx->buf, 1); + vec_zero(ctx->buf, sizeof(ctx->buf)); + } + + tail = ctx->buf + sizeof(ctx->buf) - 8; + __TOBE32(tail, (unsigned int)(bits >> 32)); + __TOBE32(tail + 4, (unsigned int)bits); + sha256_block_data_order(ctx->h, ctx->buf, 1); + sha256_emit(md, ctx->h); +} + +#undef __TOBE32 +#endif diff --git a/src/sqrt-addchain.h b/src/sqrt-addchain.h new file mode 100644 index 00000000..4e7f0beb --- /dev/null +++ b/src/sqrt-addchain.h @@ -0,0 +1,489 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +/* + * The "magic" number is (BLS12_381_P-3)/4. Exponentiation to which + * yields reciprocal of sqrt(x), which is used in simplified Shallue- + * van de Woestijne-Ulas map-to-curve method, but it's trivial to adapt + * it for more "traditional" sqrt(x) as 'x*ret' (or for is_square(x) + * as 'x*ret^2==1'). 
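+ * (This works because BLS12_381_P == 3 mod 4: for a square x,
+ * x^((P+1)/4) is a square root, hence ret = x^((P-3)/4) = sqrt(x)/x
+ * = 1/sqrt(x), and x*ret^2 = x^((P-1)/2) is the Legendre symbol.)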
+ * + * Generated with 'addchain 1000602388805416848354447456433976039139220704984751971333014534031007912622709466110671907282253916009473568139946' + * https://github.com/kwantam/addchain + * + * # Bos-Coster (win=4) : 458 (16) <<< + * # Bos-Coster (win=5) : 460 (28) + * # Bos-Coster (win=6) : 461 (33) + * # Bos-Coster (win=7) : 460 (28) + * # Bos-Coster (win=3) : 462 ( 9) + * # Bos-Coster (win=8) : 466 (34) + * # Bos-Coster (win=9) : 464 (31) + * # Yacobi : 478 (31) + * # Bos-Coster (win=10) : 473 (30) + * # Bos-Coster (win=2) : 486 ( 5) + * # Bergeron-Berstel-Brlek-Duboc : 489 ( 5) + */ + +#define RECIP_SQRT_MOD_BLS12_381_P(out, inp, ptype) do { \ +ptype t[16]; \ +vec_copy(t[13], inp, sizeof(ptype));/* 0: 1 */\ +sqr(t[0], t[13]); /* 1: 2 */\ +mul(t[8], t[0], t[13]); /* 2: 3 */\ +sqr(t[4], t[0]); /* 3: 4 */\ +mul(t[1], t[8], t[0]); /* 4: 5 */\ +mul(t[6], t[4], t[8]); /* 5: 7 */\ +mul(t[9], t[1], t[4]); /* 6: 9 */\ +mul(t[12], t[6], t[4]); /* 7: b */\ +mul(t[3], t[9], t[4]); /* 8: d */\ +mul(t[7], t[12], t[4]); /* 9: f */\ +mul(t[15], t[3], t[4]); /* 10: 11 */\ +mul(t[10], t[7], t[4]); /* 11: 13 */\ +mul(t[2], t[15], t[4]); /* 12: 15 */\ +mul(t[11], t[10], t[4]); /* 13: 17 */\ +sqr(t[0], t[3]); /* 14: 1a */\ +mul(t[14], t[11], t[4]); /* 15: 1b */\ +mul(t[5], t[0], t[8]); /* 16: 1d */\ +mul(t[4], t[0], t[1]); /* 17: 1f */\ +/* sqr(t[0], t[0]); */ /* 18: 34 */\ +/* sqr(t[0], t[0]); */ /* 19: 68 */\ +/* sqr(t[0], t[0]); */ /* 20: d0 */\ +/* sqr(t[0], t[0]); */ /* 21: 1a0 */\ +/* sqr(t[0], t[0]); */ /* 22: 340 */\ +/* sqr(t[0], t[0]); */ /* 23: 680 */\ +/* sqr(t[0], t[0]); */ /* 24: d00 */\ +/* sqr(t[0], t[0]); */ /* 25: 1a00 */\ +/* sqr(t[0], t[0]); */ /* 26: 3400 */\ +/* sqr(t[0], t[0]); */ /* 27: 6800 */\ +/* sqr(t[0], t[0]); */ /* 28: d000 */\ +/* sqr(t[0], t[0]); */ /* 29: 1a000 */\ +sqr_n_mul(t[0], t[0], 12, t[15]); /* 30: 1a011 */\ +/* sqr(t[0], t[0]); */ /* 31: 34022 */\ +/* sqr(t[0], t[0]); */ /* 32: 68044 */\ +/* sqr(t[0], t[0]); */ /* 33: d0088 */\ +/* sqr(t[0], t[0]); */ /* 34: 1a0110 */\ +/* sqr(t[0], t[0]); */ /* 35: 340220 */\ +/* sqr(t[0], t[0]); */ /* 36: 680440 */\ +/* sqr(t[0], t[0]); */ /* 37: d00880 */\ +sqr_n_mul(t[0], t[0], 7, t[7]); /* 38: d0088f */\ +/* sqr(t[0], t[0]); */ /* 39: 1a0111e */\ +/* sqr(t[0], t[0]); */ /* 40: 340223c */\ +/* sqr(t[0], t[0]); */ /* 41: 6804478 */\ +/* sqr(t[0], t[0]); */ /* 42: d0088f0 */\ +sqr_n_mul(t[0], t[0], 4, t[1]); /* 43: d0088f5 */\ +/* sqr(t[0], t[0]); */ /* 44: 1a0111ea */\ +/* sqr(t[0], t[0]); */ /* 45: 340223d4 */\ +/* sqr(t[0], t[0]); */ /* 46: 680447a8 */\ +/* sqr(t[0], t[0]); */ /* 47: d0088f50 */\ +/* sqr(t[0], t[0]); */ /* 48: 1a0111ea0 */\ +/* sqr(t[0], t[0]); */ /* 49: 340223d40 */\ +sqr_n_mul(t[0], t[0], 6, t[6]); /* 50: 340223d47 */\ +/* sqr(t[0], t[0]); */ /* 51: 680447a8e */\ +/* sqr(t[0], t[0]); */ /* 52: d0088f51c */\ +/* sqr(t[0], t[0]); */ /* 53: 1a0111ea38 */\ +/* sqr(t[0], t[0]); */ /* 54: 340223d470 */\ +/* sqr(t[0], t[0]); */ /* 55: 680447a8e0 */\ +/* sqr(t[0], t[0]); */ /* 56: d0088f51c0 */\ +/* sqr(t[0], t[0]); */ /* 57: 1a0111ea380 */\ +sqr_n_mul(t[0], t[0], 7, t[11]); /* 58: 1a0111ea397 */\ +/* sqr(t[0], t[0]); */ /* 59: 340223d472e */\ +/* sqr(t[0], t[0]); */ /* 60: 680447a8e5c */\ +/* sqr(t[0], t[0]); */ /* 61: d0088f51cb8 */\ +/* sqr(t[0], t[0]); */ /* 62: 1a0111ea3970 */\ +/* sqr(t[0], t[0]); */ /* 63: 340223d472e0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 64: 340223d472ff */\ +/* sqr(t[0], t[0]); */ /* 65: 680447a8e5fe */\ +/* sqr(t[0], t[0]); */ /* 66: d0088f51cbfc */\ +sqr_n_mul(t[0], t[0], 2, 
t[8]); /* 67: d0088f51cbff */\ +/* sqr(t[0], t[0]); */ /* 68: 1a0111ea397fe */\ +/* sqr(t[0], t[0]); */ /* 69: 340223d472ffc */\ +/* sqr(t[0], t[0]); */ /* 70: 680447a8e5ff8 */\ +/* sqr(t[0], t[0]); */ /* 71: d0088f51cbff0 */\ +/* sqr(t[0], t[0]); */ /* 72: 1a0111ea397fe0 */\ +/* sqr(t[0], t[0]); */ /* 73: 340223d472ffc0 */\ +sqr_n_mul(t[0], t[0], 6, t[3]); /* 74: 340223d472ffcd */\ +/* sqr(t[0], t[0]); */ /* 75: 680447a8e5ff9a */\ +/* sqr(t[0], t[0]); */ /* 76: d0088f51cbff34 */\ +/* sqr(t[0], t[0]); */ /* 77: 1a0111ea397fe68 */\ +/* sqr(t[0], t[0]); */ /* 78: 340223d472ffcd0 */\ +/* sqr(t[0], t[0]); */ /* 79: 680447a8e5ff9a0 */\ +/* sqr(t[0], t[0]); */ /* 80: d0088f51cbff340 */\ +sqr_n_mul(t[0], t[0], 6, t[3]); /* 81: d0088f51cbff34d */\ +/* sqr(t[0], t[0]); */ /* 82: 1a0111ea397fe69a */\ +/* sqr(t[0], t[0]); */ /* 83: 340223d472ffcd34 */\ +/* sqr(t[0], t[0]); */ /* 84: 680447a8e5ff9a68 */\ +/* sqr(t[0], t[0]); */ /* 85: d0088f51cbff34d0 */\ +/* sqr(t[0], t[0]); */ /* 86: 1a0111ea397fe69a0 */\ +/* sqr(t[0], t[0]); */ /* 87: 340223d472ffcd340 */\ +sqr_n_mul(t[0], t[0], 6, t[9]); /* 88: 340223d472ffcd349 */\ +/* sqr(t[0], t[0]); */ /* 89: 680447a8e5ff9a692 */\ +/* sqr(t[0], t[0]); */ /* 90: d0088f51cbff34d24 */\ +/* sqr(t[0], t[0]); */ /* 91: 1a0111ea397fe69a48 */\ +sqr_n_mul(t[0], t[0], 3, t[8]); /* 92: 1a0111ea397fe69a4b */\ +/* sqr(t[0], t[0]); */ /* 93: 340223d472ffcd3496 */\ +/* sqr(t[0], t[0]); */ /* 94: 680447a8e5ff9a692c */\ +/* sqr(t[0], t[0]); */ /* 95: d0088f51cbff34d258 */\ +/* sqr(t[0], t[0]); */ /* 96: 1a0111ea397fe69a4b0 */\ +/* sqr(t[0], t[0]); */ /* 97: 340223d472ffcd34960 */\ +/* sqr(t[0], t[0]); */ /* 98: 680447a8e5ff9a692c0 */\ +/* sqr(t[0], t[0]); */ /* 99: d0088f51cbff34d2580 */\ +sqr_n_mul(t[0], t[0], 7, t[3]); /* 100: d0088f51cbff34d258d */\ +/* sqr(t[0], t[0]); */ /* 101: 1a0111ea397fe69a4b1a */\ +/* sqr(t[0], t[0]); */ /* 102: 340223d472ffcd349634 */\ +/* sqr(t[0], t[0]); */ /* 103: 680447a8e5ff9a692c68 */\ +/* sqr(t[0], t[0]); */ /* 104: d0088f51cbff34d258d0 */\ +sqr_n_mul(t[0], t[0], 4, t[3]); /* 105: d0088f51cbff34d258dd */\ +/* sqr(t[0], t[0]); */ /* 106: 1a0111ea397fe69a4b1ba */\ +/* sqr(t[0], t[0]); */ /* 107: 340223d472ffcd3496374 */\ +/* sqr(t[0], t[0]); */ /* 108: 680447a8e5ff9a692c6e8 */\ +/* sqr(t[0], t[0]); */ /* 109: d0088f51cbff34d258dd0 */\ +/* sqr(t[0], t[0]); */ /* 110: 1a0111ea397fe69a4b1ba0 */\ +/* sqr(t[0], t[0]); */ /* 111: 340223d472ffcd34963740 */\ +sqr_n_mul(t[0], t[0], 6, t[7]); /* 112: 340223d472ffcd3496374f */\ +/* sqr(t[0], t[0]); */ /* 113: 680447a8e5ff9a692c6e9e */\ +/* sqr(t[0], t[0]); */ /* 114: d0088f51cbff34d258dd3c */\ +/* sqr(t[0], t[0]); */ /* 115: 1a0111ea397fe69a4b1ba78 */\ +/* sqr(t[0], t[0]); */ /* 116: 340223d472ffcd3496374f0 */\ +/* sqr(t[0], t[0]); */ /* 117: 680447a8e5ff9a692c6e9e0 */\ +/* sqr(t[0], t[0]); */ /* 118: d0088f51cbff34d258dd3c0 */\ +sqr_n_mul(t[0], t[0], 6, t[14]); /* 119: d0088f51cbff34d258dd3db */\ +/* sqr(t[0], t[0]); */ /* 120: 1a0111ea397fe69a4b1ba7b6 */\ +/* sqr(t[0], t[0]); */ /* 121: 340223d472ffcd3496374f6c */\ +/* sqr(t[0], t[0]); */ /* 122: 680447a8e5ff9a692c6e9ed8 */\ +sqr_n_mul(t[0], t[0], 3, t[13]); /* 123: 680447a8e5ff9a692c6e9ed9 */\ +/* sqr(t[0], t[0]); */ /* 124: d0088f51cbff34d258dd3db2 */\ +/* sqr(t[0], t[0]); */ /* 125: 1a0111ea397fe69a4b1ba7b64 */\ +/* sqr(t[0], t[0]); */ /* 126: 340223d472ffcd3496374f6c8 */\ +/* sqr(t[0], t[0]); */ /* 127: 680447a8e5ff9a692c6e9ed90 */\ +/* sqr(t[0], t[0]); */ /* 128: d0088f51cbff34d258dd3db20 */\ +/* sqr(t[0], t[0]); */ /* 129: 
1a0111ea397fe69a4b1ba7b640 */\ +/* sqr(t[0], t[0]); */ /* 130: 340223d472ffcd3496374f6c80 */\ +/* sqr(t[0], t[0]); */ /* 131: 680447a8e5ff9a692c6e9ed900 */\ +sqr_n_mul(t[0], t[0], 8, t[3]); /* 132: 680447a8e5ff9a692c6e9ed90d */\ +/* sqr(t[0], t[0]); */ /* 133: d0088f51cbff34d258dd3db21a */\ +/* sqr(t[0], t[0]); */ /* 134: 1a0111ea397fe69a4b1ba7b6434 */\ +/* sqr(t[0], t[0]); */ /* 135: 340223d472ffcd3496374f6c868 */\ +/* sqr(t[0], t[0]); */ /* 136: 680447a8e5ff9a692c6e9ed90d0 */\ +/* sqr(t[0], t[0]); */ /* 137: d0088f51cbff34d258dd3db21a0 */\ +/* sqr(t[0], t[0]); */ /* 138: 1a0111ea397fe69a4b1ba7b64340 */\ +/* sqr(t[0], t[0]); */ /* 139: 340223d472ffcd3496374f6c8680 */\ +sqr_n_mul(t[0], t[0], 7, t[11]); /* 140: 340223d472ffcd3496374f6c8697 */\ +/* sqr(t[0], t[0]); */ /* 141: 680447a8e5ff9a692c6e9ed90d2e */\ +/* sqr(t[0], t[0]); */ /* 142: d0088f51cbff34d258dd3db21a5c */\ +/* sqr(t[0], t[0]); */ /* 143: 1a0111ea397fe69a4b1ba7b6434b8 */\ +/* sqr(t[0], t[0]); */ /* 144: 340223d472ffcd3496374f6c86970 */\ +/* sqr(t[0], t[0]); */ /* 145: 680447a8e5ff9a692c6e9ed90d2e0 */\ +sqr_n_mul(t[0], t[0], 5, t[12]); /* 146: 680447a8e5ff9a692c6e9ed90d2eb */\ +/* sqr(t[0], t[0]); */ /* 147: d0088f51cbff34d258dd3db21a5d6 */\ +/* sqr(t[0], t[0]); */ /* 148: 1a0111ea397fe69a4b1ba7b6434bac */\ +/* sqr(t[0], t[0]); */ /* 149: 340223d472ffcd3496374f6c869758 */\ +/* sqr(t[0], t[0]); */ /* 150: 680447a8e5ff9a692c6e9ed90d2eb0 */\ +/* sqr(t[0], t[0]); */ /* 151: d0088f51cbff34d258dd3db21a5d60 */\ +/* sqr(t[0], t[0]); */ /* 152: 1a0111ea397fe69a4b1ba7b6434bac0 */\ +sqr_n_mul(t[0], t[0], 6, t[3]); /* 153: 1a0111ea397fe69a4b1ba7b6434bacd */\ +/* sqr(t[0], t[0]); */ /* 154: 340223d472ffcd3496374f6c869759a */\ +/* sqr(t[0], t[0]); */ /* 155: 680447a8e5ff9a692c6e9ed90d2eb34 */\ +/* sqr(t[0], t[0]); */ /* 156: d0088f51cbff34d258dd3db21a5d668 */\ +/* sqr(t[0], t[0]); */ /* 157: 1a0111ea397fe69a4b1ba7b6434bacd0 */\ +/* sqr(t[0], t[0]); */ /* 158: 340223d472ffcd3496374f6c869759a0 */\ +/* sqr(t[0], t[0]); */ /* 159: 680447a8e5ff9a692c6e9ed90d2eb340 */\ +sqr_n_mul(t[0], t[0], 6, t[5]); /* 160: 680447a8e5ff9a692c6e9ed90d2eb35d */\ +/* sqr(t[0], t[0]); */ /* 161: d0088f51cbff34d258dd3db21a5d66ba */\ +/* sqr(t[0], t[0]); */ /* 162: 1a0111ea397fe69a4b1ba7b6434bacd74 */\ +/* sqr(t[0], t[0]); */ /* 163: 340223d472ffcd3496374f6c869759ae8 */\ +/* sqr(t[0], t[0]); */ /* 164: 680447a8e5ff9a692c6e9ed90d2eb35d0 */\ +sqr_n_mul(t[0], t[0], 4, t[9]); /* 165: 680447a8e5ff9a692c6e9ed90d2eb35d9 */\ +/* sqr(t[0], t[0]); */ /* 166: d0088f51cbff34d258dd3db21a5d66bb2 */\ +/* sqr(t[0], t[0]); */ /* 167: 1a0111ea397fe69a4b1ba7b6434bacd764 */\ +/* sqr(t[0], t[0]); */ /* 168: 340223d472ffcd3496374f6c869759aec8 */\ +/* sqr(t[0], t[0]); */ /* 169: 680447a8e5ff9a692c6e9ed90d2eb35d90 */\ +/* sqr(t[0], t[0]); */ /* 170: d0088f51cbff34d258dd3db21a5d66bb20 */\ +/* sqr(t[0], t[0]); */ /* 171: 1a0111ea397fe69a4b1ba7b6434bacd7640 */\ +/* sqr(t[0], t[0]); */ /* 172: 340223d472ffcd3496374f6c869759aec80 */\ +/* sqr(t[0], t[0]); */ /* 173: 680447a8e5ff9a692c6e9ed90d2eb35d900 */\ +sqr_n_mul(t[0], t[0], 8, t[5]); /* 174: 680447a8e5ff9a692c6e9ed90d2eb35d91d */\ +/* sqr(t[0], t[0]); */ /* 175: d0088f51cbff34d258dd3db21a5d66bb23a */\ +/* sqr(t[0], t[0]); */ /* 176: 1a0111ea397fe69a4b1ba7b6434bacd76474 */\ +/* sqr(t[0], t[0]); */ /* 177: 340223d472ffcd3496374f6c869759aec8e8 */\ +/* sqr(t[0], t[0]); */ /* 178: 680447a8e5ff9a692c6e9ed90d2eb35d91d0 */\ +sqr_n_mul(t[0], t[0], 4, t[3]); /* 179: 680447a8e5ff9a692c6e9ed90d2eb35d91dd */\ +/* sqr(t[0], t[0]); */ /* 180: 
d0088f51cbff34d258dd3db21a5d66bb23ba */\ +/* sqr(t[0], t[0]); */ /* 181: 1a0111ea397fe69a4b1ba7b6434bacd764774 */\ +/* sqr(t[0], t[0]); */ /* 182: 340223d472ffcd3496374f6c869759aec8ee8 */\ +/* sqr(t[0], t[0]); */ /* 183: 680447a8e5ff9a692c6e9ed90d2eb35d91dd0 */\ +/* sqr(t[0], t[0]); */ /* 184: d0088f51cbff34d258dd3db21a5d66bb23ba0 */\ +/* sqr(t[0], t[0]); */ /* 185: 1a0111ea397fe69a4b1ba7b6434bacd7647740 */\ +/* sqr(t[0], t[0]); */ /* 186: 340223d472ffcd3496374f6c869759aec8ee80 */\ +sqr_n_mul(t[0], t[0], 7, t[11]); /* 187: 340223d472ffcd3496374f6c869759aec8ee97 */\ +/* sqr(t[0], t[0]); */ /* 188: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e */\ +/* sqr(t[0], t[0]); */ /* 189: d0088f51cbff34d258dd3db21a5d66bb23ba5c */\ +/* sqr(t[0], t[0]); */ /* 190: 1a0111ea397fe69a4b1ba7b6434bacd764774b8 */\ +/* sqr(t[0], t[0]); */ /* 191: 340223d472ffcd3496374f6c869759aec8ee970 */\ +/* sqr(t[0], t[0]); */ /* 192: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e0 */\ +/* sqr(t[0], t[0]); */ /* 193: d0088f51cbff34d258dd3db21a5d66bb23ba5c0 */\ +/* sqr(t[0], t[0]); */ /* 194: 1a0111ea397fe69a4b1ba7b6434bacd764774b80 */\ +/* sqr(t[0], t[0]); */ /* 195: 340223d472ffcd3496374f6c869759aec8ee9700 */\ +/* sqr(t[0], t[0]); */ /* 196: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e00 */\ +sqr_n_mul(t[0], t[0], 9, t[10]); /* 197: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13 */\ +/* sqr(t[0], t[0]); */ /* 198: d0088f51cbff34d258dd3db21a5d66bb23ba5c26 */\ +/* sqr(t[0], t[0]); */ /* 199: 1a0111ea397fe69a4b1ba7b6434bacd764774b84c */\ +sqr_n_mul(t[0], t[0], 2, t[8]); /* 200: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f */\ +/* sqr(t[0], t[0]); */ /* 201: 340223d472ffcd3496374f6c869759aec8ee9709e */\ +/* sqr(t[0], t[0]); */ /* 202: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13c */\ +/* sqr(t[0], t[0]); */ /* 203: d0088f51cbff34d258dd3db21a5d66bb23ba5c278 */\ +/* sqr(t[0], t[0]); */ /* 204: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f0 */\ +/* sqr(t[0], t[0]); */ /* 205: 340223d472ffcd3496374f6c869759aec8ee9709e0 */\ +sqr_n_mul(t[0], t[0], 5, t[6]); /* 206: 340223d472ffcd3496374f6c869759aec8ee9709e7 */\ +/* sqr(t[0], t[0]); */ /* 207: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce */\ +/* sqr(t[0], t[0]); */ /* 208: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c */\ +/* sqr(t[0], t[0]); */ /* 209: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38 */\ +/* sqr(t[0], t[0]); */ /* 210: 340223d472ffcd3496374f6c869759aec8ee9709e70 */\ +/* sqr(t[0], t[0]); */ /* 211: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce0 */\ +/* sqr(t[0], t[0]); */ /* 212: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c0 */\ +/* sqr(t[0], t[0]); */ /* 213: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f380 */\ +sqr_n_mul(t[0], t[0], 7, t[1]); /* 214: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f385 */\ +/* sqr(t[0], t[0]); */ /* 215: 340223d472ffcd3496374f6c869759aec8ee9709e70a */\ +/* sqr(t[0], t[0]); */ /* 216: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce14 */\ +/* sqr(t[0], t[0]); */ /* 217: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c28 */\ +/* sqr(t[0], t[0]); */ /* 218: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f3850 */\ +/* sqr(t[0], t[0]); */ /* 219: 340223d472ffcd3496374f6c869759aec8ee9709e70a0 */\ +/* sqr(t[0], t[0]); */ /* 220: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce140 */\ +/* sqr(t[0], t[0]); */ /* 221: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c280 */\ +sqr_n_mul(t[0], t[0], 7, t[9]); /* 222: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c289 */\ +/* sqr(t[0], t[0]); */ /* 223: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512 */\ +/* sqr(t[0], t[0]); */ /* 224: 340223d472ffcd3496374f6c869759aec8ee9709e70a24 */\ +/* 
sqr(t[0], t[0]); */ /* 225: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce1448 */\ +/* sqr(t[0], t[0]); */ /* 226: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2890 */\ +/* sqr(t[0], t[0]); */ /* 227: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f385120 */\ +/* sqr(t[0], t[0]); */ /* 228: 340223d472ffcd3496374f6c869759aec8ee9709e70a240 */\ +sqr_n_mul(t[0], t[0], 6, t[11]); /* 229: 340223d472ffcd3496374f6c869759aec8ee9709e70a257 */\ +/* sqr(t[0], t[0]); */ /* 230: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144ae */\ +/* sqr(t[0], t[0]); */ /* 231: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895c */\ +/* sqr(t[0], t[0]); */ /* 232: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512b8 */\ +/* sqr(t[0], t[0]); */ /* 233: 340223d472ffcd3496374f6c869759aec8ee9709e70a2570 */\ +/* sqr(t[0], t[0]); */ /* 234: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144ae0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 235: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd */\ +/* sqr(t[0], t[0]); */ /* 236: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fa */\ +/* sqr(t[0], t[0]); */ /* 237: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf4 */\ +/* sqr(t[0], t[0]); */ /* 238: 340223d472ffcd3496374f6c869759aec8ee9709e70a257e8 */\ +/* sqr(t[0], t[0]); */ /* 239: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd0 */\ +/* sqr(t[0], t[0]); */ /* 240: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fa0 */\ +sqr_n_mul(t[0], t[0], 5, t[10]); /* 241: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3 */\ +/* sqr(t[0], t[0]); */ /* 242: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf66 */\ +/* sqr(t[0], t[0]); */ /* 243: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ecc */\ +/* sqr(t[0], t[0]); */ /* 244: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd98 */\ +/* sqr(t[0], t[0]); */ /* 245: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb30 */\ +/* sqr(t[0], t[0]); */ /* 246: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf660 */\ +sqr_n_mul(t[0], t[0], 5, t[10]); /* 247: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf673 */\ +/* sqr(t[0], t[0]); */ /* 248: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece6 */\ +/* sqr(t[0], t[0]); */ /* 249: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc */\ +/* sqr(t[0], t[0]); */ /* 250: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398 */\ +/* sqr(t[0], t[0]); */ /* 251: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730 */\ +/* sqr(t[0], t[0]); */ /* 252: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece60 */\ +/* sqr(t[0], t[0]); */ /* 253: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc0 */\ +/* sqr(t[0], t[0]); */ /* 254: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3980 */\ +/* sqr(t[0], t[0]); */ /* 255: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf67300 */\ +sqr_n_mul(t[0], t[0], 8, t[3]); /* 256: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d */\ +/* sqr(t[0], t[0]); */ /* 257: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a */\ +/* sqr(t[0], t[0]); */ /* 258: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34 */\ +/* sqr(t[0], t[0]); */ /* 259: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39868 */\ +/* sqr(t[0], t[0]); */ /* 260: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d0 */\ +/* sqr(t[0], t[0]); */ /* 261: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a0 */\ +/* sqr(t[0], t[0]); */ /* 262: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc340 */\ +/* sqr(t[0], t[0]); */ /* 263: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398680 */\ +sqr_n_mul(t[0], t[0], 7, t[2]); /* 264: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398695 */\ +/* 
sqr(t[0], t[0]); */ /* 265: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a */\ +/* sqr(t[0], t[0]); */ /* 266: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a54 */\ +/* sqr(t[0], t[0]); */ /* 267: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a8 */\ +/* sqr(t[0], t[0]); */ /* 268: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3986950 */\ +/* sqr(t[0], t[0]); */ /* 269: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0 */\ +/* sqr(t[0], t[0]); */ /* 270: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a540 */\ +/* sqr(t[0], t[0]); */ /* 271: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a80 */\ +/* sqr(t[0], t[0]); */ /* 272: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869500 */\ +/* sqr(t[0], t[0]); */ /* 273: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a00 */\ +sqr_n_mul(t[0], t[0], 9, t[7]); /* 274: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f */\ +/* sqr(t[0], t[0]); */ /* 275: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541e */\ +/* sqr(t[0], t[0]); */ /* 276: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83c */\ +/* sqr(t[0], t[0]); */ /* 277: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398695078 */\ +/* sqr(t[0], t[0]); */ /* 278: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f0 */\ +/* sqr(t[0], t[0]); */ /* 279: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541e0 */\ +sqr_n_mul(t[0], t[0], 5, t[3]); /* 280: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed */\ +/* sqr(t[0], t[0]); */ /* 281: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83da */\ +/* sqr(t[0], t[0]); */ /* 282: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b4 */\ +/* sqr(t[0], t[0]); */ /* 283: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f68 */\ +sqr_n_mul(t[0], t[0], 3, t[8]); /* 284: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b */\ +/* sqr(t[0], t[0]); */ /* 285: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed6 */\ +/* sqr(t[0], t[0]); */ /* 286: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac */\ +/* sqr(t[0], t[0]); */ /* 287: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b58 */\ +/* sqr(t[0], t[0]); */ /* 288: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0 */\ +/* sqr(t[0], t[0]); */ /* 289: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed60 */\ +/* sqr(t[0], t[0]); */ /* 290: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac0 */\ +/* sqr(t[0], t[0]); */ /* 291: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b580 */\ +/* sqr(t[0], t[0]); */ /* 292: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b00 */\ +sqr_n_mul(t[0], t[0], 8, t[7]); /* 293: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f */\ +/* sqr(t[0], t[0]); */ /* 294: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61e */\ +/* sqr(t[0], t[0]); */ /* 295: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3c */\ +/* sqr(t[0], t[0]); */ /* 296: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b5878 */\ +sqr_n_mul(t[0], t[0], 3, t[8]); /* 297: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b */\ +/* sqr(t[0], t[0]); */ /* 298: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6 */\ +/* sqr(t[0], t[0]); */ /* 299: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec */\ +/* sqr(t[0], t[0]); */ /* 300: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8 */\ +/* sqr(t[0], t[0]); */ /* 301: 
d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b0 */\ +/* sqr(t[0], t[0]); */ /* 302: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f60 */\ +/* sqr(t[0], t[0]); */ /* 303: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec0 */\ +/* sqr(t[0], t[0]); */ /* 304: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d80 */\ +sqr_n_mul(t[0], t[0], 7, t[9]); /* 305: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d89 */\ +/* sqr(t[0], t[0]); */ /* 306: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b12 */\ +/* sqr(t[0], t[0]); */ /* 307: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f624 */\ +/* sqr(t[0], t[0]); */ /* 308: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec48 */\ +/* sqr(t[0], t[0]); */ /* 309: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d890 */\ +/* sqr(t[0], t[0]); */ /* 310: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120 */\ +/* sqr(t[0], t[0]); */ /* 311: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6240 */\ +/* sqr(t[0], t[0]); */ /* 312: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec480 */\ +/* sqr(t[0], t[0]); */ /* 313: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8900 */\ +/* sqr(t[0], t[0]); */ /* 314: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b1200 */\ +sqr_n_mul(t[0], t[0], 9, t[7]); /* 315: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f */\ +/* sqr(t[0], t[0]); */ /* 316: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241e */\ +/* sqr(t[0], t[0]); */ /* 317: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483c */\ +/* sqr(t[0], t[0]); */ /* 318: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d89078 */\ +/* sqr(t[0], t[0]); */ /* 319: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f0 */\ +/* sqr(t[0], t[0]); */ /* 320: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241e0 */\ +/* sqr(t[0], t[0]); */ /* 321: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483c0 */\ +sqr_n_mul(t[0], t[0], 6, t[2]); /* 322: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d5 */\ +/* sqr(t[0], t[0]); */ /* 323: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aa */\ +/* sqr(t[0], t[0]); */ /* 324: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f54 */\ +/* sqr(t[0], t[0]); */ /* 325: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241ea8 */\ +/* sqr(t[0], t[0]); */ /* 326: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d50 */\ +/* sqr(t[0], t[0]); */ /* 327: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aa0 */\ +/* sqr(t[0], t[0]); */ /* 328: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f540 */\ +sqr_n_mul(t[0], t[0], 6, t[4]); /* 329: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55f */\ +/* sqr(t[0], t[0]); */ /* 330: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabe */\ +/* sqr(t[0], t[0]); */ /* 331: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57c */\ +/* sqr(t[0], t[0]); */ /* 332: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaf8 */\ +/* sqr(t[0], t[0]); */ /* 333: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55f0 */\ +/* sqr(t[0], t[0]); */ /* 334: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabe0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 335: 
1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabff */\ +/* sqr(t[0], t[0]); */ /* 336: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fe */\ +/* sqr(t[0], t[0]); */ /* 337: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffc */\ +/* sqr(t[0], t[0]); */ /* 338: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ff8 */\ +/* sqr(t[0], t[0]); */ /* 339: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabff0 */\ +/* sqr(t[0], t[0]); */ /* 340: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fe0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 341: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fff */\ +/* sqr(t[0], t[0]); */ /* 342: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aafffe */\ +/* sqr(t[0], t[0]); */ /* 343: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55fffc */\ +/* sqr(t[0], t[0]); */ /* 344: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfff8 */\ +/* sqr(t[0], t[0]); */ /* 345: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fff0 */\ +sqr_n_mul(t[0], t[0], 4, t[3]); /* 346: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd */\ +/* sqr(t[0], t[0]); */ /* 347: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffa */\ +/* sqr(t[0], t[0]); */ /* 348: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff4 */\ +/* sqr(t[0], t[0]); */ /* 349: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffe8 */\ +sqr_n_mul(t[0], t[0], 3, t[8]); /* 350: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb */\ +/* sqr(t[0], t[0]); */ /* 351: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd6 */\ +/* sqr(t[0], t[0]); */ /* 352: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac */\ +/* sqr(t[0], t[0]); */ /* 353: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58 */\ +/* sqr(t[0], t[0]); */ /* 354: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb0 */\ +/* sqr(t[0], t[0]); */ /* 355: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd60 */\ +/* sqr(t[0], t[0]); */ /* 356: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac0 */\ +/* sqr(t[0], t[0]); */ /* 357: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff580 */\ +/* sqr(t[0], t[0]); */ /* 358: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb00 */\ +sqr_n_mul(t[0], t[0], 8, t[2]); /* 359: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb15 */\ +/* sqr(t[0], t[0]); */ /* 360: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a */\ +/* sqr(t[0], t[0]); */ /* 361: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54 */\ +/* sqr(t[0], t[0]); */ /* 362: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a8 */\ +/* sqr(t[0], t[0]); */ /* 363: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb150 */\ +/* sqr(t[0], t[0]); */ /* 364: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a0 */\ +/* sqr(t[0], t[0]); */ /* 365: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac540 */\ +/* sqr(t[0], t[0]); */ /* 366: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a80 */\ +sqr_n_mul(t[0], t[0], 7, 
t[4]); /* 367: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9f */\ +/* sqr(t[0], t[0]); */ /* 368: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153e */\ +/* sqr(t[0], t[0]); */ /* 369: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7c */\ +/* sqr(t[0], t[0]); */ /* 370: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54f8 */\ +/* sqr(t[0], t[0]); */ /* 371: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9f0 */\ +/* sqr(t[0], t[0]); */ /* 372: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153e0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 373: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ff */\ +/* sqr(t[0], t[0]); */ /* 374: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fe */\ +/* sqr(t[0], t[0]); */ /* 375: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffc */\ +/* sqr(t[0], t[0]); */ /* 376: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ff8 */\ +/* sqr(t[0], t[0]); */ /* 377: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ff0 */\ +/* sqr(t[0], t[0]); */ /* 378: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fe0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 379: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fff */\ +/* sqr(t[0], t[0]); */ /* 380: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54fffe */\ +/* sqr(t[0], t[0]); */ /* 381: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9fffc */\ +/* sqr(t[0], t[0]); */ /* 382: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153fff8 */\ +/* sqr(t[0], t[0]); */ /* 383: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fff0 */\ +sqr_n_mul(t[0], t[0], 4, t[7]); /* 384: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff */\ +/* sqr(t[0], t[0]); */ /* 385: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffe */\ +/* sqr(t[0], t[0]); */ /* 386: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffc */\ +/* sqr(t[0], t[0]); */ /* 387: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffff8 */\ +/* sqr(t[0], t[0]); */ /* 388: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff0 */\ +sqr_n_mul(t[0], t[0], 4, t[6]); /* 389: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff7 */\ +/* sqr(t[0], t[0]); */ /* 390: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee */\ +/* sqr(t[0], t[0]); */ /* 391: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdc */\ +/* sqr(t[0], t[0]); */ /* 392: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb8 */\ +/* sqr(t[0], t[0]); */ /* 393: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff70 */\ +/* sqr(t[0], t[0]); */ /* 394: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee0 */\ +/* sqr(t[0], t[0]); */ /* 395: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdc0 */\ +/* sqr(t[0], t[0]); */ /* 396: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb80 */\ +sqr_n_mul(t[0], t[0], 7, t[4]); /* 397: 
1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9f */\ +/* sqr(t[0], t[0]); */ /* 398: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73e */\ +/* sqr(t[0], t[0]); */ /* 399: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7c */\ +/* sqr(t[0], t[0]); */ /* 400: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcf8 */\ +/* sqr(t[0], t[0]); */ /* 401: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9f0 */\ +/* sqr(t[0], t[0]); */ /* 402: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73e0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 403: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fd */\ +/* sqr(t[0], t[0]); */ /* 404: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fa */\ +/* sqr(t[0], t[0]); */ /* 405: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff4 */\ +/* sqr(t[0], t[0]); */ /* 406: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fe8 */\ +/* sqr(t[0], t[0]); */ /* 407: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fd0 */\ +/* sqr(t[0], t[0]); */ /* 408: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fa0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 409: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbf */\ +/* sqr(t[0], t[0]); */ /* 410: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7e */\ +/* sqr(t[0], t[0]); */ /* 411: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefc */\ +/* sqr(t[0], t[0]); */ /* 412: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdf8 */\ +/* sqr(t[0], t[0]); */ /* 413: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbf0 */\ +/* sqr(t[0], t[0]); */ /* 414: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7e0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 415: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ff */\ +/* sqr(t[0], t[0]); */ /* 416: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffe */\ +/* sqr(t[0], t[0]); */ /* 417: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffc */\ +/* sqr(t[0], t[0]); */ /* 418: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbff8 */\ +/* sqr(t[0], t[0]); */ /* 419: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ff0 */\ +/* sqr(t[0], t[0]); */ /* 420: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffe0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 421: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffff */\ +/* sqr(t[0], t[0]); */ /* 422: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffe */\ +/* sqr(t[0], t[0]); */ /* 423: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffc */\ +/* sqr(t[0], t[0]); */ /* 424: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fff8 */\ +/* sqr(t[0], t[0]); */ /* 425: 
1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffff0 */\ +/* sqr(t[0], t[0]); */ /* 426: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffe0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 427: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffff */\ +/* sqr(t[0], t[0]); */ /* 428: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffe */\ +/* sqr(t[0], t[0]); */ /* 429: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ffffc */\ +/* sqr(t[0], t[0]); */ /* 430: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefffff8 */\ +/* sqr(t[0], t[0]); */ /* 431: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffff0 */\ +/* sqr(t[0], t[0]); */ /* 432: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffe0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 433: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffff */\ +/* sqr(t[0], t[0]); */ /* 434: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffe */\ +/* sqr(t[0], t[0]); */ /* 435: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffc */\ +/* sqr(t[0], t[0]); */ /* 436: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffff8 */\ +/* sqr(t[0], t[0]); */ /* 437: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffff0 */\ +/* sqr(t[0], t[0]); */ /* 438: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffe0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 439: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffff */\ +/* sqr(t[0], t[0]); */ /* 440: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefffffffe */\ +/* sqr(t[0], t[0]); */ /* 441: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffffffc */\ +/* sqr(t[0], t[0]); */ /* 442: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffff8 */\ +/* sqr(t[0], t[0]); */ /* 443: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffff0 */\ +sqr_n_mul(t[0], t[0], 4, t[3]); /* 444: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd */\ +/* sqr(t[0], t[0]); */ /* 445: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffa */\ +/* sqr(t[0], t[0]); */ /* 446: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff4 */\ +/* sqr(t[0], t[0]); */ /* 447: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffe8 */\ +/* sqr(t[0], t[0]); */ /* 448: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd0 */\ +/* sqr(t[0], t[0]); */ /* 449: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffa0 */\ +/* sqr(t[0], t[0]); */ /* 450: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff40 */\ +sqr_n_mul(t[0], t[0], 6, t[2]); /* 451: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff55 */\ +/* sqr(t[0], t[0]); */ /* 452: 
680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffeaa */\ +/* sqr(t[0], t[0]); */ /* 453: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd54 */\ +/* sqr(t[0], t[0]); */ /* 454: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaa8 */\ +/* sqr(t[0], t[0]); */ /* 455: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff550 */\ +sqr_n_mul(t[0], t[0], 4, t[1]); /* 456: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff555 */\ +sqr(out, t[0]); /* 457: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffeaaa */\ +} while(0) diff --git a/src/sqrt2-addchain.h b/src/sqrt2-addchain.h new file mode 100644 index 00000000..4adc627b --- /dev/null +++ b/src/sqrt2-addchain.h @@ -0,0 +1,922 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +/* + * The "magic" number is (BLS12_381_P^2-9)/16. Exponentiation to which + * yields a candidate value for reciprocal of sqrt(x). + * + * Generated with 'addchain 1001205140483106588246484290269935788605945006208159541241399033561623546780709821462541004956387089373434649096260670658193992783731681621012512651314777238193313314641988297376025498093520728838658813979860931248214124593092835' + * https://github.com/kwantam/addchain + * + * # Bos-Coster (win=6) : 896 (46) + * # Bos-Coster (win=5) : 898 (33) + * # Bos-Coster (win=9) : 905 (66) + * # Bos-Coster (win=8) : 901 (64) + * # Bos-Coster (win=4) : 895 (17) <<< + * # Bos-Coster (win=3) : 909 ( 9) + * # Bos-Coster (win=7) : 900 (62) + * # Yacobi : 918 (34) + * # Bos-Coster (win=10) : 911 (62) + * # Bos-Coster (win=2) : 945 ( 5) + * # Bergeron-Berstel-Brlek-Duboc : 982 ( 5) + */ +#define RECIP_SQRT_MOD_BLS12_381_P2(out, inp, ptype) do { \ +ptype t[17]; \ +vec_copy(t[7], inp, sizeof(ptype)); /* 0: 1 */\ +sqr(t[0], t[7]); /* 1: 2 */\ +mul(t[1], t[0], t[7]); /* 2: 3 */\ +mul(t[15], t[1], t[0]); /* 3: 5 */\ +mul(t[2], t[15], t[0]); /* 4: 7 */\ +mul(t[14], t[2], t[0]); /* 5: 9 */\ +mul(t[13], t[14], t[0]); /* 6: b */\ +mul(t[4], t[13], t[0]); /* 7: d */\ +mul(t[10], t[4], t[0]); /* 8: f */\ +mul(t[9], t[10], t[0]); /* 9: 11 */\ +mul(t[16], t[9], t[0]); /* 10: 13 */\ +mul(t[3], t[16], t[0]); /* 11: 15 */\ +mul(t[6], t[3], t[0]); /* 12: 17 */\ +mul(t[5], t[6], t[0]); /* 13: 19 */\ +mul(t[12], t[5], t[0]); /* 14: 1b */\ +mul(t[8], t[12], t[0]); /* 15: 1d */\ +mul(t[11], t[8], t[0]); /* 16: 1f */\ +/* sqr(t[0], t[3]); */ /* 17: 2a */\ +/* sqr(t[0], t[0]); */ /* 18: 54 */\ +/* sqr(t[0], t[0]); */ /* 19: a8 */\ +sqr_n_mul(t[0], t[3], 3, t[7]); /* 20: a9 */\ +/* sqr(t[0], t[0]); */ /* 21: 152 */\ +/* sqr(t[0], t[0]); */ /* 22: 2a4 */\ +/* sqr(t[0], t[0]); */ /* 23: 548 */\ +/* sqr(t[0], t[0]); */ /* 24: a90 */\ +/* sqr(t[0], t[0]); */ /* 25: 1520 */\ +/* sqr(t[0], t[0]); */ /* 26: 2a40 */\ +/* sqr(t[0], t[0]); */ /* 27: 5480 */\ +/* sqr(t[0], t[0]); */ /* 28: a900 */\ +/* sqr(t[0], t[0]); */ /* 29: 15200 */\ +sqr_n_mul(t[0], t[0], 9, t[12]); /* 30: 1521b */\ +/* sqr(t[0], t[0]); */ /* 31: 2a436 */\ +/* sqr(t[0], t[0]); */ /* 32: 5486c */\ +/* sqr(t[0], t[0]); */ /* 33: a90d8 */\ +/* sqr(t[0], t[0]); */ /* 34: 1521b0 */\ +sqr_n_mul(t[0], t[0], 4, t[4]); /* 35: 1521bd */\ +/* sqr(t[0], t[0]); */ /* 36: 2a437a */\ +/* sqr(t[0], t[0]); */ /* 37: 5486f4 */\ +/* sqr(t[0], t[0]); */ 
/* 38: a90de8 */\ +/* sqr(t[0], t[0]); */ /* 39: 1521bd0 */\ +/* sqr(t[0], t[0]); */ /* 40: 2a437a0 */\ +/* sqr(t[0], t[0]); */ /* 41: 5486f40 */\ +sqr_n_mul(t[0], t[0], 6, t[14]); /* 42: 5486f49 */\ +/* sqr(t[0], t[0]); */ /* 43: a90de92 */\ +/* sqr(t[0], t[0]); */ /* 44: 1521bd24 */\ +/* sqr(t[0], t[0]); */ /* 45: 2a437a48 */\ +/* sqr(t[0], t[0]); */ /* 46: 5486f490 */\ +sqr_n_mul(t[0], t[0], 4, t[2]); /* 47: 5486f497 */\ +/* sqr(t[0], t[0]); */ /* 48: a90de92e */\ +/* sqr(t[0], t[0]); */ /* 49: 1521bd25c */\ +/* sqr(t[0], t[0]); */ /* 50: 2a437a4b8 */\ +/* sqr(t[0], t[0]); */ /* 51: 5486f4970 */\ +/* sqr(t[0], t[0]); */ /* 52: a90de92e0 */\ +sqr_n_mul(t[0], t[0], 5, t[1]); /* 53: a90de92e3 */\ +/* sqr(t[0], t[0]); */ /* 54: 1521bd25c6 */\ +/* sqr(t[0], t[0]); */ /* 55: 2a437a4b8c */\ +/* sqr(t[0], t[0]); */ /* 56: 5486f49718 */\ +/* sqr(t[0], t[0]); */ /* 57: a90de92e30 */\ +/* sqr(t[0], t[0]); */ /* 58: 1521bd25c60 */\ +/* sqr(t[0], t[0]); */ /* 59: 2a437a4b8c0 */\ +/* sqr(t[0], t[0]); */ /* 60: 5486f497180 */\ +/* sqr(t[0], t[0]); */ /* 61: a90de92e300 */\ +sqr_n_mul(t[0], t[0], 8, t[4]); /* 62: a90de92e30d */\ +/* sqr(t[0], t[0]); */ /* 63: 1521bd25c61a */\ +/* sqr(t[0], t[0]); */ /* 64: 2a437a4b8c34 */\ +/* sqr(t[0], t[0]); */ /* 65: 5486f4971868 */\ +/* sqr(t[0], t[0]); */ /* 66: a90de92e30d0 */\ +sqr_n_mul(t[0], t[0], 4, t[2]); /* 67: a90de92e30d7 */\ +/* sqr(t[0], t[0]); */ /* 68: 1521bd25c61ae */\ +/* sqr(t[0], t[0]); */ /* 69: 2a437a4b8c35c */\ +/* sqr(t[0], t[0]); */ /* 70: 5486f497186b8 */\ +/* sqr(t[0], t[0]); */ /* 71: a90de92e30d70 */\ +sqr_n_mul(t[0], t[0], 4, t[10]); /* 72: a90de92e30d7f */\ +/* sqr(t[0], t[0]); */ /* 73: 1521bd25c61afe */\ +/* sqr(t[0], t[0]); */ /* 74: 2a437a4b8c35fc */\ +/* sqr(t[0], t[0]); */ /* 75: 5486f497186bf8 */\ +/* sqr(t[0], t[0]); */ /* 76: a90de92e30d7f0 */\ +/* sqr(t[0], t[0]); */ /* 77: 1521bd25c61afe0 */\ +/* sqr(t[0], t[0]); */ /* 78: 2a437a4b8c35fc0 */\ +/* sqr(t[0], t[0]); */ /* 79: 5486f497186bf80 */\ +/* sqr(t[0], t[0]); */ /* 80: a90de92e30d7f00 */\ +sqr_n_mul(t[0], t[0], 8, t[8]); /* 81: a90de92e30d7f1d */\ +/* sqr(t[0], t[0]); */ /* 82: 1521bd25c61afe3a */\ +/* sqr(t[0], t[0]); */ /* 83: 2a437a4b8c35fc74 */\ +/* sqr(t[0], t[0]); */ /* 84: 5486f497186bf8e8 */\ +/* sqr(t[0], t[0]); */ /* 85: a90de92e30d7f1d0 */\ +/* sqr(t[0], t[0]); */ /* 86: 1521bd25c61afe3a0 */\ +/* sqr(t[0], t[0]); */ /* 87: 2a437a4b8c35fc740 */\ +sqr_n_mul(t[0], t[0], 6, t[13]); /* 88: 2a437a4b8c35fc74b */\ +/* sqr(t[0], t[0]); */ /* 89: 5486f497186bf8e96 */\ +/* sqr(t[0], t[0]); */ /* 90: a90de92e30d7f1d2c */\ +/* sqr(t[0], t[0]); */ /* 91: 1521bd25c61afe3a58 */\ +/* sqr(t[0], t[0]); */ /* 92: 2a437a4b8c35fc74b0 */\ +sqr_n_mul(t[0], t[0], 4, t[4]); /* 93: 2a437a4b8c35fc74bd */\ +/* sqr(t[0], t[0]); */ /* 94: 5486f497186bf8e97a */\ +/* sqr(t[0], t[0]); */ /* 95: a90de92e30d7f1d2f4 */\ +/* sqr(t[0], t[0]); */ /* 96: 1521bd25c61afe3a5e8 */\ +sqr_n_mul(t[0], t[0], 3, t[7]); /* 97: 1521bd25c61afe3a5e9 */\ +/* sqr(t[0], t[0]); */ /* 98: 2a437a4b8c35fc74bd2 */\ +/* sqr(t[0], t[0]); */ /* 99: 5486f497186bf8e97a4 */\ +/* sqr(t[0], t[0]); */ /* 100: a90de92e30d7f1d2f48 */\ +/* sqr(t[0], t[0]); */ /* 101: 1521bd25c61afe3a5e90 */\ +/* sqr(t[0], t[0]); */ /* 102: 2a437a4b8c35fc74bd20 */\ +/* sqr(t[0], t[0]); */ /* 103: 5486f497186bf8e97a40 */\ +sqr_n_mul(t[0], t[0], 6, t[10]); /* 104: 5486f497186bf8e97a4f */\ +/* sqr(t[0], t[0]); */ /* 105: a90de92e30d7f1d2f49e */\ +/* sqr(t[0], t[0]); */ /* 106: 1521bd25c61afe3a5e93c */\ +/* sqr(t[0], t[0]); */ /* 107: 
2a437a4b8c35fc74bd278 */\ +/* sqr(t[0], t[0]); */ /* 108: 5486f497186bf8e97a4f0 */\ +/* sqr(t[0], t[0]); */ /* 109: a90de92e30d7f1d2f49e0 */\ +/* sqr(t[0], t[0]); */ /* 110: 1521bd25c61afe3a5e93c0 */\ +/* sqr(t[0], t[0]); */ /* 111: 2a437a4b8c35fc74bd2780 */\ +/* sqr(t[0], t[0]); */ /* 112: 5486f497186bf8e97a4f00 */\ +sqr_n_mul(t[0], t[0], 8, t[8]); /* 113: 5486f497186bf8e97a4f1d */\ +/* sqr(t[0], t[0]); */ /* 114: a90de92e30d7f1d2f49e3a */\ +/* sqr(t[0], t[0]); */ /* 115: 1521bd25c61afe3a5e93c74 */\ +/* sqr(t[0], t[0]); */ /* 116: 2a437a4b8c35fc74bd278e8 */\ +/* sqr(t[0], t[0]); */ /* 117: 5486f497186bf8e97a4f1d0 */\ +/* sqr(t[0], t[0]); */ /* 118: a90de92e30d7f1d2f49e3a0 */\ +/* sqr(t[0], t[0]); */ /* 119: 1521bd25c61afe3a5e93c740 */\ +sqr_n_mul(t[0], t[0], 6, t[3]); /* 120: 1521bd25c61afe3a5e93c755 */\ +/* sqr(t[0], t[0]); */ /* 121: 2a437a4b8c35fc74bd278eaa */\ +/* sqr(t[0], t[0]); */ /* 122: 5486f497186bf8e97a4f1d54 */\ +/* sqr(t[0], t[0]); */ /* 123: a90de92e30d7f1d2f49e3aa8 */\ +/* sqr(t[0], t[0]); */ /* 124: 1521bd25c61afe3a5e93c7550 */\ +/* sqr(t[0], t[0]); */ /* 125: 2a437a4b8c35fc74bd278eaa0 */\ +/* sqr(t[0], t[0]); */ /* 126: 5486f497186bf8e97a4f1d540 */\ +/* sqr(t[0], t[0]); */ /* 127: a90de92e30d7f1d2f49e3aa80 */\ +/* sqr(t[0], t[0]); */ /* 128: 1521bd25c61afe3a5e93c75500 */\ +sqr_n_mul(t[0], t[0], 8, t[9]); /* 129: 1521bd25c61afe3a5e93c75511 */\ +/* sqr(t[0], t[0]); */ /* 130: 2a437a4b8c35fc74bd278eaa22 */\ +/* sqr(t[0], t[0]); */ /* 131: 5486f497186bf8e97a4f1d5444 */\ +/* sqr(t[0], t[0]); */ /* 132: a90de92e30d7f1d2f49e3aa888 */\ +/* sqr(t[0], t[0]); */ /* 133: 1521bd25c61afe3a5e93c755110 */\ +/* sqr(t[0], t[0]); */ /* 134: 2a437a4b8c35fc74bd278eaa220 */\ +sqr_n_mul(t[0], t[0], 5, t[10]); /* 135: 2a437a4b8c35fc74bd278eaa22f */\ +/* sqr(t[0], t[0]); */ /* 136: 5486f497186bf8e97a4f1d5445e */\ +/* sqr(t[0], t[0]); */ /* 137: a90de92e30d7f1d2f49e3aa88bc */\ +/* sqr(t[0], t[0]); */ /* 138: 1521bd25c61afe3a5e93c7551178 */\ +/* sqr(t[0], t[0]); */ /* 139: 2a437a4b8c35fc74bd278eaa22f0 */\ +/* sqr(t[0], t[0]); */ /* 140: 5486f497186bf8e97a4f1d5445e0 */\ +/* sqr(t[0], t[0]); */ /* 141: a90de92e30d7f1d2f49e3aa88bc0 */\ +sqr_n_mul(t[0], t[0], 6, t[14]); /* 142: a90de92e30d7f1d2f49e3aa88bc9 */\ +/* sqr(t[0], t[0]); */ /* 143: 1521bd25c61afe3a5e93c75511792 */\ +/* sqr(t[0], t[0]); */ /* 144: 2a437a4b8c35fc74bd278eaa22f24 */\ +/* sqr(t[0], t[0]); */ /* 145: 5486f497186bf8e97a4f1d5445e48 */\ +/* sqr(t[0], t[0]); */ /* 146: a90de92e30d7f1d2f49e3aa88bc90 */\ +/* sqr(t[0], t[0]); */ /* 147: 1521bd25c61afe3a5e93c755117920 */\ +sqr_n_mul(t[0], t[0], 5, t[10]); /* 148: 1521bd25c61afe3a5e93c75511792f */\ +/* sqr(t[0], t[0]); */ /* 149: 2a437a4b8c35fc74bd278eaa22f25e */\ +/* sqr(t[0], t[0]); */ /* 150: 5486f497186bf8e97a4f1d5445e4bc */\ +sqr_n_mul(t[0], t[0], 2, t[7]); /* 151: 5486f497186bf8e97a4f1d5445e4bd */\ +/* sqr(t[0], t[0]); */ /* 152: a90de92e30d7f1d2f49e3aa88bc97a */\ +/* sqr(t[0], t[0]); */ /* 153: 1521bd25c61afe3a5e93c75511792f4 */\ +/* sqr(t[0], t[0]); */ /* 154: 2a437a4b8c35fc74bd278eaa22f25e8 */\ +/* sqr(t[0], t[0]); */ /* 155: 5486f497186bf8e97a4f1d5445e4bd0 */\ +/* sqr(t[0], t[0]); */ /* 156: a90de92e30d7f1d2f49e3aa88bc97a0 */\ +/* sqr(t[0], t[0]); */ /* 157: 1521bd25c61afe3a5e93c75511792f40 */\ +sqr_n_mul(t[0], t[0], 6, t[10]); /* 158: 1521bd25c61afe3a5e93c75511792f4f */\ +/* sqr(t[0], t[0]); */ /* 159: 2a437a4b8c35fc74bd278eaa22f25e9e */\ +/* sqr(t[0], t[0]); */ /* 160: 5486f497186bf8e97a4f1d5445e4bd3c */\ +/* sqr(t[0], t[0]); */ /* 161: a90de92e30d7f1d2f49e3aa88bc97a78 */\ +/* 
sqr(t[0], t[0]); */ /* 162: 1521bd25c61afe3a5e93c75511792f4f0 */\ +/* sqr(t[0], t[0]); */ /* 163: 2a437a4b8c35fc74bd278eaa22f25e9e0 */\ +/* sqr(t[0], t[0]); */ /* 164: 5486f497186bf8e97a4f1d5445e4bd3c0 */\ +/* sqr(t[0], t[0]); */ /* 165: a90de92e30d7f1d2f49e3aa88bc97a780 */\ +sqr_n_mul(t[0], t[0], 7, t[13]); /* 166: a90de92e30d7f1d2f49e3aa88bc97a78b */\ +/* sqr(t[0], t[0]); */ /* 167: 1521bd25c61afe3a5e93c75511792f4f16 */\ +/* sqr(t[0], t[0]); */ /* 168: 2a437a4b8c35fc74bd278eaa22f25e9e2c */\ +/* sqr(t[0], t[0]); */ /* 169: 5486f497186bf8e97a4f1d5445e4bd3c58 */\ +/* sqr(t[0], t[0]); */ /* 170: a90de92e30d7f1d2f49e3aa88bc97a78b0 */\ +sqr_n_mul(t[0], t[0], 4, t[2]); /* 171: a90de92e30d7f1d2f49e3aa88bc97a78b7 */\ +/* sqr(t[0], t[0]); */ /* 172: 1521bd25c61afe3a5e93c75511792f4f16e */\ +/* sqr(t[0], t[0]); */ /* 173: 2a437a4b8c35fc74bd278eaa22f25e9e2dc */\ +/* sqr(t[0], t[0]); */ /* 174: 5486f497186bf8e97a4f1d5445e4bd3c5b8 */\ +/* sqr(t[0], t[0]); */ /* 175: a90de92e30d7f1d2f49e3aa88bc97a78b70 */\ +/* sqr(t[0], t[0]); */ /* 176: 1521bd25c61afe3a5e93c75511792f4f16e0 */\ +/* sqr(t[0], t[0]); */ /* 177: 2a437a4b8c35fc74bd278eaa22f25e9e2dc0 */\ +sqr_n_mul(t[0], t[0], 6, t[14]); /* 178: 2a437a4b8c35fc74bd278eaa22f25e9e2dc9 */\ +/* sqr(t[0], t[0]); */ /* 179: 5486f497186bf8e97a4f1d5445e4bd3c5b92 */\ +/* sqr(t[0], t[0]); */ /* 180: a90de92e30d7f1d2f49e3aa88bc97a78b724 */\ +/* sqr(t[0], t[0]); */ /* 181: 1521bd25c61afe3a5e93c75511792f4f16e48 */\ +/* sqr(t[0], t[0]); */ /* 182: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90 */\ +/* sqr(t[0], t[0]); */ /* 183: 5486f497186bf8e97a4f1d5445e4bd3c5b920 */\ +/* sqr(t[0], t[0]); */ /* 184: a90de92e30d7f1d2f49e3aa88bc97a78b7240 */\ +/* sqr(t[0], t[0]); */ /* 185: 1521bd25c61afe3a5e93c75511792f4f16e480 */\ +sqr_n_mul(t[0], t[0], 7, t[2]); /* 186: 1521bd25c61afe3a5e93c75511792f4f16e487 */\ +/* sqr(t[0], t[0]); */ /* 187: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e */\ +/* sqr(t[0], t[0]); */ /* 188: 5486f497186bf8e97a4f1d5445e4bd3c5b921c */\ +/* sqr(t[0], t[0]); */ /* 189: a90de92e30d7f1d2f49e3aa88bc97a78b72438 */\ +/* sqr(t[0], t[0]); */ /* 190: 1521bd25c61afe3a5e93c75511792f4f16e4870 */\ +/* sqr(t[0], t[0]); */ /* 191: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e0 */\ +sqr_n_mul(t[0], t[0], 5, t[15]); /* 192: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e5 */\ +/* sqr(t[0], t[0]); */ /* 193: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca */\ +/* sqr(t[0], t[0]); */ /* 194: a90de92e30d7f1d2f49e3aa88bc97a78b724394 */\ +/* sqr(t[0], t[0]); */ /* 195: 1521bd25c61afe3a5e93c75511792f4f16e48728 */\ +/* sqr(t[0], t[0]); */ /* 196: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50 */\ +/* sqr(t[0], t[0]); */ /* 197: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca0 */\ +/* sqr(t[0], t[0]); */ /* 198: a90de92e30d7f1d2f49e3aa88bc97a78b7243940 */\ +/* sqr(t[0], t[0]); */ /* 199: 1521bd25c61afe3a5e93c75511792f4f16e487280 */\ +sqr_n_mul(t[0], t[0], 7, t[2]); /* 200: 1521bd25c61afe3a5e93c75511792f4f16e487287 */\ +/* sqr(t[0], t[0]); */ /* 201: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e */\ +/* sqr(t[0], t[0]); */ /* 202: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1c */\ +/* sqr(t[0], t[0]); */ /* 203: a90de92e30d7f1d2f49e3aa88bc97a78b72439438 */\ +/* sqr(t[0], t[0]); */ /* 204: 1521bd25c61afe3a5e93c75511792f4f16e4872870 */\ +/* sqr(t[0], t[0]); */ /* 205: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e0 */\ +sqr_n_mul(t[0], t[0], 5, t[2]); /* 206: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7 */\ +/* sqr(t[0], t[0]); */ /* 207: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce */\ +/* sqr(t[0], t[0]); */ /* 208: 
a90de92e30d7f1d2f49e3aa88bc97a78b72439439c */\ +/* sqr(t[0], t[0]); */ /* 209: 1521bd25c61afe3a5e93c75511792f4f16e48728738 */\ +/* sqr(t[0], t[0]); */ /* 210: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e70 */\ +/* sqr(t[0], t[0]); */ /* 211: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce0 */\ +/* sqr(t[0], t[0]); */ /* 212: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c0 */\ +/* sqr(t[0], t[0]); */ /* 213: 1521bd25c61afe3a5e93c75511792f4f16e487287380 */\ +/* sqr(t[0], t[0]); */ /* 214: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e700 */\ +/* sqr(t[0], t[0]); */ /* 215: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce00 */\ +/* sqr(t[0], t[0]); */ /* 216: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c00 */\ +sqr_n_mul(t[0], t[0], 10, t[9]); /* 217: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11 */\ +/* sqr(t[0], t[0]); */ /* 218: 1521bd25c61afe3a5e93c75511792f4f16e4872873822 */\ +/* sqr(t[0], t[0]); */ /* 219: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7044 */\ +/* sqr(t[0], t[0]); */ /* 220: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce088 */\ +sqr_n_mul(t[0], t[0], 3, t[15]); /* 221: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d */\ +/* sqr(t[0], t[0]); */ /* 222: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11a */\ +/* sqr(t[0], t[0]); */ /* 223: 1521bd25c61afe3a5e93c75511792f4f16e48728738234 */\ +/* sqr(t[0], t[0]); */ /* 224: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e70468 */\ +/* sqr(t[0], t[0]); */ /* 225: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d0 */\ +/* sqr(t[0], t[0]); */ /* 226: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11a0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 227: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad */\ +/* sqr(t[0], t[0]); */ /* 228: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a */\ +/* sqr(t[0], t[0]); */ /* 229: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b4 */\ +/* sqr(t[0], t[0]); */ /* 230: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68 */\ +/* sqr(t[0], t[0]); */ /* 231: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad0 */\ +/* sqr(t[0], t[0]); */ /* 232: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a0 */\ +/* sqr(t[0], t[0]); */ /* 233: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b40 */\ +/* sqr(t[0], t[0]); */ /* 234: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d680 */\ +/* sqr(t[0], t[0]); */ /* 235: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad00 */\ +sqr_n_mul(t[0], t[0], 8, t[5]); /* 236: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19 */\ +/* sqr(t[0], t[0]); */ /* 237: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a32 */\ +/* sqr(t[0], t[0]); */ /* 238: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b464 */\ +/* sqr(t[0], t[0]); */ /* 239: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68c8 */\ +/* sqr(t[0], t[0]); */ /* 240: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad190 */\ +/* sqr(t[0], t[0]); */ /* 241: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a320 */\ +sqr_n_mul(t[0], t[0], 5, t[6]); /* 242: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a337 */\ +/* sqr(t[0], t[0]); */ /* 243: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e */\ +/* sqr(t[0], t[0]); */ /* 244: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdc */\ +/* sqr(t[0], t[0]); */ /* 245: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b8 */\ +/* sqr(t[0], t[0]); */ /* 246: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3370 */\ +/* sqr(t[0], t[0]); */ /* 247: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e0 */\ +/* sqr(t[0], t[0]); */ /* 248: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdc0 */\ +sqr_n_mul(t[0], t[0], 6, t[13]); /* 249: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb */\ +/* 
sqr(t[0], t[0]); */ /* 250: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b96 */\ +/* sqr(t[0], t[0]); */ /* 251: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372c */\ +/* sqr(t[0], t[0]); */ /* 252: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e58 */\ +/* sqr(t[0], t[0]); */ /* 253: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb0 */\ +/* sqr(t[0], t[0]); */ /* 254: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b960 */\ +/* sqr(t[0], t[0]); */ /* 255: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372c0 */\ +sqr_n_mul(t[0], t[0], 6, t[10]); /* 256: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf */\ +/* sqr(t[0], t[0]); */ /* 257: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e */\ +/* sqr(t[0], t[0]); */ /* 258: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c */\ +/* sqr(t[0], t[0]); */ /* 259: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b9678 */\ +/* sqr(t[0], t[0]); */ /* 260: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf0 */\ +/* sqr(t[0], t[0]); */ /* 261: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e0 */\ +/* sqr(t[0], t[0]); */ /* 262: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c0 */\ +sqr_n_mul(t[0], t[0], 6, t[14]); /* 263: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c9 */\ +/* sqr(t[0], t[0]); */ /* 264: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b96792 */\ +/* sqr(t[0], t[0]); */ /* 265: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf24 */\ +/* sqr(t[0], t[0]); */ /* 266: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e48 */\ +/* sqr(t[0], t[0]); */ /* 267: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c90 */\ +/* sqr(t[0], t[0]); */ /* 268: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967920 */\ +/* sqr(t[0], t[0]); */ /* 269: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf240 */\ +/* sqr(t[0], t[0]); */ /* 270: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e480 */\ +sqr_n_mul(t[0], t[0], 7, t[16]); /* 271: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e493 */\ +/* sqr(t[0], t[0]); */ /* 272: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c926 */\ +/* sqr(t[0], t[0]); */ /* 273: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924c */\ +/* sqr(t[0], t[0]); */ /* 274: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf2498 */\ +/* sqr(t[0], t[0]); */ /* 275: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e4930 */\ +/* sqr(t[0], t[0]); */ /* 276: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c9260 */\ +sqr_n_mul(t[0], t[0], 5, t[14]); /* 277: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c9269 */\ +/* sqr(t[0], t[0]); */ /* 278: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d2 */\ +/* sqr(t[0], t[0]); */ /* 279: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4 */\ +/* sqr(t[0], t[0]); */ /* 280: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49348 */\ +/* sqr(t[0], t[0]); */ /* 281: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92690 */\ +/* sqr(t[0], t[0]); */ /* 282: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d20 */\ +/* sqr(t[0], t[0]); */ /* 283: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a40 */\ +sqr_n_mul(t[0], t[0], 6, t[10]); /* 284: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f */\ +/* sqr(t[0], t[0]); */ /* 285: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e */\ +/* sqr(t[0], t[0]); */ /* 286: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693c */\ +/* sqr(t[0], t[0]); */ /* 287: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d278 
*/\ +/* sqr(t[0], t[0]); */ /* 288: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f0 */\ +/* sqr(t[0], t[0]); */ /* 289: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e0 */\ +/* sqr(t[0], t[0]); */ /* 290: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693c0 */\ +sqr_n_mul(t[0], t[0], 6, t[9]); /* 291: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d1 */\ +/* sqr(t[0], t[0]); */ /* 292: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2 */\ +/* sqr(t[0], t[0]); */ /* 293: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f44 */\ +/* sqr(t[0], t[0]); */ /* 294: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e88 */\ +/* sqr(t[0], t[0]); */ /* 295: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d10 */\ +/* sqr(t[0], t[0]); */ /* 296: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a20 */\ +sqr_n_mul(t[0], t[0], 5, t[10]); /* 297: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f */\ +/* sqr(t[0], t[0]); */ /* 298: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e */\ +/* sqr(t[0], t[0]); */ /* 299: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bc */\ +sqr_n_mul(t[0], t[0], 2, t[7]); /* 300: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd */\ +/* sqr(t[0], t[0]); */ /* 301: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a */\ +/* sqr(t[0], t[0]); */ /* 302: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f4 */\ +/* sqr(t[0], t[0]); */ /* 303: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e8 */\ +/* sqr(t[0], t[0]); */ /* 304: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd0 */\ +/* sqr(t[0], t[0]); */ /* 305: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0 */\ +/* sqr(t[0], t[0]); */ /* 306: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f40 */\ +/* sqr(t[0], t[0]); */ /* 307: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e80 */\ +/* sqr(t[0], t[0]); */ /* 308: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd00 */\ +sqr_n_mul(t[0], t[0], 8, t[15]); /* 309: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd05 */\ +/* sqr(t[0], t[0]); */ /* 310: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a */\ +/* sqr(t[0], t[0]); */ /* 311: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414 */\ +/* sqr(t[0], t[0]); */ /* 312: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e828 */\ +/* sqr(t[0], t[0]); */ /* 313: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050 */\ +/* sqr(t[0], t[0]); */ /* 314: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a0 */\ +/* sqr(t[0], t[0]); */ /* 315: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f4140 */\ +/* sqr(t[0], t[0]); */ /* 316: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e8280 */\ +sqr_n_mul(t[0], t[0], 7, t[15]); /* 317: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e8285 */\ +/* sqr(t[0], t[0]); */ /* 318: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a */\ +/* sqr(t[0], t[0]); */ /* 319: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14 */\ +/* sqr(t[0], t[0]); */ /* 320: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f41428 */\ +/* sqr(t[0], t[0]); */ /* 321: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82850 */\ +sqr_n_mul(t[0], t[0], 4, t[1]); /* 322: 
1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853 */\ +/* sqr(t[0], t[0]); */ /* 323: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a6 */\ +/* sqr(t[0], t[0]); */ /* 324: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c */\ +/* sqr(t[0], t[0]); */ /* 325: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298 */\ +/* sqr(t[0], t[0]); */ /* 326: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e828530 */\ +/* sqr(t[0], t[0]); */ /* 327: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a60 */\ +/* sqr(t[0], t[0]); */ /* 328: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c0 */\ +/* sqr(t[0], t[0]); */ /* 329: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f4142980 */\ +sqr_n_mul(t[0], t[0], 7, t[13]); /* 330: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b */\ +/* sqr(t[0], t[0]); */ /* 331: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e8285316 */\ +/* sqr(t[0], t[0]); */ /* 332: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62c */\ +/* sqr(t[0], t[0]); */ /* 333: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c58 */\ +/* sqr(t[0], t[0]); */ /* 334: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b0 */\ +/* sqr(t[0], t[0]); */ /* 335: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853160 */\ +/* sqr(t[0], t[0]); */ /* 336: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62c0 */\ +sqr_n_mul(t[0], t[0], 6, t[10]); /* 337: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cf */\ +/* sqr(t[0], t[0]); */ /* 338: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59e */\ +/* sqr(t[0], t[0]); */ /* 339: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3c */\ +/* sqr(t[0], t[0]); */ /* 340: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e828531678 */\ +/* sqr(t[0], t[0]); */ /* 341: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cf0 */\ +sqr_n_mul(t[0], t[0], 4, t[4]); /* 342: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd */\ +/* sqr(t[0], t[0]); */ /* 343: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa */\ +/* sqr(t[0], t[0]); */ /* 344: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f4 */\ +/* sqr(t[0], t[0]); */ /* 345: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8 */\ +/* sqr(t[0], t[0]); */ /* 346: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd0 */\ +/* sqr(t[0], t[0]); */ /* 347: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa0 */\ +/* sqr(t[0], t[0]); */ /* 348: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f40 */\ +/* sqr(t[0], t[0]); */ /* 349: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e80 */\ +sqr_n_mul(t[0], t[0], 7, t[13]); /* 350: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b */\ +/* sqr(t[0], t[0]); */ /* 351: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16 */\ +/* sqr(t[0], t[0]); */ /* 352: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2c */\ +/* sqr(t[0], t[0]); */ /* 353: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f458 */\ +/* sqr(t[0], t[0]); */ /* 354: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b0 */\ +/* 
sqr(t[0], t[0]); */ /* 355: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd160 */\ +/* sqr(t[0], t[0]); */ /* 356: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2c0 */\ +sqr_n_mul(t[0], t[0], 6, t[12]); /* 357: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2db */\ +/* sqr(t[0], t[0]); */ /* 358: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b6 */\ +/* sqr(t[0], t[0]); */ /* 359: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6c */\ +/* sqr(t[0], t[0]); */ /* 360: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16d8 */\ +/* sqr(t[0], t[0]); */ /* 361: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2db0 */\ +/* sqr(t[0], t[0]); */ /* 362: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b60 */\ +sqr_n_mul(t[0], t[0], 5, t[6]); /* 363: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b77 */\ +/* sqr(t[0], t[0]); */ /* 364: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee */\ +/* sqr(t[0], t[0]); */ /* 365: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddc */\ +/* sqr(t[0], t[0]); */ /* 366: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb8 */\ +/* sqr(t[0], t[0]); */ /* 367: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b770 */\ +/* sqr(t[0], t[0]); */ /* 368: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee0 */\ +sqr_n_mul(t[0], t[0], 5, t[15]); /* 369: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5 */\ +/* sqr(t[0], t[0]); */ /* 370: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca */\ +/* sqr(t[0], t[0]); */ /* 371: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94 */\ +/* sqr(t[0], t[0]); */ /* 372: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7728 */\ +/* sqr(t[0], t[0]); */ /* 373: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee50 */\ +/* sqr(t[0], t[0]); */ /* 374: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca0 */\ +/* sqr(t[0], t[0]); */ /* 375: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb940 */\ +/* sqr(t[0], t[0]); */ /* 376: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b77280 */\ +sqr_n_mul(t[0], t[0], 7, t[12]); /* 377: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729b */\ +/* sqr(t[0], t[0]); */ /* 378: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee536 */\ +/* sqr(t[0], t[0]); */ /* 379: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6c */\ +/* sqr(t[0], t[0]); */ /* 380: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94d8 */\ +/* sqr(t[0], t[0]); */ /* 381: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729b0 */\ +/* sqr(t[0], t[0]); */ /* 382: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5360 */\ +sqr_n_mul(t[0], t[0], 5, t[6]); /* 383: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377 */\ +/* sqr(t[0], t[0]); */ /* 384: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ee */\ +/* sqr(t[0], t[0]); */ /* 385: 
5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddc */\ +/* sqr(t[0], t[0]); */ /* 386: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bb8 */\ +/* sqr(t[0], t[0]); */ /* 387: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee53770 */\ +/* sqr(t[0], t[0]); */ /* 388: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ee0 */\ +sqr_n_mul(t[0], t[0], 5, t[3]); /* 389: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef5 */\ +/* sqr(t[0], t[0]); */ /* 390: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea */\ +/* sqr(t[0], t[0]); */ /* 391: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4 */\ +/* sqr(t[0], t[0]); */ /* 392: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a8 */\ +/* sqr(t[0], t[0]); */ /* 393: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef50 */\ +sqr_n_mul(t[0], t[0], 4, t[1]); /* 394: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53 */\ +/* sqr(t[0], t[0]); */ /* 395: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea6 */\ +/* sqr(t[0], t[0]); */ /* 396: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c */\ +/* sqr(t[0], t[0]); */ /* 397: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98 */\ +/* sqr(t[0], t[0]); */ /* 398: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef530 */\ +/* sqr(t[0], t[0]); */ /* 399: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea60 */\ +/* sqr(t[0], t[0]); */ /* 400: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c0 */\ +sqr_n_mul(t[0], t[0], 6, t[15]); /* 401: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c5 */\ +/* sqr(t[0], t[0]); */ /* 402: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a */\ +/* sqr(t[0], t[0]); */ /* 403: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef5314 */\ +/* sqr(t[0], t[0]); */ /* 404: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea628 */\ +/* sqr(t[0], t[0]); */ /* 405: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c50 */\ +/* sqr(t[0], t[0]); */ /* 406: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a0 */\ +/* sqr(t[0], t[0]); */ /* 407: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53140 */\ +sqr_n_mul(t[0], t[0], 6, t[14]); /* 408: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149 */\ +/* sqr(t[0], t[0]); */ /* 409: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea6292 */\ +/* sqr(t[0], t[0]); */ /* 410: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524 */\ +/* sqr(t[0], t[0]); */ /* 411: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a48 */\ +/* sqr(t[0], t[0]); */ /* 412: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef531490 */\ +sqr_n_mul(t[0], t[0], 4, t[1]); /* 413: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef531493 */\ +/* sqr(t[0], t[0]); */ /* 414: 
5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926 */\ +/* sqr(t[0], t[0]); */ /* 415: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524c */\ +/* sqr(t[0], t[0]); */ /* 416: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a498 */\ +/* sqr(t[0], t[0]); */ /* 417: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef5314930 */\ +sqr_n_mul(t[0], t[0], 4, t[1]); /* 418: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef5314933 */\ +/* sqr(t[0], t[0]); */ /* 419: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea629266 */\ +/* sqr(t[0], t[0]); */ /* 420: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc */\ +/* sqr(t[0], t[0]); */ /* 421: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a4998 */\ +/* sqr(t[0], t[0]); */ /* 422: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330 */\ +/* sqr(t[0], t[0]); */ /* 423: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea6292660 */\ +/* sqr(t[0], t[0]); */ /* 424: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc0 */\ +/* sqr(t[0], t[0]); */ /* 425: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49980 */\ +/* sqr(t[0], t[0]); */ /* 426: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef531493300 */\ +sqr_n_mul(t[0], t[0], 8, t[14]); /* 427: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef531493309 */\ +/* sqr(t[0], t[0]); */ /* 428: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612 */\ +/* sqr(t[0], t[0]); */ /* 429: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc24 */\ +/* sqr(t[0], t[0]); */ /* 430: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a499848 */\ +/* sqr(t[0], t[0]); */ /* 431: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef5314933090 */\ +/* sqr(t[0], t[0]); */ /* 432: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea629266120 */\ +sqr_n_mul(t[0], t[0], 5, t[10]); /* 433: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f */\ +/* sqr(t[0], t[0]); */ /* 434: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e */\ +/* sqr(t[0], t[0]); */ /* 435: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc */\ +/* sqr(t[0], t[0]); */ /* 436: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978 */\ +/* sqr(t[0], t[0]); */ /* 437: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f0 */\ +/* sqr(t[0], t[0]); */ /* 438: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e0 */\ +/* sqr(t[0], t[0]); */ /* 439: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc0 */\ +sqr_n_mul(t[0], t[0], 6, t[2]); /* 440: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc7 */\ +/* sqr(t[0], t[0]); */ /* 441: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978e */\ +/* sqr(t[0], 
t[0]); */ /* 442: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1c */\ +/* sqr(t[0], t[0]); */ /* 443: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e38 */\ +/* sqr(t[0], t[0]); */ /* 444: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc70 */\ +/* sqr(t[0], t[0]); */ /* 445: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978e0 */\ +sqr_n_mul(t[0], t[0], 5, t[10]); /* 446: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef */\ +/* sqr(t[0], t[0]); */ /* 447: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de */\ +/* sqr(t[0], t[0]); */ /* 448: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc */\ +/* sqr(t[0], t[0]); */ /* 449: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc778 */\ +/* sqr(t[0], t[0]); */ /* 450: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef0 */\ +/* sqr(t[0], t[0]); */ /* 451: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de0 */\ +/* sqr(t[0], t[0]); */ /* 452: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0 */\ +/* sqr(t[0], t[0]); */ /* 453: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc7780 */\ +/* sqr(t[0], t[0]); */ /* 454: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef00 */\ +/* sqr(t[0], t[0]); */ /* 455: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de00 */\ +/* sqr(t[0], t[0]); */ /* 456: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc00 */\ +/* sqr(t[0], t[0]); */ /* 457: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77800 */\ +/* sqr(t[0], t[0]); */ /* 458: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef000 */\ +sqr_n_mul(t[0], t[0], 12, t[9]); /* 459: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011 */\ +/* sqr(t[0], t[0]); */ /* 460: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de022 */\ +/* sqr(t[0], t[0]); */ /* 461: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc044 */\ +/* sqr(t[0], t[0]); */ /* 462: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc778088 */\ +/* sqr(t[0], t[0]); */ /* 463: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef0110 */\ +sqr_n_mul(t[0], t[0], 4, t[4]); /* 464: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d */\ +/* sqr(t[0], t[0]); */ /* 465: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023a */\ +/* sqr(t[0], t[0]); */ /* 466: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0474 */\ +/* sqr(t[0], t[0]); */ /* 467: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808e8 */\ +/* sqr(t[0], t[0]); */ /* 468: 
2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d0 */\ +/* sqr(t[0], t[0]); */ /* 469: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023a0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 470: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad */\ +/* sqr(t[0], t[0]); */ /* 471: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a */\ +/* sqr(t[0], t[0]); */ /* 472: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb4 */\ +/* sqr(t[0], t[0]); */ /* 473: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68 */\ +/* sqr(t[0], t[0]); */ /* 474: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0 */\ +/* sqr(t[0], t[0]); */ /* 475: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a0 */\ +/* sqr(t[0], t[0]); */ /* 476: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb40 */\ +sqr_n_mul(t[0], t[0], 6, t[1]); /* 477: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb43 */\ +/* sqr(t[0], t[0]); */ /* 478: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d686 */\ +/* sqr(t[0], t[0]); */ /* 479: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c */\ +/* sqr(t[0], t[0]); */ /* 480: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18 */\ +/* sqr(t[0], t[0]); */ /* 481: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430 */\ +/* sqr(t[0], t[0]); */ /* 482: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d6860 */\ +/* sqr(t[0], t[0]); */ /* 483: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c0 */\ +/* sqr(t[0], t[0]); */ /* 484: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a180 */\ +/* sqr(t[0], t[0]); */ /* 485: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb4300 */\ +/* sqr(t[0], t[0]); */ /* 486: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68600 */\ +sqr_n_mul(t[0], t[0], 9, t[5]); /* 487: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619 */\ +/* sqr(t[0], t[0]); */ /* 488: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c32 */\ +/* sqr(t[0], t[0]); */ /* 489: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a1864 */\ +/* sqr(t[0], t[0]); */ /* 490: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430c8 */\ +/* sqr(t[0], t[0]); */ /* 491: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d686190 */\ +/* sqr(t[0], t[0]); */ /* 492: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c320 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 493: 
5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c339 */\ +/* sqr(t[0], t[0]); */ /* 494: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672 */\ +/* sqr(t[0], t[0]); */ /* 495: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce4 */\ +/* sqr(t[0], t[0]); */ /* 496: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c8 */\ +/* sqr(t[0], t[0]); */ /* 497: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390 */\ +/* sqr(t[0], t[0]); */ /* 498: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a186720 */\ +/* sqr(t[0], t[0]); */ /* 499: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce40 */\ +sqr_n_mul(t[0], t[0], 6, t[1]); /* 500: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce43 */\ +/* sqr(t[0], t[0]); */ /* 501: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86 */\ +/* sqr(t[0], t[0]); */ /* 502: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c */\ +/* sqr(t[0], t[0]); */ /* 503: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a1867218 */\ +/* sqr(t[0], t[0]); */ /* 504: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430 */\ +/* sqr(t[0], t[0]); */ /* 505: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c860 */\ +/* sqr(t[0], t[0]); */ /* 506: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c0 */\ +sqr_n_mul(t[0], t[0], 6, t[1]); /* 507: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c3 */\ +/* sqr(t[0], t[0]); */ /* 508: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186 */\ +/* sqr(t[0], t[0]); */ /* 509: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c */\ +/* sqr(t[0], t[0]); */ /* 510: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c8618 */\ +/* sqr(t[0], t[0]); */ /* 511: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30 */\ +/* sqr(t[0], t[0]); */ /* 512: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a186721860 */\ +/* sqr(t[0], t[0]); */ /* 513: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c0 */\ +/* sqr(t[0], t[0]); */ /* 514: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86180 */\ +/* sqr(t[0], t[0]); */ /* 515: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c300 */\ +/* sqr(t[0], t[0]); */ /* 516: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a1867218600 */\ +sqr_n_mul(t[0], t[0], 9, t[6]); /* 517: 
a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a1867218617 */\ +/* sqr(t[0], t[0]); */ /* 518: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e */\ +/* sqr(t[0], t[0]); */ /* 519: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c */\ +/* sqr(t[0], t[0]); */ /* 520: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8 */\ +/* sqr(t[0], t[0]); */ /* 521: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186170 */\ +/* sqr(t[0], t[0]); */ /* 522: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e0 */\ +/* sqr(t[0], t[0]); */ /* 523: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c0 */\ +/* sqr(t[0], t[0]); */ /* 524: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b80 */\ +sqr_n_mul(t[0], t[0], 7, t[10]); /* 525: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f */\ +/* sqr(t[0], t[0]); */ /* 526: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171e */\ +/* sqr(t[0], t[0]); */ /* 527: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3c */\ +/* sqr(t[0], t[0]); */ /* 528: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c78 */\ +/* sqr(t[0], t[0]); */ /* 529: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f0 */\ +/* sqr(t[0], t[0]); */ /* 530: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171e0 */\ +/* sqr(t[0], t[0]); */ /* 531: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3c0 */\ +sqr_n_mul(t[0], t[0], 6, t[5]); /* 532: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d9 */\ +/* sqr(t[0], t[0]); */ /* 533: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b2 */\ +/* sqr(t[0], t[0]); */ /* 534: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f64 */\ +/* sqr(t[0], t[0]); */ /* 535: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171ec8 */\ +/* sqr(t[0], t[0]); */ /* 536: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d90 */\ +/* sqr(t[0], t[0]); */ /* 537: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b20 */\ +sqr_n_mul(t[0], t[0], 5, t[14]); /* 538: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b29 */\ +/* sqr(t[0], t[0]); */ /* 539: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f652 */\ +/* sqr(t[0], t[0]); */ /* 540: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4 */\ +/* 
sqr(t[0], t[0]); */ /* 541: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d948 */\ +/* sqr(t[0], t[0]); */ /* 542: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b290 */\ +/* sqr(t[0], t[0]); */ /* 543: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6520 */\ +/* sqr(t[0], t[0]); */ /* 544: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca40 */\ +/* sqr(t[0], t[0]); */ /* 545: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d9480 */\ +sqr_n_mul(t[0], t[0], 7, t[6]); /* 546: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d9497 */\ +/* sqr(t[0], t[0]); */ /* 547: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e */\ +/* sqr(t[0], t[0]); */ /* 548: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525c */\ +sqr_n_mul(t[0], t[0], 2, t[7]); /* 549: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d */\ +/* sqr(t[0], t[0]); */ /* 550: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba */\ +/* sqr(t[0], t[0]); */ /* 551: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d94974 */\ +/* sqr(t[0], t[0]); */ /* 552: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e8 */\ +/* sqr(t[0], t[0]); */ /* 553: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0 */\ +/* sqr(t[0], t[0]); */ /* 554: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba0 */\ +/* sqr(t[0], t[0]); */ /* 555: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949740 */\ +/* sqr(t[0], t[0]); */ /* 556: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e80 */\ +/* sqr(t[0], t[0]); */ /* 557: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d00 */\ +sqr_n_mul(t[0], t[0], 8, t[13]); /* 558: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b */\ +/* sqr(t[0], t[0]); */ /* 559: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16 */\ +/* sqr(t[0], t[0]); */ /* 560: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742c */\ +/* sqr(t[0], t[0]); */ /* 561: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e858 */\ +/* sqr(t[0], t[0]); */ /* 562: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b0 */\ +sqr_n_mul(t[0], t[0], 4, t[15]); /* 563: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b5 */\ 
+/* sqr(t[0], t[0]); */ /* 564: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a */\ +/* sqr(t[0], t[0]); */ /* 565: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d4 */\ +/* sqr(t[0], t[0]); */ /* 566: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a8 */\ +/* sqr(t[0], t[0]); */ /* 567: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50 */\ +/* sqr(t[0], t[0]); */ /* 568: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a0 */\ +/* sqr(t[0], t[0]); */ /* 569: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d40 */\ +/* sqr(t[0], t[0]); */ /* 570: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a80 */\ +sqr_n_mul(t[0], t[0], 7, t[2]); /* 571: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87 */\ +/* sqr(t[0], t[0]); */ /* 572: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e */\ +/* sqr(t[0], t[0]); */ /* 573: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c */\ +/* sqr(t[0], t[0]); */ /* 574: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d438 */\ +/* sqr(t[0], t[0]); */ /* 575: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a870 */\ +/* sqr(t[0], t[0]); */ /* 576: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e0 */\ +/* sqr(t[0], t[0]); */ /* 577: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c0 */\ +/* sqr(t[0], t[0]); */ /* 578: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d4380 */\ +/* sqr(t[0], t[0]); */ /* 579: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a8700 */\ +sqr_n_mul(t[0], t[0], 8, t[14]); /* 580: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a8709 */\ +/* sqr(t[0], t[0]); */ /* 581: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e12 */\ +/* sqr(t[0], t[0]); */ /* 582: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24 */\ +/* sqr(t[0], t[0]); */ /* 583: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848 */\ +/* sqr(t[0], t[0]); */ /* 584: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87090 */\ +/* sqr(t[0], t[0]); */ /* 585: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e120 */\ +/* sqr(t[0], t[0]); */ /* 586: 
a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c240 */\ +/* sqr(t[0], t[0]); */ /* 587: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d438480 */\ +sqr_n_mul(t[0], t[0], 7, t[4]); /* 588: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d */\ +/* sqr(t[0], t[0]); */ /* 589: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a */\ +/* sqr(t[0], t[0]); */ /* 590: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234 */\ +/* sqr(t[0], t[0]); */ /* 591: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c2468 */\ +/* sqr(t[0], t[0]); */ /* 592: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d0 */\ +/* sqr(t[0], t[0]); */ /* 593: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a0 */\ +/* sqr(t[0], t[0]); */ /* 594: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e12340 */\ +/* sqr(t[0], t[0]); */ /* 595: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24680 */\ +/* sqr(t[0], t[0]); */ /* 596: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d00 */\ +/* sqr(t[0], t[0]); */ /* 597: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a00 */\ +/* sqr(t[0], t[0]); */ /* 598: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e123400 */\ +sqr_n_mul(t[0], t[0], 10, t[14]); /* 599: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e123409 */\ +/* sqr(t[0], t[0]); */ /* 600: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c246812 */\ +/* sqr(t[0], t[0]); */ /* 601: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024 */\ +/* sqr(t[0], t[0]); */ /* 602: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a048 */\ +/* sqr(t[0], t[0]); */ /* 603: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234090 */\ +/* sqr(t[0], t[0]); */ /* 604: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c2468120 */\ +/* sqr(t[0], t[0]); */ /* 605: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d0240 */\ +sqr_n_mul(t[0], t[0], 6, t[13]); /* 606: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b */\ +/* sqr(t[0], t[0]); */ /* 607: 
2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a0496 */\ +/* sqr(t[0], t[0]); */ /* 608: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092c */\ +/* sqr(t[0], t[0]); */ /* 609: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681258 */\ +/* sqr(t[0], t[0]); */ /* 610: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b0 */\ +/* sqr(t[0], t[0]); */ /* 611: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04960 */\ +/* sqr(t[0], t[0]); */ /* 612: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092c0 */\ +sqr_n_mul(t[0], t[0], 6, t[4]); /* 613: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd */\ +/* sqr(t[0], t[0]); */ /* 614: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259a */\ +/* sqr(t[0], t[0]); */ /* 615: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b34 */\ +/* sqr(t[0], t[0]); */ /* 616: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a049668 */\ +/* sqr(t[0], t[0]); */ /* 617: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd0 */\ +/* sqr(t[0], t[0]); */ /* 618: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259a0 */\ +/* sqr(t[0], t[0]); */ /* 619: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b340 */\ +sqr_n_mul(t[0], t[0], 6, t[11]); /* 620: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35f */\ +/* sqr(t[0], t[0]); */ /* 621: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966be */\ +/* sqr(t[0], t[0]); */ /* 622: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7c */\ +/* sqr(t[0], t[0]); */ /* 623: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259af8 */\ +/* sqr(t[0], t[0]); */ /* 624: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35f0 */\ +/* sqr(t[0], t[0]); */ /* 625: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966be0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 626: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf9 */\ +/* sqr(t[0], t[0]); */ /* 627: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f2 */\ +/* sqr(t[0], t[0]); */ /* 628: 
a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe4 */\ +/* sqr(t[0], t[0]); */ /* 629: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8 */\ +/* sqr(t[0], t[0]); */ /* 630: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf90 */\ +/* sqr(t[0], t[0]); */ /* 631: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f20 */\ +/* sqr(t[0], t[0]); */ /* 632: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe40 */\ +/* sqr(t[0], t[0]); */ /* 633: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc80 */\ +sqr_n_mul(t[0], t[0], 7, t[10]); /* 634: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f */\ +/* sqr(t[0], t[0]); */ /* 635: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91e */\ +/* sqr(t[0], t[0]); */ /* 636: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23c */\ +/* sqr(t[0], t[0]); */ /* 637: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe478 */\ +/* sqr(t[0], t[0]); */ /* 638: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f0 */\ +/* sqr(t[0], t[0]); */ /* 639: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91e0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 640: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed */\ +/* sqr(t[0], t[0]); */ /* 641: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da */\ +/* sqr(t[0], t[0]); */ /* 642: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4 */\ +/* sqr(t[0], t[0]); */ /* 643: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f68 */\ +/* sqr(t[0], t[0]); */ /* 644: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed0 */\ +/* sqr(t[0], t[0]); */ /* 645: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da0 */\ +/* sqr(t[0], t[0]); */ /* 646: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b40 */\ +/* sqr(t[0], t[0]); */ /* 647: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f680 */\ +sqr_n_mul(t[0], t[0], 7, t[11]); /* 648: 
1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f */\ +/* sqr(t[0], t[0]); */ /* 649: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e */\ +/* sqr(t[0], t[0]); */ /* 650: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7c */\ +/* sqr(t[0], t[0]); */ /* 651: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f8 */\ +/* sqr(t[0], t[0]); */ /* 652: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f0 */\ +/* sqr(t[0], t[0]); */ /* 653: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e0 */\ +sqr_n_mul(t[0], t[0], 5, t[2]); /* 654: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e7 */\ +/* sqr(t[0], t[0]); */ /* 655: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce */\ +/* sqr(t[0], t[0]); */ /* 656: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c */\ +/* sqr(t[0], t[0]); */ /* 657: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38 */\ +/* sqr(t[0], t[0]); */ /* 658: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e70 */\ +/* sqr(t[0], t[0]); */ /* 659: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce0 */\ +/* sqr(t[0], t[0]); */ /* 660: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c0 */\ +/* sqr(t[0], t[0]); */ /* 661: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f380 */\ +/* sqr(t[0], t[0]); */ /* 662: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e700 */\ +sqr_n_mul(t[0], t[0], 8, t[12]); /* 663: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b */\ +/* sqr(t[0], t[0]); */ /* 664: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce36 */\ +/* sqr(t[0], t[0]); */ /* 665: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6c */\ +/* sqr(t[0], t[0]); */ /* 666: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38d8 */\ +/* sqr(t[0], t[0]); */ /* 667: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b0 */\ +/* sqr(t[0], t[0]); */ /* 668: 
5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce360 */\ +/* sqr(t[0], t[0]); */ /* 669: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6c0 */\ +sqr_n_mul(t[0], t[0], 6, t[8]); /* 670: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6dd */\ +/* sqr(t[0], t[0]); */ /* 671: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba */\ +/* sqr(t[0], t[0]); */ /* 672: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b74 */\ +/* sqr(t[0], t[0]); */ /* 673: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce36e8 */\ +/* sqr(t[0], t[0]); */ /* 674: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6dd0 */\ +/* sqr(t[0], t[0]); */ /* 675: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba0 */\ +/* sqr(t[0], t[0]); */ /* 676: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b740 */\ +sqr_n_mul(t[0], t[0], 6, t[1]); /* 677: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b743 */\ +/* sqr(t[0], t[0]); */ /* 678: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce36e86 */\ +/* sqr(t[0], t[0]); */ /* 679: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6dd0c */\ +/* sqr(t[0], t[0]); */ /* 680: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba18 */\ +/* sqr(t[0], t[0]); */ /* 681: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b7430 */\ +/* sqr(t[0], t[0]); */ /* 682: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce36e860 */\ +/* sqr(t[0], t[0]); */ /* 683: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6dd0c0 */\ +/* sqr(t[0], t[0]); */ /* 684: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba180 */\ +sqr_n_mul(t[0], t[0], 7, t[13]); /* 685: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba18b */\ +/* sqr(t[0], t[0]); */ /* 686: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b74316 */\ +/* sqr(t[0], t[0]); */ /* 687: 
5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce36e862c */\ +/* sqr(t[0], t[0]); */ /* 688: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6dd0c58 */\ +/* sqr(t[0], t[0]); */ /* 689: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba18b0 */\ +/* sqr(t[0], t[0]); */ /* 690: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b743160 */\ +/* sqr(t[0], t[0]); */ /* 691: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce36e862c0 */\ +/* sqr(t[0], t[0]); */ /* 692: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6dd0c580 */\ +sqr_n_mul(t[0], t[0], 7, t[13]); /* 693: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6dd0c58b */\ +/* sqr(t[0], t[0]); */ /* 694: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba18b16 */\ +/* sqr(t[0], t[0]); */ /* 695: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b743162c */\ +/* sqr(t[0], t[0]); */ /* 696: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce36e862c58 */\ +/* sqr(t[0], t[0]); */ /* 697: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6dd0c58b0 */\ +/* sqr(t[0], t[0]); */ /* 698: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba18b160 */\ +/* sqr(t[0], t[0]); */ /* 699: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b743162c0 */\ +sqr_n_mul(t[0], t[0], 6, t[1]); /* 700: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b743162c3 */\ +/* sqr(t[0], t[0]); */ /* 701: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce36e862c586 */\ +/* sqr(t[0], t[0]); */ /* 702: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6dd0c58b0c */\ +/* sqr(t[0], t[0]); */ /* 703: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba18b1618 */\ +/* sqr(t[0], t[0]); */ /* 704: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b743162c30 */\ +/* sqr(t[0], t[0]); */ /* 705: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce36e862c5860 */\ +sqr_n_mul(t[0], t[0], 5, t[2]); /* 706: 
5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce36e862c5867 */\ +/* sqr(t[0], t[0]); */ /* 707: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6dd0c58b0ce */\ +/* sqr(t[0], t[0]); */ /* 708: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba18b1619c */\ +/* sqr(t[0], t[0]); */ /* 709: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b743162c338 */\ +/* sqr(t[0], t[0]); */ /* 710: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce36e862c58670 */\ +/* sqr(t[0], t[0]); */ /* 711: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6dd0c58b0ce0 */\ +/* sqr(t[0], t[0]); */ /* 712: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba18b1619c0 */\ +/* sqr(t[0], t[0]); */ /* 713: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b743162c3380 */\ +/* sqr(t[0], t[0]); */ /* 714: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce36e862c586700 */\ +/* sqr(t[0], t[0]); */ /* 715: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6dd0c58b0ce00 */\ +/* sqr(t[0], t[0]); */ /* 716: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba18b1619c00 */\ +sqr_n_mul(t[0], t[0], 10, t[12]); /* 717: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba18b1619c1b */\ +/* sqr(t[0], t[0]); */ /* 718: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b743162c33836 */\ +/* sqr(t[0], t[0]); */ /* 719: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce36e862c586706c */\ +/* sqr(t[0], t[0]); */ /* 720: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6dd0c58b0ce0d8 */\ +/* sqr(t[0], t[0]); */ /* 721: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba18b1619c1b0 */\ +sqr_n_mul(t[0], t[0], 4, t[7]); /* 722: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba18b1619c1b1 */\ +/* sqr(t[0], t[0]); */ /* 723: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b743162c338362 */\ +/* sqr(t[0], t[0]); */ /* 724: 
5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce36e862c586706c4 */\ +/* sqr(t[0], t[0]); */ /* 725: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6dd0c58b0ce0d88 */\ +/* sqr(t[0], t[0]); */ /* 726: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba18b1619c1b10 */\ +/* sqr(t[0], t[0]); */ /* 727: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b743162c3383620 */\ +/* sqr(t[0], t[0]); */ /* 728: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce36e862c586706c40 */\ +/* sqr(t[0], t[0]); */ /* 729: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6dd0c58b0ce0d880 */\ +/* sqr(t[0], t[0]); */ /* 730: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba18b1619c1b100 */\ +/* sqr(t[0], t[0]); */ /* 731: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b743162c33836200 */\ +sqr_n_mul(t[0], t[0], 9, t[9]); /* 732: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b743162c33836211 */\ +/* sqr(t[0], t[0]); */ /* 733: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce36e862c586706c422 */\ +/* sqr(t[0], t[0]); */ /* 734: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6dd0c58b0ce0d8844 */\ +/* sqr(t[0], t[0]); */ /* 735: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba18b1619c1b1088 */\ +/* sqr(t[0], t[0]); */ /* 736: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b743162c338362110 */\ +/* sqr(t[0], t[0]); */ /* 737: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce36e862c586706c4220 */\ +/* sqr(t[0], t[0]); */ /* 738: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6dd0c58b0ce0d88440 */\ +sqr_n_mul(t[0], t[0], 6, t[10]); /* 739: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6dd0c58b0ce0d8844f */\ +/* sqr(t[0], t[0]); */ /* 740: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba18b1619c1b1089e */\ +/* sqr(t[0], t[0]); */ /* 741: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b743162c338362113c */\ +/* sqr(t[0], t[0]); */ /* 742: 
5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce36e862c586706c42278 */\ +/* sqr(t[0], t[0]); */ /* 743: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6dd0c58b0ce0d8844f0 */\ +/* sqr(t[0], t[0]); */ /* 744: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba18b1619c1b1089e0 */\ +/* sqr(t[0], t[0]); */ /* 745: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b743162c338362113c0 */\ +/* sqr(t[0], t[0]); */ /* 746: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce36e862c586706c422780 */\ +sqr_n_mul(t[0], t[0], 7, t[11]); /* 747: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce36e862c586706c42279f */\ +/* sqr(t[0], t[0]); */ /* 748: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6dd0c58b0ce0d8844f3e */\ +/* sqr(t[0], t[0]); */ /* 749: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba18b1619c1b1089e7c */\ +/* sqr(t[0], t[0]); */ /* 750: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b743162c338362113cf8 */\ +/* sqr(t[0], t[0]); */ /* 751: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce36e862c586706c42279f0 */\ +/* sqr(t[0], t[0]); */ /* 752: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6dd0c58b0ce0d8844f3e0 */\ +sqr_n_mul(t[0], t[0], 5, t[3]); /* 753: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6dd0c58b0ce0d8844f3f5 */\ +/* sqr(t[0], t[0]); */ /* 754: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba18b1619c1b1089e7ea */\ +/* sqr(t[0], t[0]); */ /* 755: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b743162c338362113cfd4 */\ +/* sqr(t[0], t[0]); */ /* 756: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce36e862c586706c42279fa8 */\ +/* sqr(t[0], t[0]); */ /* 757: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6dd0c58b0ce0d8844f3f50 */\ +sqr_n_mul(t[0], t[0], 4, t[10]); /* 758: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6dd0c58b0ce0d8844f3f5f */\ +/* sqr(t[0], t[0]); */ /* 759: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba18b1619c1b1089e7ebe */\ +/* sqr(t[0], t[0]); */ /* 760: 
2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b743162c338362113cfd7c */\ +/* sqr(t[0], t[0]); */ /* 761: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce36e862c586706c42279faf8 */\ +/* sqr(t[0], t[0]); */ /* 762: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6dd0c58b0ce0d8844f3f5f0 */\ +/* sqr(t[0], t[0]); */ /* 763: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba18b1619c1b1089e7ebe0 */\ +/* sqr(t[0], t[0]); */ /* 764: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b743162c338362113cfd7c0 */\ +/* sqr(t[0], t[0]); */ /* 765: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce36e862c586706c42279faf80 */\ +sqr_n_mul(t[0], t[0], 7, t[8]); /* 766: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce36e862c586706c42279faf9d */\ +/* sqr(t[0], t[0]); */ /* 767: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6dd0c58b0ce0d8844f3f5f3a */\ +/* sqr(t[0], t[0]); */ /* 768: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba18b1619c1b1089e7ebe74 */\ +/* sqr(t[0], t[0]); */ /* 769: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b743162c338362113cfd7ce8 */\ +/* sqr(t[0], t[0]); */ /* 770: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce36e862c586706c42279faf9d0 */\ +/* sqr(t[0], t[0]); */ /* 771: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6dd0c58b0ce0d8844f3f5f3a0 */\ +sqr_n_mul(t[0], t[0], 5, t[3]); /* 772: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6dd0c58b0ce0d8844f3f5f3b5 */\ +/* sqr(t[0], t[0]); */ /* 773: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba18b1619c1b1089e7ebe76a */\ +/* sqr(t[0], t[0]); */ /* 774: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b743162c338362113cfd7ced4 */\ +/* sqr(t[0], t[0]); */ /* 775: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce36e862c586706c42279faf9da8 */\ +/* sqr(t[0], t[0]); */ /* 776: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6dd0c58b0ce0d8844f3f5f3b50 */\ +/* sqr(t[0], t[0]); */ /* 777: 
1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba18b1619c1b1089e7ebe76a0 */\ +sqr_n_mul(t[0], t[0], 5, t[3]); /* 778: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba18b1619c1b1089e7ebe76b5 */\ +/* sqr(t[0], t[0]); */ /* 779: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b743162c338362113cfd7ced6a */\ +/* sqr(t[0], t[0]); */ /* 780: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce36e862c586706c42279faf9dad4 */\ +/* sqr(t[0], t[0]); */ /* 781: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6dd0c58b0ce0d8844f3f5f3b5a8 */\ +/* sqr(t[0], t[0]); */ /* 782: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba18b1619c1b1089e7ebe76b50 */\ +/* sqr(t[0], t[0]); */ /* 783: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b743162c338362113cfd7ced6a0 */\ +sqr_n_mul(t[0], t[0], 5, t[9]); /* 784: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b743162c338362113cfd7ced6b1 */\ +/* sqr(t[0], t[0]); */ /* 785: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce36e862c586706c42279faf9dad62 */\ +/* sqr(t[0], t[0]); */ /* 786: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6dd0c58b0ce0d8844f3f5f3b5ac4 */\ +/* sqr(t[0], t[0]); */ /* 787: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba18b1619c1b1089e7ebe76b588 */\ +/* sqr(t[0], t[0]); */ /* 788: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b743162c338362113cfd7ced6b10 */\ +sqr_n_mul(t[0], t[0], 4, t[4]); /* 789: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b743162c338362113cfd7ced6b1d */\ +/* sqr(t[0], t[0]); */ /* 790: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce36e862c586706c42279faf9dad63a */\ +/* sqr(t[0], t[0]); */ /* 791: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6dd0c58b0ce0d8844f3f5f3b5ac74 */\ +/* sqr(t[0], t[0]); */ /* 792: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba18b1619c1b1089e7ebe76b58e8 */\ +/* sqr(t[0], t[0]); */ /* 793: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b743162c338362113cfd7ced6b1d0 */\ +/* sqr(t[0], t[0]); */ /* 794: 
5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce36e862c586706c42279faf9dad63a0 */\ +/* sqr(t[0], t[0]); */ /* 795: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6dd0c58b0ce0d8844f3f5f3b5ac740 */\ +sqr_n_mul(t[0], t[0], 6, t[8]); /* 796: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6dd0c58b0ce0d8844f3f5f3b5ac75d */\ +/* sqr(t[0], t[0]); */ /* 797: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba18b1619c1b1089e7ebe76b58eba */\ +sqr_n_mul(t[0], t[0], 1, t[7]); /* 798: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba18b1619c1b1089e7ebe76b58ebb */\ +/* sqr(t[0], t[0]); */ /* 799: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b743162c338362113cfd7ced6b1d76 */\ +/* sqr(t[0], t[0]); */ /* 800: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce36e862c586706c42279faf9dad63aec */\ +/* sqr(t[0], t[0]); */ /* 801: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6dd0c58b0ce0d8844f3f5f3b5ac75d8 */\ +/* sqr(t[0], t[0]); */ /* 802: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba18b1619c1b1089e7ebe76b58ebb0 */\ +/* sqr(t[0], t[0]); */ /* 803: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b743162c338362113cfd7ced6b1d760 */\ +/* sqr(t[0], t[0]); */ /* 804: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce36e862c586706c42279faf9dad63aec0 */\ +sqr_n_mul(t[0], t[0], 6, t[2]); /* 805: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce36e862c586706c42279faf9dad63aec7 */\ +/* sqr(t[0], t[0]); */ /* 806: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6dd0c58b0ce0d8844f3f5f3b5ac75d8e */\ +/* sqr(t[0], t[0]); */ /* 807: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba18b1619c1b1089e7ebe76b58ebb1c */\ +/* sqr(t[0], t[0]); */ /* 808: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b743162c338362113cfd7ced6b1d7638 */\ +/* sqr(t[0], t[0]); */ /* 809: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce36e862c586706c42279faf9dad63aec70 */\ +/* sqr(t[0], t[0]); */ /* 810: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6dd0c58b0ce0d8844f3f5f3b5ac75d8e0 */\ +/* sqr(t[0], t[0]); */ /* 811: 
1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba18b1619c1b1089e7ebe76b58ebb1c0 */\ +/* sqr(t[0], t[0]); */ /* 812: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b743162c338362113cfd7ced6b1d76380 */\ +/* sqr(t[0], t[0]); */ /* 813: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce36e862c586706c42279faf9dad63aec700 */\ +/* sqr(t[0], t[0]); */ /* 814: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6dd0c58b0ce0d8844f3f5f3b5ac75d8e00 */\ +/* sqr(t[0], t[0]); */ /* 815: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba18b1619c1b1089e7ebe76b58ebb1c00 */\ +sqr_n_mul(t[0], t[0], 10, t[6]); /* 816: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba18b1619c1b1089e7ebe76b58ebb1c17 */\ +/* sqr(t[0], t[0]); */ /* 817: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b743162c338362113cfd7ced6b1d76382e */\ +/* sqr(t[0], t[0]); */ /* 818: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce36e862c586706c42279faf9dad63aec705c */\ +/* sqr(t[0], t[0]); */ /* 819: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6dd0c58b0ce0d8844f3f5f3b5ac75d8e0b8 */\ +/* sqr(t[0], t[0]); */ /* 820: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba18b1619c1b1089e7ebe76b58ebb1c170 */\ +/* sqr(t[0], t[0]); */ /* 821: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b743162c338362113cfd7ced6b1d76382e0 */\ +/* sqr(t[0], t[0]); */ /* 822: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce36e862c586706c42279faf9dad63aec705c0 */\ +sqr_n_mul(t[0], t[0], 6, t[3]); /* 823: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce36e862c586706c42279faf9dad63aec705d5 */\ +/* sqr(t[0], t[0]); */ /* 824: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6dd0c58b0ce0d8844f3f5f3b5ac75d8e0baa */\ +/* sqr(t[0], t[0]); */ /* 825: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba18b1619c1b1089e7ebe76b58ebb1c1754 */\ +/* sqr(t[0], t[0]); */ /* 826: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b743162c338362113cfd7ced6b1d76382ea8 */\ +/* sqr(t[0], t[0]); */ /* 827: 
5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce36e862c586706c42279faf9dad63aec705d50 */\ +/* sqr(t[0], t[0]); */ /* 828: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6dd0c58b0ce0d8844f3f5f3b5ac75d8e0baa0 */\ +/* sqr(t[0], t[0]); */ /* 829: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba18b1619c1b1089e7ebe76b58ebb1c17540 */\ +sqr_n_mul(t[0], t[0], 6, t[5]); /* 830: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba18b1619c1b1089e7ebe76b58ebb1c17559 */\ +/* sqr(t[0], t[0]); */ /* 831: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b743162c338362113cfd7ced6b1d76382eab2 */\ +/* sqr(t[0], t[0]); */ /* 832: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce36e862c586706c42279faf9dad63aec705d564 */\ +/* sqr(t[0], t[0]); */ /* 833: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6dd0c58b0ce0d8844f3f5f3b5ac75d8e0baac8 */\ +/* sqr(t[0], t[0]); */ /* 834: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba18b1619c1b1089e7ebe76b58ebb1c175590 */\ +/* sqr(t[0], t[0]); */ /* 835: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b743162c338362113cfd7ced6b1d76382eab20 */\ +/* sqr(t[0], t[0]); */ /* 836: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce36e862c586706c42279faf9dad63aec705d5640 */\ +sqr_n_mul(t[0], t[0], 6, t[4]); /* 837: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce36e862c586706c42279faf9dad63aec705d564d */\ +/* sqr(t[0], t[0]); */ /* 838: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6dd0c58b0ce0d8844f3f5f3b5ac75d8e0baac9a */\ +/* sqr(t[0], t[0]); */ /* 839: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba18b1619c1b1089e7ebe76b58ebb1c1755934 */\ +/* sqr(t[0], t[0]); */ /* 840: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b743162c338362113cfd7ced6b1d76382eab268 */\ +/* sqr(t[0], t[0]); */ /* 841: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce36e862c586706c42279faf9dad63aec705d564d0 */\ +/* sqr(t[0], t[0]); */ /* 842: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6dd0c58b0ce0d8844f3f5f3b5ac75d8e0baac9a0 */\ +/* sqr(t[0], t[0]); */ /* 843: 
1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba18b1619c1b1089e7ebe76b58ebb1c17559340 */\ +sqr_n_mul(t[0], t[0], 6, t[3]); /* 844: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba18b1619c1b1089e7ebe76b58ebb1c17559355 */\ +/* sqr(t[0], t[0]); */ /* 845: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b743162c338362113cfd7ced6b1d76382eab26aa */\ +/* sqr(t[0], t[0]); */ /* 846: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce36e862c586706c42279faf9dad63aec705d564d54 */\ +/* sqr(t[0], t[0]); */ /* 847: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6dd0c58b0ce0d8844f3f5f3b5ac75d8e0baac9aa8 */\ +/* sqr(t[0], t[0]); */ /* 848: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba18b1619c1b1089e7ebe76b58ebb1c175593550 */\ +/* sqr(t[0], t[0]); */ /* 849: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b743162c338362113cfd7ced6b1d76382eab26aa0 */\ +/* sqr(t[0], t[0]); */ /* 850: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce36e862c586706c42279faf9dad63aec705d564d540 */\ +/* sqr(t[0], t[0]); */ /* 851: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6dd0c58b0ce0d8844f3f5f3b5ac75d8e0baac9aa80 */\ +/* sqr(t[0], t[0]); */ /* 852: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba18b1619c1b1089e7ebe76b58ebb1c1755935500 */\ +/* sqr(t[0], t[0]); */ /* 853: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b743162c338362113cfd7ced6b1d76382eab26aa00 */\ +/* sqr(t[0], t[0]); */ /* 854: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce36e862c586706c42279faf9dad63aec705d564d5400 */\ +/* sqr(t[0], t[0]); */ /* 855: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6dd0c58b0ce0d8844f3f5f3b5ac75d8e0baac9aa800 */\ +/* sqr(t[0], t[0]); */ /* 856: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba18b1619c1b1089e7ebe76b58ebb1c17559355000 */\ +/* sqr(t[0], t[0]); */ /* 857: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b743162c338362113cfd7ced6b1d76382eab26aa000 */\ +/* sqr(t[0], t[0]); */ /* 858: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce36e862c586706c42279faf9dad63aec705d564d54000 */\ +/* sqr(t[0], t[0]); */ /* 859: 
a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6dd0c58b0ce0d8844f3f5f3b5ac75d8e0baac9aa8000 */\ +/* sqr(t[0], t[0]); */ /* 860: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba18b1619c1b1089e7ebe76b58ebb1c175593550000 */\ +/* sqr(t[0], t[0]); */ /* 861: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b743162c338362113cfd7ced6b1d76382eab26aa0000 */\ +/* sqr(t[0], t[0]); */ /* 862: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce36e862c586706c42279faf9dad63aec705d564d540000 */\ +/* sqr(t[0], t[0]); */ /* 863: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6dd0c58b0ce0d8844f3f5f3b5ac75d8e0baac9aa80000 */\ +/* sqr(t[0], t[0]); */ /* 864: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba18b1619c1b1089e7ebe76b58ebb1c1755935500000 */\ +/* sqr(t[0], t[0]); */ /* 865: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b743162c338362113cfd7ced6b1d76382eab26aa00000 */\ +/* sqr(t[0], t[0]); */ /* 866: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce36e862c586706c42279faf9dad63aec705d564d5400000 */\ +/* sqr(t[0], t[0]); */ /* 867: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6dd0c58b0ce0d8844f3f5f3b5ac75d8e0baac9aa800000 */\ +sqr_n_mul(t[0], t[0], 23, t[2]); /* 868: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6dd0c58b0ce0d8844f3f5f3b5ac75d8e0baac9aa800007 */\ +/* sqr(t[0], t[0]); */ /* 869: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba18b1619c1b1089e7ebe76b58ebb1c1755935500000e */\ +/* sqr(t[0], t[0]); */ /* 870: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b743162c338362113cfd7ced6b1d76382eab26aa00001c */\ +/* sqr(t[0], t[0]); */ /* 871: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce36e862c586706c42279faf9dad63aec705d564d54000038 */\ +/* sqr(t[0], t[0]); */ /* 872: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6dd0c58b0ce0d8844f3f5f3b5ac75d8e0baac9aa8000070 */\ +/* sqr(t[0], t[0]); */ /* 873: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba18b1619c1b1089e7ebe76b58ebb1c1755935500000e0 */\ +/* sqr(t[0], t[0]); */ /* 874: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b743162c338362113cfd7ced6b1d76382eab26aa00001c0 */\ +sqr_n_mul(t[0], t[0], 6, t[2]); /* 
875: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b743162c338362113cfd7ced6b1d76382eab26aa00001c7 */\ +/* sqr(t[0], t[0]); */ /* 876: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce36e862c586706c42279faf9dad63aec705d564d54000038e */\ +/* sqr(t[0], t[0]); */ /* 877: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6dd0c58b0ce0d8844f3f5f3b5ac75d8e0baac9aa8000071c */\ +/* sqr(t[0], t[0]); */ /* 878: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba18b1619c1b1089e7ebe76b58ebb1c1755935500000e38 */\ +/* sqr(t[0], t[0]); */ /* 879: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b743162c338362113cfd7ced6b1d76382eab26aa00001c70 */\ +/* sqr(t[0], t[0]); */ /* 880: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce36e862c586706c42279faf9dad63aec705d564d54000038e0 */\ +sqr_n_mul(t[0], t[0], 5, t[1]); /* 881: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce36e862c586706c42279faf9dad63aec705d564d54000038e3 */\ +/* sqr(t[0], t[0]); */ /* 882: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6dd0c58b0ce0d8844f3f5f3b5ac75d8e0baac9aa8000071c6 */\ +/* sqr(t[0], t[0]); */ /* 883: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba18b1619c1b1089e7ebe76b58ebb1c1755935500000e38c */\ +/* sqr(t[0], t[0]); */ /* 884: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b743162c338362113cfd7ced6b1d76382eab26aa00001c718 */\ +/* sqr(t[0], t[0]); */ /* 885: 5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce36e862c586706c42279faf9dad63aec705d564d54000038e30 */\ +/* sqr(t[0], t[0]); */ /* 886: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6dd0c58b0ce0d8844f3f5f3b5ac75d8e0baac9aa8000071c60 */\ +/* sqr(t[0], t[0]); */ /* 887: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba18b1619c1b1089e7ebe76b58ebb1c1755935500000e38c0 */\ +sqr_n_mul(t[0], t[0], 6, t[2]); /* 888: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba18b1619c1b1089e7ebe76b58ebb1c1755935500000e38c7 */\ +/* sqr(t[0], t[0]); */ /* 889: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b743162c338362113cfd7ced6b1d76382eab26aa00001c718e */\ +/* sqr(t[0], t[0]); */ /* 890: 
5486f497186bf8e97a4f1d5445e4bd3c5b921ca1ce08d68cdcb3c92693d17a0a14c59fa2dbb94ddea62926612f1de023ad0c3390c30b8f6525d0b50e1234092cd7f23da7ce36e862c586706c42279faf9dad63aec705d564d54000038e31c */\ +/* sqr(t[0], t[0]); */ /* 891: a90de92e30d7f1d2f49e3aa88bc97a78b72439439c11ad19b967924d27a2f414298b3f45b7729bbd4c524cc25e3bc0475a18672186171eca4ba16a1c24681259afe47b4f9c6dd0c58b0ce0d8844f3f5f3b5ac75d8e0baac9aa8000071c638 */\ +/* sqr(t[0], t[0]); */ /* 892: 1521bd25c61afe3a5e93c75511792f4f16e48728738235a3372cf249a4f45e82853167e8b6ee5377a98a49984bc77808eb430ce430c2e3d949742d43848d024b35fc8f69f38dba18b1619c1b1089e7ebe76b58ebb1c1755935500000e38c70 */\ +/* sqr(t[0], t[0]); */ /* 893: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b743162c338362113cfd7ced6b1d76382eab26aa00001c718e0 */\ +sqr_n_mul(out, t[0], 5, t[1]); /* 894: 2a437a4b8c35fc74bd278eaa22f25e9e2dc90e50e7046b466e59e49349e8bd050a62cfd16ddca6ef53149330978ef011d68619c86185c7b292e85a87091a04966bf91ed3e71b743162c338362113cfd7ced6b1d76382eab26aa00001c718e3 */\ +} while (0) diff --git a/src/vect.c b/src/vect.c new file mode 100644 index 00000000..53d8736e --- /dev/null +++ b/src/vect.c @@ -0,0 +1,128 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "vect.h" + +/* + * Following are some reference C implementations to assist new + * assembly modules development, as starting-point stand-ins and for + * cross-checking. In order to "polyfil" specific subroutine redefine + * it on compiler command line, e.g. -Dmul_mont_384x=_mul_mont_384x. + */ + +#ifdef lshift_mod_384 +void lshift_mod_384(vec384x ret, const vec384x a, size_t n, const vec384 p) +{ + while(n--) + add_mod_384(ret, a, a, mod), a = ret; +} +#endif + +#ifdef mul_by_8_mod_384 +void mul_by_8_mod_384(vec384 ret, const vec384 a, const vec384 mod) +{ lshift_mod_384(ret, a, 3, mod); } +#endif + +#ifdef mul_by_3_mod_384 +void mul_by_3_mod_384(vec384 ret, const vec384 a, const vec384 mod) +{ + vec384 t; + + add_mod_384(t, a, a, mod); + add_mod_384(ret, t, a, mod); +} +#endif + +#ifdef mul_by_3_mod_384x +void mul_by_3_mod_384x(vec384x ret, const vec384x a, const vec384 mod) +{ + mul_by_3_mod_384(ret[0], a[0], mod); + mul_by_3_mod_384(ret[1], a[1], mod); +} +#endif + +#ifdef mul_by_8_mod_384x +void mul_by_8_mod_384x(vec384 ret, const vec384 a, const vec384 mod) +{ + mul_by_8_mod_384(ret[0], a[0], mod); + mul_by_8_mod_384(ret[1], a[1], mod); +} +#endif + +#ifdef mul_by_1_plus_i_mod_384x +void mul_by_1_plus_i_mod_384x(vec384x ret, const vec384x a, const vec384 mod) +{ + vec384 t; + + add_mod_384(t, a[0], a[1], mod); + sub_mod_384(ret[0], a[0], a[1], mod); + vec_copy(ret[1], t, sizeof(t)); +} +#endif + +#ifdef add_mod_384x +void add_mod_384x(vec384x ret, const vec384x a, const vec384x b, + const vec384 mod) +{ + add_mod_384(ret[0], a[0], b[0], mod); + add_mod_384(ret[1], a[1], b[1], mod); +} +#endif + +#ifdef sub_mod_384x +void sub_mod_384x(vec384x ret, const vec384x a, const vec384x b, + const vec384 mod) +{ + sub_mod_384(ret[0], a[0], b[0], mod); + sub_mod_384(ret[1], a[1], b[1], mod); +} +#endif + +#ifdef lshift_mod_384x +void lshift_mod_384x(vec384x ret, const vec384x a, size_t n, const vec384 p) +{ + lshift_mod_384(ret[0], a[0], n, p); + lshift_mod_384(ret[1], a[1], n, p); +} +#endif + +#if defined(mul_mont_384x) && !defined(__ADX__) +void mul_mont_384x(vec384x ret, const vec384x a, const 
vec384x b, + const vec384 mod, limb_t n0) +{ + vec768 t0, t1, t2; + vec384 aa, bb; + + mul_384(t0, a[0], b[0]); + mul_384(t1, a[1], b[1]); + + add_mod_384(aa, a[0], a[1], mod); + add_mod_384(bb, b[0], b[1], mod); + mul_384(t2, aa, bb); + sub_mod_384x384(t2, t2, t0, mod); + sub_mod_384x384(t2, t2, t1, mod); + + sub_mod_384x384(t0, t0, t1, mod); + + redc_mont_384(ret[0], t0, mod, n0); + redc_mont_384(ret[1], t2, mod, n0); +} +#endif + +#if defined(sqr_mont_384x) && !defined(__ADX__) +void sqr_mont_384x(vec384x ret, const vec384x a, const vec384 mod, limb_t n0) +{ + vec384 t0, t1; + + add_mod_384(t0, a[0], a[1], mod); + sub_mod_384(t1, a[0], a[1], mod); + + mul_mont_384(ret[1], a[0], a[1], mod, n0); + add_mod_384(ret[1], ret[1], ret[1], mod); + + mul_mont_384(ret[0], t0, t1, mod, n0); +} +#endif diff --git a/src/vect.h b/src/vect.h new file mode 100644 index 00000000..65a785c4 --- /dev/null +++ b/src/vect.h @@ -0,0 +1,371 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLS12_381_ASM_VECT_H__ +#define __BLS12_381_ASM_VECT_H__ + +#include + +#if defined(__x86_64__) || defined(__aarch64__) || defined(__mips64) || \ + defined(__ia64) || (defined(__VMS) && !defined(__vax)) +/* These are available even in ILP32 flavours, but even then they are + * capable of performing 64-bit operations as efficiently as in *P64. */ +typedef unsigned long long limb_t; +# define LIMB_T_BITS 64 + +#elif defined(_WIN64) /* Win64 is P64 */ +typedef unsigned __int64 limb_t; +# define LIMB_T_BITS 64 + +# else /* 32 bits on 32-bit platforms, 64 - on 64-bit */ +typedef unsigned long limb_t; +# ifdef _LP64 +# define LIMB_T_BITS 64 +# else +# define LIMB_T_BITS 32 +# endif +#endif + +/* + * Why isn't LIMB_T_BITS defined as 8*sizeof(limb_t)? Because pre-processor + * knows nothing about sizeof(anything)... + */ +#if LIMB_T_BITS == 64 +# define TO_LIMB_T(limb64) limb64 +#else +# define TO_LIMB_T(limb64) (limb_t)limb64,(limb_t)(limb64>>32) +#endif + +#define NLIMBS(bits) (bits/LIMB_T_BITS) + +typedef limb_t vec256[NLIMBS(256)]; +typedef limb_t vec512[NLIMBS(512)]; +typedef limb_t vec384[NLIMBS(384)]; +typedef limb_t vec768[NLIMBS(768)]; +typedef vec384 vec384x[2]; /* 0 is "real" part, 1 is "imaginary" */ + +/* + * Assembly subroutines... + */ +#ifdef __ADX__ /* e.g. 
-march=broadwell */ +# define mul_mont_sparse_256 mulx_mont_sparse_256 +# define sqr_mont_sparse_256 sqrx_mont_sparse_256 +# define from_mont_256 fromx_mont_256 +# define redc_mont_256 redcx_mont_256 +# define mul_mont_384 mulx_mont_384 +# define sqr_mont_384 sqrx_mont_384 +# define sqr_n_mul_mont_384 sqrx_n_mul_mont_384 +# define sqr_n_mul_mont_383 sqrx_n_mul_mont_383 +# define mul_384 mulx_384 +# define sqr_384 sqrx_384 +# define redc_mont_384 redcx_mont_384 +# define from_mont_384 fromx_mont_384 +# define sgn0_pty_mont_384 sgn0x_pty_mont_384 +# define sgn0_pty_mont_384x sgn0x_pty_mont_384x +#endif + +void mul_mont_sparse_256(vec256 ret, const vec256 a, const vec256 b, + const vec256 p, limb_t n0); +void sqr_mont_sparse_256(vec256 ret, const vec256 a, const vec256 p, limb_t n0); +void redc_mont_256(vec256 ret, const vec512 a, const vec256 p, limb_t n0); +void from_mont_256(vec256 ret, const vec256 a, const vec256 p, limb_t n0); + +void add_mod_256(vec256 ret, const vec256 a, const vec256 b, const vec256 p); +void sub_mod_256(vec256 ret, const vec256 a, const vec256 b, const vec256 p); +void mul_by_3_mod_256(vec256 ret, const vec256 a, const vec256 p); +void cneg_mod_256(vec256 ret, const vec256 a, limb_t flag, const vec256 p); +void lshift_mod_256(vec256 ret, const vec256 a, size_t count, const vec256 p); +void rshift_mod_256(vec256 ret, const vec256 a, size_t count, const vec256 p); + +void mul_mont_384(vec384 ret, const vec384 a, const vec384 b, + const vec384 p, limb_t n0); +void sqr_mont_384(vec384 ret, const vec384 a, const vec384 p, limb_t n0); +void sqr_n_mul_mont_384(vec384 ret, const vec384 a, size_t count, + const vec384 p, limb_t n0, const vec384 b); +void sqr_n_mul_mont_383(vec384 ret, const vec384 a, size_t count, + const vec384 p, limb_t n0, const vec384 b); + +void mul_384(vec768 ret, const vec384 a, const vec384 b); +void sqr_384(vec768 ret, const vec384 a); +void redc_mont_384(vec384 ret, const vec768 a, const vec384 p, limb_t n0); +void from_mont_384(vec384 ret, const vec384 a, const vec384 p, limb_t n0); +limb_t sgn0_pty_mont_384(const vec384 a, const vec384 p, limb_t n0); +limb_t sgn0_pty_mont_384x(const vec384x a, const vec384 p, limb_t n0); +limb_t sgn0_pty_mod_384(const vec384 a, const vec384 p); +limb_t sgn0_pty_mod_384x(const vec384x a, const vec384 p); + +void add_mod_384(vec384 ret, const vec384 a, const vec384 b, const vec384 p); +void sub_mod_384(vec384 ret, const vec384 a, const vec384 b, const vec384 p); +void mul_by_8_mod_384(vec384 ret, const vec384 a, const vec384 p); +void mul_by_3_mod_384(vec384 ret, const vec384 a, const vec384 p); +void cneg_mod_384(vec384 ret, const vec384 a, limb_t flag, const vec384 p); +void lshift_mod_384(vec384 ret, const vec384 a, size_t count, const vec384 p); +limb_t eucl_inverse_mod_384(vec384 ret, const vec384 a, const vec384 p, + const vec384 one); + +#ifdef __ADX__ /* e.g. 
-march=broadwell */ +# define mul_mont_384x mulx_mont_384x +# define sqr_mont_384x sqrx_mont_384x +# define sqr_mont_382x sqrx_mont_382x +# define sqr_n_mul_mont_384x sqrx_n_mul_mont_384x +# define mul_382x mulx_382x +# define sqr_382x sqrx_382x +#endif + +void mul_mont_384x(vec384x ret, const vec384x a, const vec384x b, + const vec384 p, limb_t n0); +void sqr_mont_384x(vec384x ret, const vec384x a, const vec384 p, limb_t n0); +void sqr_mont_382x(vec384x ret, const vec384x a, const vec384 p, limb_t n0); +void sqr_n_mul_mont_384x(vec384x ret, const vec384x a, size_t count, + const vec384 p, limb_t n0, const vec384x b); +void mul_382x(vec768 ret[2], const vec384x a, const vec384x b, const vec384 p); +void sqr_382x(vec768 ret[2], const vec384x a, const vec384 p); + +void add_mod_384x(vec384x ret, const vec384x a, const vec384x b, + const vec384 p); +void sub_mod_384x(vec384x ret, const vec384x a, const vec384x b, + const vec384 p); +void mul_by_8_mod_384x(vec384x ret, const vec384x a, const vec384 p); +void mul_by_3_mod_384x(vec384x ret, const vec384x a, const vec384 p); +void mul_by_1_plus_i_mod_384x(vec384x ret, const vec384x a, const vec384 p); +void add_mod_384x384(vec768 ret, const vec768 a, const vec768 b, + const vec384 p); +void sub_mod_384x384(vec768 ret, const vec768 a, const vec768 b, + const vec384 p); + +/* + * C subroutines + */ +static void exp_mont_384(vec384 out, const vec384 inp, const limb_t *pow, + size_t pow_bits, const vec384 p, limb_t n0); +static void reciprocal_fp(vec384 out, const vec384 inp); +static limb_t recip_sqrt_fp(vec384 out, const vec384 inp); +static limb_t sqrt_fp(vec384 out, const vec384 inp); + +static void exp_mont_384x(vec384x out, const vec384x inp, const limb_t *pow, + size_t pow_bits, const vec384 p, limb_t n0); +static void reciprocal_fp2(vec384x out, const vec384x inp); +static limb_t recip_sqrt_fp2(vec384x out, const vec384x inp); +static limb_t sqrt_fp2(vec384x out, const vec384x inp); +static limb_t sqrt_align_fp2(vec384x out, const vec384x ret, + const vec384x sqrt, const vec384x inp); + +#if !defined(restrict) +# if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901 +# if defined(__GNUC__) && __GNUC__>=2 +# define restrict __restrict__ +# elif defined(_MSC_VER) +# define restrict __restrict +# else +# define restrict +# endif +# endif +#endif + +#if !defined(inline) +# if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901 +# if defined(__GNUC__) && __GNUC__>=2 +# define inline __inline__ +# elif defined(_MSC_VER) +# define inline __inline +# else +# define inline +# endif +# endif +#endif + +static inline int is_bit_set(const limb_t *v, size_t i) +{ + return (v[i/LIMB_T_BITS] >> (i%LIMB_T_BITS)) & 1; +} + +static inline void vec_cswap(void *restrict a, void *restrict b, size_t num, + limb_t cbit) +{ + limb_t ai, *ap = (limb_t *)a; + limb_t bi, *bp = (limb_t *)b; + limb_t xorm, mask = 0 - cbit; + size_t i; + + num /= sizeof(limb_t); + + for (i = 0; i < num; i++) { + xorm = ((ai = ap[i]) ^ (bi = bp[i])) & mask; + ap[i] = ai ^ xorm; + bp[i] = bi ^ xorm; + } +} + +/* ret = bit ? 
a : b */ +static inline void vec_select(void *restrict ret, const void *restrict a, + const void *restrict b, + size_t num, limb_t sel_a) +{ + limb_t bi, *rp = (limb_t *)ret; + const limb_t *ap = (const limb_t *)a; + const limb_t *bp = (const limb_t *)b; + limb_t xorm, mask = 0 - sel_a; + size_t i; + + num /= sizeof(limb_t); + + for (i = 0; i < num; i++) { + xorm = (ap[i] ^ (bi = bp[i])) & mask; + rp[i] = bi ^ xorm; + } +} + +static inline limb_t vec_is_zero(const void *a, size_t num) +{ + const limb_t *ap = (const limb_t *)a; + limb_t acc; + size_t i; + + num /= sizeof(limb_t); + + for (acc = 0, i = 0; i < num; i++) + acc |= ap[i]; + + return (~acc & (acc - 1)) >> (LIMB_T_BITS - 1); +} + +static inline limb_t vec_is_equal(const void *a, const void *b, size_t num) +{ + const limb_t *ap = (const limb_t *)a; + const limb_t *bp = (const limb_t *)b; + limb_t acc; + size_t i; + + num /= sizeof(limb_t); + + for (acc = 0, i = 0; i < num; i++) + acc |= ap[i] ^ bp[i]; + + return (~acc & (acc - 1)) >> (LIMB_T_BITS - 1); +} + +static inline void cneg_mod_384x(vec384x ret, const vec384x a, limb_t flag, + const vec384 p) +{ + cneg_mod_384(ret[0], a[0], flag, p); + cneg_mod_384(ret[1], a[1], flag, p); +} + +static inline void vec_copy(void *restrict ret, const void *a, size_t num) +{ + limb_t *rp = (limb_t *)ret; + const limb_t *ap = (const limb_t *)a; + size_t i; + + num /= sizeof(limb_t); + + for (i = 0; i < num; i++) + rp[i] = ap[i]; +} + +static inline void vec_zero(void *ret, size_t num) +{ + limb_t *rp = (limb_t *)ret; + size_t i; + + num /= sizeof(limb_t); + + for (i = 0; i < num; i++) + rp[i] = 0; +} + +static inline void limbs_from_be_bytes(limb_t *restrict ret, + const unsigned char *in, size_t n) +{ + limb_t limb = 0; + + while(n--) { + limb <<= 8; + limb |= *in++; + /* + * 'if (n % sizeof(limb_t) == 0)' is omitted because it's cheaper + * to perform redundant stores than to pay penalty for + * mispredicted branch. Besides, some compilers unroll the + * loop and remove redundant stores to 'restict'-ed storage... + */ + ret[n / sizeof(limb_t)] = limb; + } +} + +static inline void be_bytes_from_limbs(unsigned char *out, const limb_t *in, + size_t n) +{ + limb_t limb; + + while(n--) { + limb = in[n / sizeof(limb_t)]; + *out++ = (unsigned char)(limb >> (8 * (n % sizeof(limb_t)))); + } +} + +static inline void limbs_from_le_bytes(limb_t *restrict ret, + const unsigned char *in, size_t n) +{ + limb_t limb = 0; + + while(n--) { + limb <<= 8; + limb |= in[n]; + /* + * 'if (n % sizeof(limb_t) == 0)' is omitted because it's cheaper + * to perform redundant stores than to pay penalty for + * mispredicted branch. Besides, some compilers unroll the + * loop and remove redundant stores to 'restict'-ed storage... + */ + ret[n / sizeof(limb_t)] = limb; + } +} + +static inline void le_bytes_from_limbs(unsigned char *out, const limb_t *in, + size_t n) +{ + limb_t limb; + size_t i; + + for(i = 0; i < n; i++) { + limb = in[i / sizeof(limb_t)]; + *out++ = (unsigned char)(limb >> (8 * (i % sizeof(limb_t)))); + } +} + +/* + * Some compilers get arguably overzealous(*) when passing pointer to + * multi-dimensional array [such as vec384x] as 'const' argument. + * General direction seems to be to legitimize such constification, + * so it's argued that suppressing the warning is appropriate. 
+ * + * (*) http://www.open-std.org/jtc1/sc22/wg14/www/docs/n1923.htm + */ +#if defined(__INTEL_COMPILER) +# pragma warning(disable:167) +# pragma warning(disable:556) +#elif defined(__GNUC__) && !defined(__clang__) +# pragma GCC diagnostic ignored "-Wpedantic" +# if __GNUC__>=9 +# pragma GCC diagnostic ignored "-Wrestrict" +# endif +#endif + +#include <stdlib.h> + +#if defined(__GNUC__) +# ifndef alloca +# define alloca(s) __builtin_alloca(s) +# endif +#elif defined(__sun) +# include <alloca.h> +#elif defined(_WIN32) +# include <malloc.h> +# ifndef alloca +# define alloca(s) _alloca(s) +# endif +#endif + +#endif /* __BLS12_381_ASM_VECT_H__ */
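
The vect.h helpers above (vec_is_zero, vec_is_equal, vec_select, vec_cswap) all rest on the same branch-free masking idiom: OR the limbs into an accumulator and turn "accumulator == 0" into a 0/1 result with (~acc & (acc - 1)) >> (LIMB_T_BITS - 1), or stretch a 0/1 selector into an all-ones/all-zeros mask with 0 - sel. The short standalone program below is only an illustrative sketch of that idiom: it does not include vect.h, it hard-codes a 64-bit limb, and the demo_is_zero/demo_select names are invented for the example rather than taken from the library.

/* Standalone sketch of the branch-free mask idiom used by vect.h's
 * vec_is_zero() and vec_select(); assumes a 64-bit limb for simplicity. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef uint64_t limb_t;
#define LIMB_T_BITS 64
#define NLIMBS(bits) ((bits)/LIMB_T_BITS)

typedef limb_t vec256[NLIMBS(256)];

/* Returns 1 if all limbs are zero, 0 otherwise, without branching on the
 * data: acc - 1 carries into the top bit only when acc == 0. */
static limb_t demo_is_zero(const limb_t *a, size_t num)
{
    limb_t acc = 0;
    size_t i;

    for (i = 0; i < num; i++)
        acc |= a[i];

    return (~acc & (acc - 1)) >> (LIMB_T_BITS - 1);
}

/* ret = sel_a ? a : b, with sel_a in {0,1}; 0 - sel_a expands the selector
 * into an all-ones or all-zeros mask, so both inputs are always read. */
static void demo_select(limb_t *ret, const limb_t *a, const limb_t *b,
                        size_t num, limb_t sel_a)
{
    limb_t mask = 0 - sel_a;
    size_t i;

    for (i = 0; i < num; i++)
        ret[i] = b[i] ^ ((a[i] ^ b[i]) & mask);
}

int main(void)
{
    vec256 zero = {0}, one = {1, 0, 0, 0}, out;

    printf("is_zero(zero) = %llu\n", (unsigned long long)demo_is_zero(zero, 4));
    printf("is_zero(one)  = %llu\n", (unsigned long long)demo_is_zero(one, 4));

    demo_select(out, one, zero, 4, 1);   /* picks 'one' */
    printf("select(1) picked one:  %d\n", memcmp(out, one, sizeof(out)) == 0);

    demo_select(out, one, zero, 4, 0);   /* picks 'zero' */
    printf("select(0) picked zero: %d\n", memcmp(out, zero, sizeof(out)) == 0);

    return 0;
}

The point of doing selection and comparison this way is that no branch or memory access depends on the data being compared, so the helpers can be applied to secret values (keys, field elements) without leaking information through timing.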