@@ -224,6 +224,28 @@ static int secp256k1_scalar_cond_negate(secp256k1_scalar *r, int flag) {
224224 VERIFY_CHECK (c1 >= th ); \
225225}
226226
/** Add 2*a*b to the number defined by (c0,c1,c2). c2 must never overflow. */
#define muladd2(a,b) { \
    uint64_t tl, th, th2, tl2; \
    { \
        uint128_t t = (uint128_t)a * b; \
        th = t >> 64;          /* at most 0xFFFFFFFFFFFFFFFE */ \
        tl = t; \
    } \
    th2 = th + th;             /* at most 0xFFFFFFFFFFFFFFFE (in case th was 0x7FFFFFFFFFFFFFFF) */ \
    c2 += (th2 < th);          /* never overflows by contract (verified the next line) */ \
    VERIFY_CHECK((th2 >= th) || (c2 != 0)); \
    tl2 = tl + tl;             /* at most 0xFFFFFFFFFFFFFFFE (in case the lowest 63 bits of tl were 0x7FFFFFFFFFFFFFFF) */ \
    th2 += (tl2 < tl);         /* at most 0xFFFFFFFFFFFFFFFF */ \
    c0 += tl2;                 /* overflow is handled on the next line */ \
    th2 += (c0 < tl2);         /* second overflow is handled on the next line */ \
    c2 += (c0 < tl2) & (th2 == 0); /* never overflows by contract (verified the next line) */ \
    VERIFY_CHECK((c0 >= tl2) || (th2 != 0) || (c2 != 0)); \
    c1 += th2;                 /* overflow is handled on the next line */ \
    c2 += (c1 < th2);          /* never overflows by contract (verified the next line) */ \
    VERIFY_CHECK((c1 >= th2) || (c2 != 0)); \
}
248+
227249/** Add a to the number defined by (c0,c1,c2). c2 must never overflow. */
228250#define sumadd (a ) { \
229251 unsigned int over; \
@@ -733,10 +755,148 @@ static void secp256k1_scalar_mul_512(uint64_t l[8], const secp256k1_scalar *a, c
733755#endif
734756}
735757
/** Compute l = a^2 as a 512-bit (8 x 64-bit limb) number, little-endian limbs.
 *  On x86_64 this uses a hand-written schoolbook squaring in inline asm
 *  (cross products a_i*a_j with i<j are added twice); otherwise the portable
 *  path uses the 160-bit accumulator macros (c0,c1,c2) defined above. */
static void secp256k1_scalar_sqr_512(uint64_t l[8], const secp256k1_scalar *a) {
#ifdef USE_ASM_X86_64
    __asm__ __volatile__(
    /* Preload */
    "movq 0(%%rdi), %%r11\n"
    "movq 8(%%rdi), %%r12\n"
    "movq 16(%%rdi), %%r13\n"
    "movq 24(%%rdi), %%r14\n"
    /* (rax,rdx) = a0 * a0 */
    "movq %%r11, %%rax\n"
    "mulq %%r11\n"
    /* Extract l0 */
    "movq %%rax, 0(%%rsi)\n"
    /* (r8,r9,r10) = (rdx,0) */
    "movq %%rdx, %%r8\n"
    "xorq %%r9, %%r9\n"
    "xorq %%r10, %%r10\n"
    /* (r8,r9,r10) += 2 * a0 * a1 */
    "movq %%r11, %%rax\n"
    "mulq %%r12\n"
    "addq %%rax, %%r8\n"
    "adcq %%rdx, %%r9\n"
    "adcq $0, %%r10\n"
    "addq %%rax, %%r8\n"
    "adcq %%rdx, %%r9\n"
    "adcq $0, %%r10\n"
    /* Extract l1 */
    "movq %%r8, 8(%%rsi)\n"
    "xorq %%r8, %%r8\n"
    /* (r9,r10,r8) += 2 * a0 * a2 */
    "movq %%r11, %%rax\n"
    "mulq %%r13\n"
    "addq %%rax, %%r9\n"
    "adcq %%rdx, %%r10\n"
    "adcq $0, %%r8\n"
    "addq %%rax, %%r9\n"
    "adcq %%rdx, %%r10\n"
    "adcq $0, %%r8\n"
    /* (r9,r10,r8) += a1 * a1 */
    "movq %%r12, %%rax\n"
    "mulq %%r12\n"
    "addq %%rax, %%r9\n"
    "adcq %%rdx, %%r10\n"
    "adcq $0, %%r8\n"
    /* Extract l2 */
    "movq %%r9, 16(%%rsi)\n"
    "xorq %%r9, %%r9\n"
    /* (r10,r8,r9) += 2 * a0 * a3 */
    "movq %%r11, %%rax\n"
    "mulq %%r14\n"
    "addq %%rax, %%r10\n"
    "adcq %%rdx, %%r8\n"
    "adcq $0, %%r9\n"
    "addq %%rax, %%r10\n"
    "adcq %%rdx, %%r8\n"
    "adcq $0, %%r9\n"
    /* (r10,r8,r9) += 2 * a1 * a2 */
    "movq %%r12, %%rax\n"
    "mulq %%r13\n"
    "addq %%rax, %%r10\n"
    "adcq %%rdx, %%r8\n"
    "adcq $0, %%r9\n"
    "addq %%rax, %%r10\n"
    "adcq %%rdx, %%r8\n"
    "adcq $0, %%r9\n"
    /* Extract l3 */
    "movq %%r10, 24(%%rsi)\n"
    "xorq %%r10, %%r10\n"
    /* (r8,r9,r10) += 2 * a1 * a3 */
    "movq %%r12, %%rax\n"
    "mulq %%r14\n"
    "addq %%rax, %%r8\n"
    "adcq %%rdx, %%r9\n"
    "adcq $0, %%r10\n"
    "addq %%rax, %%r8\n"
    "adcq %%rdx, %%r9\n"
    "adcq $0, %%r10\n"
    /* (r8,r9,r10) += a2 * a2 */
    "movq %%r13, %%rax\n"
    "mulq %%r13\n"
    "addq %%rax, %%r8\n"
    "adcq %%rdx, %%r9\n"
    "adcq $0, %%r10\n"
    /* Extract l4 */
    "movq %%r8, 32(%%rsi)\n"
    "xorq %%r8, %%r8\n"
    /* (r9,r10,r8) += 2 * a2 * a3 */
    "movq %%r13, %%rax\n"
    "mulq %%r14\n"
    "addq %%rax, %%r9\n"
    "adcq %%rdx, %%r10\n"
    "adcq $0, %%r8\n"
    "addq %%rax, %%r9\n"
    "adcq %%rdx, %%r10\n"
    "adcq $0, %%r8\n"
    /* Extract l5 */
    "movq %%r9, 40(%%rsi)\n"
    /* (r10,r8) += a3 * a3 */
    "movq %%r14, %%rax\n"
    "mulq %%r14\n"
    "addq %%rax, %%r10\n"
    "adcq %%rdx, %%r8\n"
    /* Extract l6 */
    "movq %%r10, 48(%%rsi)\n"
    /* Extract l7 */
    "movq %%r8, 56(%%rsi)\n"
    :
    : "S" (l), "D" (a->d)
    : "rax", "rdx", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "cc", "memory");
#else
    /* 160 bit accumulator. */
    uint64_t c0 = 0, c1 = 0;
    uint32_t c2 = 0;

    /* l[0..7] = a[0..3]^2; each off-diagonal cross product is added via
     * muladd2, which doubles it. */
    muladd_fast(a->d[0], a->d[0]);
    extract_fast(l[0]);
    muladd2(a->d[0], a->d[1]);
    extract(l[1]);
    muladd2(a->d[0], a->d[2]);
    muladd(a->d[1], a->d[1]);
    extract(l[2]);
    muladd2(a->d[0], a->d[3]);
    muladd2(a->d[1], a->d[2]);
    extract(l[3]);
    muladd2(a->d[1], a->d[3]);
    muladd(a->d[2], a->d[2]);
    extract(l[4]);
    muladd2(a->d[2], a->d[3]);
    extract(l[5]);
    muladd_fast(a->d[3], a->d[3]);
    extract_fast(l[6]);
    /* After extracting l[6] the accumulator holds only the top limb. */
    VERIFY_CHECK(c1 == 0);
    l[7] = c0;
#endif
}
894+
/* The 160-bit accumulator helper macros are only needed by the wide
 * multiply/square routines above; undefine them to keep them file-local. */
#undef sumadd
#undef sumadd_fast
#undef muladd
#undef muladd_fast
#undef muladd2
#undef extract
#undef extract_fast
742902
@@ -758,6 +918,12 @@ static int secp256k1_scalar_shr_int(secp256k1_scalar *r, int n) {
758918 return ret ;
759919}
760920
921+ static void secp256k1_scalar_sqr (secp256k1_scalar * r , const secp256k1_scalar * a ) {
922+ uint64_t l [8 ];
923+ secp256k1_scalar_sqr_512 (l , a );
924+ secp256k1_scalar_reduce_512 (r , l );
925+ }
926+
761927static void secp256k1_scalar_split_128 (secp256k1_scalar * r1 , secp256k1_scalar * r2 , const secp256k1_scalar * k ) {
762928 r1 -> d [0 ] = k -> d [0 ];
763929 r1 -> d [1 ] = k -> d [1 ];
0 commit comments