diff --git a/benchmarks.csv b/benchmarks.csv index 4bc2981a..5e3ba323 100644 --- a/benchmarks.csv +++ b/benchmarks.csv @@ -52,21 +52,10 @@ cross-sha3-r-sdpg-1-fast (10 executions),ref,290136,287742,297758,29963868,29960 cross-sha3-r-sdpg-1-small (10 executions),ref,290135,287741,297757,102853622,102847774,102861948,75137510,75126803,75159685 cross-sha3-r-sdpg-3-fast (10 executions),ref,627948,625525,637639,43573841,43565461,43582933,27513830,27493024,27525746 cross-sha3-r-sdpg-5-fast (10 executions),ref,1146280,1142409,1153794,93557878,93547167,93566329,59948216,59857434,60043852 -falcon-1024 (10 executions),clean,602066436,377135260,1488065363,136241759,136017549,136556585,1678109,1677732,1678566 -falcon-1024 (10 executions),m4-ct,408725773,314885208,712370124,87706019,87549942,87839508,990541,984448,997160 -falcon-1024 (10 executions),opt-ct,448194494,301446952,784390745,87699336,87550679,87857833,992822,983184,998271 -falcon-1024 (10 executions),opt-leaktime,371539477,261831977,576613448,80134413,79844667,80338608,992815,982774,998600 -falcon-1024-tree (10 executions),opt-ct,469168139,341160847,733947155,39197559,39095597,39392055,995190,984826,998305 -falcon-1024-tree (10 executions),opt-leaktime,418213501,284879287,699555143,42181577,41844047,42456098,991791,983935,997742 -falcon-512 (10 executions),clean,229742458,134930383,358460785,62255726,62124149,62424751,834970,834402,835533 -falcon-512 (10 executions),m4-ct,146357328,106015844,250638532,40191597,40123901,40381630,482280,472137,485160 -falcon-512 (10 executions),opt-ct,168942163,106015882,258726842,40136012,40046972,40195851,481102,472809,485947 -falcon-512 (10 executions),opt-leaktime,130638983,94352160,240934147,37196341,36969717,37564986,476152,471514,484487 -falcon-512-tree (10 executions),m4-ct,187840863,121618909,531189026,18199972,18111179,18297541,479819,472890,485685 -falcon-512-tree (10 executions),opt-ct,179501018,121618960,347996956,18222471,18064774,18329860,479635,472057,484767 
-falcon-512-tree (10 executions),opt-leaktime,203618838,106760540,425495750,20110699,19752157,20375122,480119,472263,485743 -falcon-padded-1024 (10 executions),clean,464867653,351942875,908060882,136157961,135988344,136430038,1677719,1677506,1677932 -falcon-padded-512 (10 executions),clean,241548154,164862595,348699388,62231774,62096573,62365088,834766,834480,834957 +fndsa_provisional-1024 (10 executions),m4f,308608613,195536229,763483542,48321135,48158227,48398718,793856,782604,799182 +fndsa_provisional-1024 (10 executions),ref,274928016,217869128,448029028,107512779,107106716,107788566,1461795,1444739,1469817 +fndsa_provisional-512 (10 executions),m4f,67693338,57825106,81542705,22469685,22280159,22594542,396949,390368,406553 +fndsa_provisional-512 (10 executions),ref,85699591,64822516,132207505,49522949,49325465,49631598,731387,714301,738870 haetae2 (100 executions),m4f,6743278,1555292,25393506,21993963,4721290,86765689,918459,918244,918668 haetae2 (100 executions),ref,9363639,1716264,41895014,31631089,6247382,216853925,1104080,1103874,1104329 haetae3 (100 executions),m4f,12925388,2752846,52240529,30891994,7467529,160522018,1760745,1760408,1761081 @@ -205,21 +194,10 @@ cross-sha3-r-sdpg-1-fast,ref,2328,130928,69560,,,,,, cross-sha3-r-sdpg-1-small,ref,2328,466400,245512,,,,,, cross-sha3-r-sdpg-3-fast,ref,4032,205080,108236,,,,,, cross-sha3-r-sdpg-5-fast,ref,6824,398600,213436,,,,,, -falcon-1024,clean,35076,84604,8776,,,,,, -falcon-1024,m4-ct,1156,2508,376,,,,,, -falcon-1024,opt-ct,1204,2508,376,,,,,, -falcon-1024,opt-leaktime,1252,2580,444,,,,,, -falcon-1024-tree,opt-ct,1148,2884,376,,,,,, -falcon-1024-tree,opt-leaktime,1196,2988,376,,,,,, -falcon-512,clean,18180,43548,4680,,,,,, -falcon-512,m4-ct,1148,2428,376,,,,,, -falcon-512,opt-ct,1244,2428,376,,,,,, -falcon-512,opt-leaktime,1148,2492,376,,,,,, -falcon-512-tree,m4-ct,1172,2636,376,,,,,, -falcon-512-tree,opt-ct,1156,2636,376,,,,,, -falcon-512-tree,opt-leaktime,1196,2828,376,,,,,, 
-falcon-padded-1024,clean,34988,84596,8776,,,,,, -falcon-padded-512,clean,18092,43540,4680,,,,,, +fndsa_provisional-1024,m4f,27772,81992,5024,,,,,, +fndsa_provisional-1024,ref,27676,82276,5308,,,,,, +fndsa_provisional-512,m4f,14348,41952,2976,,,,,, +fndsa_provisional-512,ref,14380,42124,3260,,,,,, haetae2,m4f,19756,55568,23296,,,,,, haetae2,ref,26092,54444,29696,,,,,, haetae3,m4f,29596,83420,31784,,,,,, @@ -359,21 +337,10 @@ cross-sha3-r-sdpg-1-fast,ref,71.8,74.8,77.1,,,,,, cross-sha3-r-sdpg-1-small,ref,71.8,74.7,78.4,,,,,, cross-sha3-r-sdpg-3-fast,ref,71.7,68.2,68.7,,,,,, cross-sha3-r-sdpg-5-fast,ref,71.1,66.1,66.8,,,,,, -falcon-1024,clean,8.9,0.3,23.7,,,,,, -falcon-1024,m4-ct,8.6,0.4,32.2,,,,,, -falcon-1024,opt-ct,9.8,0.4,32.2,,,,,, -falcon-1024,opt-leaktime,10.9,0.5,32.2,,,,,, -falcon-1024-tree,opt-ct,9.2,0.9,32.3,,,,,, -falcon-1024-tree,opt-leaktime,10.6,0.9,32.3,,,,,, -falcon-512,clean,7.9,0.4,26.0,,,,,, -falcon-512,m4-ct,13.7,0.5,33.9,,,,,, -falcon-512,opt-ct,14.0,0.5,33.2,,,,,, -falcon-512,opt-leaktime,17.3,0.5,33.6,,,,,, -falcon-512-tree,m4-ct,12.6,1.1,33.7,,,,,, -falcon-512-tree,opt-ct,14.6,1.1,34.2,,,,,, -falcon-512-tree,opt-leaktime,20.5,1.0,34.3,,,,,, -falcon-padded-1024,clean,7.3,0.3,23.7,,,,,, -falcon-padded-512,clean,16.0,0.4,26.0,,,,,, +fndsa_provisional-1024,m4f,0.0,0.0,0.0,,,,,, +fndsa_provisional-1024,ref,0.0,0.0,0.0,,,,,, +fndsa_provisional-512,m4f,0.0,0.0,0.0,,,,,, +fndsa_provisional-512,ref,0.0,0.0,0.0,,,,,, haetae2,m4f,12.4,56.7,54.1,,,,,, haetae2,ref,10.6,42.4,45.1,,,,,, haetae3,m4f,14.6,56.6,57.1,,,,,, @@ -512,21 +479,10 @@ cross-sha3-r-sdpg-1-fast,ref,18605,0,208,18813,,,,, cross-sha3-r-sdpg-1-small,ref,18846,0,208,19054,,,,, cross-sha3-r-sdpg-3-fast,ref,19689,0,208,19897,,,,, cross-sha3-r-sdpg-5-fast,ref,18593,0,208,18801,,,,, -falcon-1024,clean,82703,0,0,82703,,,,, -falcon-1024,m4-ct,81825,0,79872,161697,,,,, -falcon-1024,opt-ct,81825,0,79872,161697,,,,, -falcon-1024,opt-leaktime,75429,0,79872,155301,,,,, 
-falcon-1024-tree,opt-ct,81569,0,55296,136865,,,,, -falcon-1024-tree,opt-leaktime,75173,0,55296,130469,,,,, -falcon-512,clean,82663,0,0,82663,,,,, -falcon-512,m4-ct,81825,0,39936,121761,,,,, -falcon-512,opt-ct,81825,0,39936,121761,,,,, -falcon-512,opt-leaktime,75429,0,39936,115365,,,,, -falcon-512-tree,m4-ct,81569,0,27648,109217,,,,, -falcon-512-tree,opt-ct,81569,0,27648,109217,,,,, -falcon-512-tree,opt-leaktime,75173,0,27648,102821,,,,, -falcon-padded-1024,clean,82643,0,0,82643,,,,, -falcon-padded-512,clean,82599,0,0,82599,,,,, +fndsa_provisional-1024,m4f,103801,0,0,103801,,,,, +fndsa_provisional-1024,ref,103089,0,0,103089,,,,, +fndsa_provisional-512,m4f,103789,0,0,103789,,,,, +fndsa_provisional-512,ref,103077,0,0,103077,,,,, haetae2,m4f,35708,0,0,35708,,,,, haetae2,ref,25568,0,0,25568,,,,, haetae3,m4f,35936,0,0,35936,,,,, diff --git a/benchmarks.md b/benchmarks.md index afe0ae3a..f5439c7d 100644 --- a/benchmarks.md +++ b/benchmarks.md @@ -54,21 +54,10 @@ | cross-sha3-r-sdpg-1-small (10 executions) | ref | AVG: 290,135
MIN: 287,741
MAX: 297,757 | AVG: 102,853,622
MIN: 102,847,774
MAX: 102,861,948 | AVG: 75,137,510
MIN: 75,126,803
MAX: 75,159,685 | | cross-sha3-r-sdpg-3-fast (10 executions) | ref | AVG: 627,948
MIN: 625,525
MAX: 637,639 | AVG: 43,573,841
MIN: 43,565,461
MAX: 43,582,933 | AVG: 27,513,830
MIN: 27,493,024
MAX: 27,525,746 | | cross-sha3-r-sdpg-5-fast (10 executions) | ref | AVG: 1,146,280
MIN: 1,142,409
MAX: 1,153,794 | AVG: 93,557,878
MIN: 93,547,167
MAX: 93,566,329 | AVG: 59,948,216
MIN: 59,857,434
MAX: 60,043,852 | -| falcon-1024 (10 executions) | clean | AVG: 602,066,436
MIN: 377,135,260
MAX: 1,488,065,363 | AVG: 136,241,759
MIN: 136,017,549
MAX: 136,556,585 | AVG: 1,678,109
MIN: 1,677,732
MAX: 1,678,566 | -| falcon-1024 (10 executions) | m4-ct | AVG: 408,725,773
MIN: 314,885,208
MAX: 712,370,124 | AVG: 87,706,019
MIN: 87,549,942
MAX: 87,839,508 | AVG: 990,541
MIN: 984,448
MAX: 997,160 | -| falcon-1024 (10 executions) | opt-ct | AVG: 448,194,494
MIN: 301,446,952
MAX: 784,390,745 | AVG: 87,699,336
MIN: 87,550,679
MAX: 87,857,833 | AVG: 992,822
MIN: 983,184
MAX: 998,271 | -| falcon-1024 (10 executions) | opt-leaktime | AVG: 371,539,477
MIN: 261,831,977
MAX: 576,613,448 | AVG: 80,134,413
MIN: 79,844,667
MAX: 80,338,608 | AVG: 992,815
MIN: 982,774
MAX: 998,600 | -| falcon-1024-tree (10 executions) | opt-ct | AVG: 469,168,139
MIN: 341,160,847
MAX: 733,947,155 | AVG: 39,197,559
MIN: 39,095,597
MAX: 39,392,055 | AVG: 995,190
MIN: 984,826
MAX: 998,305 | -| falcon-1024-tree (10 executions) | opt-leaktime | AVG: 418,213,501
MIN: 284,879,287
MAX: 699,555,143 | AVG: 42,181,577
MIN: 41,844,047
MAX: 42,456,098 | AVG: 991,791
MIN: 983,935
MAX: 997,742 | -| falcon-512 (10 executions) | clean | AVG: 229,742,458
MIN: 134,930,383
MAX: 358,460,785 | AVG: 62,255,726
MIN: 62,124,149
MAX: 62,424,751 | AVG: 834,970
MIN: 834,402
MAX: 835,533 | -| falcon-512 (10 executions) | m4-ct | AVG: 146,357,328
MIN: 106,015,844
MAX: 250,638,532 | AVG: 40,191,597
MIN: 40,123,901
MAX: 40,381,630 | AVG: 482,280
MIN: 472,137
MAX: 485,160 | -| falcon-512 (10 executions) | opt-ct | AVG: 168,942,163
MIN: 106,015,882
MAX: 258,726,842 | AVG: 40,136,012
MIN: 40,046,972
MAX: 40,195,851 | AVG: 481,102
MIN: 472,809
MAX: 485,947 | -| falcon-512 (10 executions) | opt-leaktime | AVG: 130,638,983
MIN: 94,352,160
MAX: 240,934,147 | AVG: 37,196,341
MIN: 36,969,717
MAX: 37,564,986 | AVG: 476,152
MIN: 471,514
MAX: 484,487 | -| falcon-512-tree (10 executions) | m4-ct | AVG: 187,840,863
MIN: 121,618,909
MAX: 531,189,026 | AVG: 18,199,972
MIN: 18,111,179
MAX: 18,297,541 | AVG: 479,819
MIN: 472,890
MAX: 485,685 | -| falcon-512-tree (10 executions) | opt-ct | AVG: 179,501,018
MIN: 121,618,960
MAX: 347,996,956 | AVG: 18,222,471
MIN: 18,064,774
MAX: 18,329,860 | AVG: 479,635
MIN: 472,057
MAX: 484,767 | -| falcon-512-tree (10 executions) | opt-leaktime | AVG: 203,618,838
MIN: 106,760,540
MAX: 425,495,750 | AVG: 20,110,699
MIN: 19,752,157
MAX: 20,375,122 | AVG: 480,119
MIN: 472,263
MAX: 485,743 | -| falcon-padded-1024 (10 executions) | clean | AVG: 464,867,653
MIN: 351,942,875
MAX: 908,060,882 | AVG: 136,157,961
MIN: 135,988,344
MAX: 136,430,038 | AVG: 1,677,719
MIN: 1,677,506
MAX: 1,677,932 | -| falcon-padded-512 (10 executions) | clean | AVG: 241,548,154
MIN: 164,862,595
MAX: 348,699,388 | AVG: 62,231,774
MIN: 62,096,573
MAX: 62,365,088 | AVG: 834,766
MIN: 834,480
MAX: 834,957 | +| fndsa_provisional-1024 (10 executions) | m4f | AVG: 308,608,613
MIN: 195,536,229
MAX: 763,483,542 | AVG: 48,321,135
MIN: 48,158,227
MAX: 48,398,718 | AVG: 793,856
MIN: 782,604
MAX: 799,182 | +| fndsa_provisional-1024 (10 executions) | ref | AVG: 274,928,016
MIN: 217,869,128
MAX: 448,029,028 | AVG: 107,512,779
MIN: 107,106,716
MAX: 107,788,566 | AVG: 1,461,795
MIN: 1,444,739
MAX: 1,469,817 | +| fndsa_provisional-512 (10 executions) | m4f | AVG: 67,693,338
MIN: 57,825,106
MAX: 81,542,705 | AVG: 22,469,685
MIN: 22,280,159
MAX: 22,594,542 | AVG: 396,949
MIN: 390,368
MAX: 406,553 | +| fndsa_provisional-512 (10 executions) | ref | AVG: 85,699,591
MIN: 64,822,516
MAX: 132,207,505 | AVG: 49,522,949
MIN: 49,325,465
MAX: 49,631,598 | AVG: 731,387
MIN: 714,301
MAX: 738,870 | | haetae2 (100 executions) | m4f | AVG: 6,743,278
MIN: 1,555,292
MAX: 25,393,506 | AVG: 21,993,963
MIN: 4,721,290
MAX: 86,765,689 | AVG: 918,459
MIN: 918,244
MAX: 918,668 | | haetae2 (100 executions) | ref | AVG: 9,363,639
MIN: 1,716,264
MAX: 41,895,014 | AVG: 31,631,089
MIN: 6,247,382
MAX: 216,853,925 | AVG: 1,104,080
MIN: 1,103,874
MAX: 1,104,329 | | haetae3 (100 executions) | m4f | AVG: 12,925,388
MIN: 2,752,846
MAX: 52,240,529 | AVG: 30,891,994
MIN: 7,467,529
MAX: 160,522,018 | AVG: 1,760,745
MIN: 1,760,408
MAX: 1,761,081 | @@ -209,21 +198,10 @@ | cross-sha3-r-sdpg-1-small | ref | 2,328 | 466,400 | 245,512 | | cross-sha3-r-sdpg-3-fast | ref | 4,032 | 205,080 | 108,236 | | cross-sha3-r-sdpg-5-fast | ref | 6,824 | 398,600 | 213,436 | -| falcon-1024 | clean | 35,076 | 84,604 | 8,776 | -| falcon-1024 | m4-ct | 1,156 | 2,508 | 376 | -| falcon-1024 | opt-ct | 1,204 | 2,508 | 376 | -| falcon-1024 | opt-leaktime | 1,252 | 2,580 | 444 | -| falcon-1024-tree | opt-ct | 1,148 | 2,884 | 376 | -| falcon-1024-tree | opt-leaktime | 1,196 | 2,988 | 376 | -| falcon-512 | clean | 18,180 | 43,548 | 4,680 | -| falcon-512 | m4-ct | 1,148 | 2,428 | 376 | -| falcon-512 | opt-ct | 1,244 | 2,428 | 376 | -| falcon-512 | opt-leaktime | 1,148 | 2,492 | 376 | -| falcon-512-tree | m4-ct | 1,172 | 2,636 | 376 | -| falcon-512-tree | opt-ct | 1,156 | 2,636 | 376 | -| falcon-512-tree | opt-leaktime | 1,196 | 2,828 | 376 | -| falcon-padded-1024 | clean | 34,988 | 84,596 | 8,776 | -| falcon-padded-512 | clean | 18,092 | 43,540 | 4,680 | +| fndsa_provisional-1024 | m4f | 27,772 | 81,992 | 5,024 | +| fndsa_provisional-1024 | ref | 27,676 | 82,276 | 5,308 | +| fndsa_provisional-512 | m4f | 14,348 | 41,952 | 2,976 | +| fndsa_provisional-512 | ref | 14,380 | 42,124 | 3,260 | | haetae2 | m4f | 19,756 | 55,568 | 23,296 | | haetae2 | ref | 26,092 | 54,444 | 29,696 | | haetae3 | m4f | 29,596 | 83,420 | 31,784 | @@ -364,21 +342,10 @@ | cross-sha3-r-sdpg-1-small | ref | 71.8% | 74.7% | 78.4% | | cross-sha3-r-sdpg-3-fast | ref | 71.7% | 68.2% | 68.7% | | cross-sha3-r-sdpg-5-fast | ref | 71.1% | 66.1% | 66.8% | -| falcon-1024 | clean | 8.9% | 0.3% | 23.7% | -| falcon-1024 | m4-ct | 8.6% | 0.4% | 32.2% | -| falcon-1024 | opt-ct | 9.8% | 0.4% | 32.2% | -| falcon-1024 | opt-leaktime | 10.9% | 0.5% | 32.2% | -| falcon-1024-tree | opt-ct | 9.2% | 0.9% | 32.3% | -| falcon-1024-tree | opt-leaktime | 10.6% | 0.9% | 32.3% | -| falcon-512 | clean | 7.9% | 0.4% | 26.0% | -| falcon-512 | m4-ct | 13.7% | 0.5% | 33.9% | -| 
falcon-512 | opt-ct | 14.0% | 0.5% | 33.2% | -| falcon-512 | opt-leaktime | 17.3% | 0.5% | 33.6% | -| falcon-512-tree | m4-ct | 12.6% | 1.1% | 33.7% | -| falcon-512-tree | opt-ct | 14.6% | 1.1% | 34.2% | -| falcon-512-tree | opt-leaktime | 20.5% | 1.0% | 34.3% | -| falcon-padded-1024 | clean | 7.3% | 0.3% | 23.7% | -| falcon-padded-512 | clean | 16.0% | 0.4% | 26.0% | +| fndsa_provisional-1024 | m4f | 0.0% | 0.0% | 0.0% | +| fndsa_provisional-1024 | ref | 0.0% | 0.0% | 0.0% | +| fndsa_provisional-512 | m4f | 0.0% | 0.0% | 0.0% | +| fndsa_provisional-512 | ref | 0.0% | 0.0% | 0.0% | | haetae2 | m4f | 12.4% | 56.7% | 54.1% | | haetae2 | ref | 10.6% | 42.4% | 45.1% | | haetae3 | m4f | 14.6% | 56.6% | 57.1% | @@ -519,21 +486,10 @@ | cross-sha3-r-sdpg-1-small | ref | 18,846 | 0 | 208 | 19,054 | | cross-sha3-r-sdpg-3-fast | ref | 19,689 | 0 | 208 | 19,897 | | cross-sha3-r-sdpg-5-fast | ref | 18,593 | 0 | 208 | 18,801 | -| falcon-1024 | clean | 82,703 | 0 | 0 | 82,703 | -| falcon-1024 | m4-ct | 81,825 | 0 | 79,872 | 161,697 | -| falcon-1024 | opt-ct | 81,825 | 0 | 79,872 | 161,697 | -| falcon-1024 | opt-leaktime | 75,429 | 0 | 79,872 | 155,301 | -| falcon-1024-tree | opt-ct | 81,569 | 0 | 55,296 | 136,865 | -| falcon-1024-tree | opt-leaktime | 75,173 | 0 | 55,296 | 130,469 | -| falcon-512 | clean | 82,663 | 0 | 0 | 82,663 | -| falcon-512 | m4-ct | 81,825 | 0 | 39,936 | 121,761 | -| falcon-512 | opt-ct | 81,825 | 0 | 39,936 | 121,761 | -| falcon-512 | opt-leaktime | 75,429 | 0 | 39,936 | 115,365 | -| falcon-512-tree | m4-ct | 81,569 | 0 | 27,648 | 109,217 | -| falcon-512-tree | opt-ct | 81,569 | 0 | 27,648 | 109,217 | -| falcon-512-tree | opt-leaktime | 75,173 | 0 | 27,648 | 102,821 | -| falcon-padded-1024 | clean | 82,643 | 0 | 0 | 82,643 | -| falcon-padded-512 | clean | 82,599 | 0 | 0 | 82,599 | +| fndsa_provisional-1024 | m4f | 103,801 | 0 | 0 | 103,801 | +| fndsa_provisional-1024 | ref | 103,089 | 0 | 0 | 103,089 | +| fndsa_provisional-512 | m4f | 103,789 | 0 | 0 | 
103,789 | +| fndsa_provisional-512 | ref | 103,077 | 0 | 0 | 103,077 | | haetae2 | m4f | 35,708 | 0 | 0 | 35,708 | | haetae2 | ref | 25,568 | 0 | 0 | 25,568 | | haetae3 | m4f | 35,936 | 0 | 0 | 35,936 | diff --git a/crypto_sign/falcon-1024/m4-ct/README.txt b/crypto_sign/falcon-1024/m4-ct/README.txt deleted file mode 100644 index 7bedf7f1..00000000 --- a/crypto_sign/falcon-1024/m4-ct/README.txt +++ /dev/null @@ -1,137 +0,0 @@ -Falcon implementation for PQM4 (or even mupq in general). - - -There are multiple variants. Each variant is selected with the choice of -api.h (four choices: api512dyn.h, api512tree.h, api1024dyn.h, -api1024tree.h), and additional compile-time macro that are documented in -config.h and can be set either in config.h, or through command-line -flags passed to the C compiler. - -Choice of api.h: - - api512dyn.h - "Normal" Falcon-512. Private key is reasonably compact. The - Falcon LDL tree is internally recomputed for each signature. - - api512tree.h - Falcon-512 is key expansion. The Falcon LDL tree is computed - as part of the keygen, and returned as private key. This - speeds up signature generation, but also greatly enlarges - the private key size. - - api1024dyn.h - "Normal" Falcon-1024. - - api1024tree.h - Falcon-1024 with key expansion. - -Compile-time options (config.h): - - FALCON_FPEMU - Set to 1 to enable use of the internal constant-time emulation - of floating-point operations. - - FALCON_FPNATIVE - Set to 1 to use the native 'double' type and floating-point - operations. On architectures that lack a FPU, this will use the - compiler-provided floating-point emulation routines, which are - usually not constant-time (and sometimes return values which - do not follow IEEE-754 rounding rules). - - FALCON_ASM_CORTEXM4 - Set to 1 to use the M4 assembly routine for the constant-time - emulation of floating-point operations. These are faster than - the generic routines in C activated by FALCON_FPEMU. 
- -There is some internal autodetection that tries to select the right -values automatically, but it's safer to explicitly select things: - - To use the native 'double' type: - -DFALCON_FPNATIVE=1 - - To use the generic FP emulation code: - -DFALCON_FPEMU=1 -DFALCON_ASM_CORTEXM4=0 - - To use the M4 assembly code for FP emulation: - -DFALCON_FPEMU=1 -DFALCON_ASM_CORTEXM4=1 - -The code relying on the native 'double' type requires an implementation -that follows IEEE-754 rules with a 64-bit type. It works on 64-bit x86 -and PowerPC / POWER systems. On 32-bit x86, it tends to fail because the -80387 FPU is used with more precision; on such a system, use -'-msse2 -mfpmath=sse' to force use of the SSE2 unit (this might be the -default on some systems, e.g. Darwin / macOS). - - -IMPORTANT NOTES -=============== - - * The PQM4 API is implemented in pqm4.c. Since the M4 stack is usually - small (usual default is 4 kB), temporary buffers are statically - allocated. This implies that the crypto_sign_keypair(), crypto_sign() - and crypto_sign_open() functions are not thread-safe or reentrant. - Also, the static allocation is "forever". - - See the comments for the 'tmp' variable in pqm4.c; this gives the - relevant sizes. - - * When using expanded keys, the private key contains 64-bit values - (floating-point, i.e. 'double' or 'uint64_t' depending on the kind - of floating-point emulation that is used). On many systems, this - implies some alignment requirements. I.e. crypto_sign_keypair() and - crypto_sign() then require the 'sk' pointer to be suitably aligned. - On an ARM Cortex M4, 32-bit alignment is required (while the basic - RAM access opcodes tolerate unaligned accesses, the 'ldm' and 'stm' - opcodes need 32-bit aligned pointers). - - * When using the native 'double' type, the code has a dependency on - the sqrt() function. 
On x86, the relevant SSE2 opcode is inlined, - but the library function is still (potentially) invoked in case the - operand is negative, so that proper error management is performed. - This case does not happen in Falcon, but the library function is - still referenced, and explicitly linking with '-lm' may be - necessary. - - * When using the native 'double' type, do _NOT_ enable -ffast-math. - The internal rounding function relies on the usual trick: - when x >= 0, round(x) = (x + 2**52) - 2**52 - - This trick works only as long as each addition is rounded as per - the IEEE-754 rules to the exact precision of the 64-bit type. - When -ffast-math is enabled, the compiler may assume commutativity - and "optimize" that expression into 'round(x) = x', which does not - work at all. - - -TESTS -===== - -In the 'tests/' directory is a generator for known-answer tests, and the -expected file. The code comes from the NIST, but was modified to avoid a -dependency on OpenSSL. When compiling the C source file against the -selected Falcon implementation, an executable is produced, that, when -executed, generates an '*.req' and an '*.rsp' files. The .req file is -redundant (the .rsp file contains all the information, and some more). - -The expected .rsp files are provided as: - KAT512dyn.rsp Falcon-512, no expanded key - KAT512tree.rsp Falcon-512, with expanded key - KAT1024dyn.rsp Falcon-1024, no expanded key - KAT1024tree.rsp Falcon-1024, with expanded key - - -Normally, all computations are exact and the files are exactly -reproducible. However, some discrepancies may occur with the '*tree' -files in the following cases: - - - On big-endian architectures, the bytes in sk[] will be in a - different order. This is a side effect of putting the raw bytes - of the expanded key in sk[] (this could be fixed with some - reencoding pass, but this was not implemented yet). - - - If a non-exact IEEE-754 implementation is used, some of the - low bits of the values may be changed. 
This may happen if the - underlying implementation is not strictly faithful to rounding. - -As long as only the 'sk' lines are changed, then the public keys -and signature values are unimpacted. diff --git a/crypto_sign/falcon-1024/m4-ct/api.h b/crypto_sign/falcon-1024/m4-ct/api.h deleted file mode 100644 index e22e11f1..00000000 --- a/crypto_sign/falcon-1024/m4-ct/api.h +++ /dev/null @@ -1,17 +0,0 @@ -#include - -#define CRYPTO_SECRETKEYBYTES 2305 -#define CRYPTO_PUBLICKEYBYTES 1793 -#define CRYPTO_BYTES 1330 - -#define CRYPTO_ALGNAME "Falcon-1024" - -int crypto_sign_keypair(unsigned char *pk, unsigned char *sk); - -int crypto_sign(unsigned char *sm, size_t *smlen, - const unsigned char *m, size_t mlen, - const unsigned char *sk); - -int crypto_sign_open(unsigned char *m, size_t *mlen, - const unsigned char *sm, size_t smlen, - const unsigned char *pk); diff --git a/crypto_sign/falcon-1024/m4-ct/codec.c b/crypto_sign/falcon-1024/m4-ct/codec.c deleted file mode 100644 index 5bd61424..00000000 --- a/crypto_sign/falcon-1024/m4-ct/codec.c +++ /dev/null @@ -1,559 +0,0 @@ -/* - * Encoding/decoding of keys and signatures. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include "inner.h" - -/* see inner.h */ -size_t -Zf(modq_encode)( - void *out, size_t max_out_len, - const uint16_t *x, unsigned logn) -{ - size_t n, out_len, u; - uint8_t *buf; - uint32_t acc; - int acc_len; - - n = (size_t)1 << logn; - for (u = 0; u < n; u ++) { - if (x[u] >= 12289) { - return 0; - } - } - out_len = ((n * 14) + 7) >> 3; - if (out == NULL) { - return out_len; - } - if (out_len > max_out_len) { - return 0; - } - buf = out; - acc = 0; - acc_len = 0; - for (u = 0; u < n; u ++) { - acc = (acc << 14) | x[u]; - acc_len += 14; - while (acc_len >= 8) { - acc_len -= 8; - *buf ++ = (uint8_t)(acc >> acc_len); - } - } - if (acc_len > 0) { - *buf = (uint8_t)(acc << (8 - acc_len)); - } - return out_len; -} - -/* see inner.h */ -size_t -Zf(modq_decode)( - uint16_t *x, unsigned logn, - const void *in, size_t max_in_len) -{ - size_t n, in_len, u; - const uint8_t *buf; - uint32_t acc; - int acc_len; - - n = (size_t)1 << logn; - in_len = ((n * 14) + 7) >> 3; - if (in_len > max_in_len) { - return 0; - } - buf = in; - acc = 0; - acc_len = 0; - u = 0; - while (u < n) { - acc = (acc << 8) | (*buf ++); - acc_len += 8; - if (acc_len >= 14) { - unsigned w; - - acc_len -= 14; - w = (acc >> acc_len) & 0x3FFF; - if (w >= 12289) { - return 0; - } - x[u ++] = (uint16_t)w; - } - } - if ((acc & (((uint32_t)1 << acc_len) - 1)) != 0) { - return 0; - } - return in_len; -} - -/* see 
inner.h */ -size_t -Zf(trim_i16_encode)( - void *out, size_t max_out_len, - const int16_t *x, unsigned logn, unsigned bits) -{ - size_t n, u, out_len; - int minv, maxv; - uint8_t *buf; - uint32_t acc, mask; - unsigned acc_len; - - n = (size_t)1 << logn; - maxv = (1 << (bits - 1)) - 1; - minv = -maxv; - for (u = 0; u < n; u ++) { - if (x[u] < minv || x[u] > maxv) { - return 0; - } - } - out_len = ((n * bits) + 7) >> 3; - if (out == NULL) { - return out_len; - } - if (out_len > max_out_len) { - return 0; - } - buf = out; - acc = 0; - acc_len = 0; - mask = ((uint32_t)1 << bits) - 1; - for (u = 0; u < n; u ++) { - acc = (acc << bits) | ((uint16_t)x[u] & mask); - acc_len += bits; - while (acc_len >= 8) { - acc_len -= 8; - *buf ++ = (uint8_t)(acc >> acc_len); - } - } - if (acc_len > 0) { - *buf ++ = (uint8_t)(acc << (8 - acc_len)); - } - return out_len; -} - -/* see inner.h */ -size_t -Zf(trim_i16_decode)( - int16_t *x, unsigned logn, unsigned bits, - const void *in, size_t max_in_len) -{ - size_t n, in_len; - const uint8_t *buf; - size_t u; - uint32_t acc, mask1, mask2; - unsigned acc_len; - - n = (size_t)1 << logn; - in_len = ((n * bits) + 7) >> 3; - if (in_len > max_in_len) { - return 0; - } - buf = in; - u = 0; - acc = 0; - acc_len = 0; - mask1 = ((uint32_t)1 << bits) - 1; - mask2 = (uint32_t)1 << (bits - 1); - while (u < n) { - acc = (acc << 8) | *buf ++; - acc_len += 8; - while (acc_len >= bits && u < n) { - uint32_t w; - - acc_len -= bits; - w = (acc >> acc_len) & mask1; - w |= -(w & mask2); - if (w == -mask2) { - /* - * The -2^(bits-1) value is forbidden. - */ - return 0; - } - w |= -(w & mask2); - x[u ++] = (int16_t)*(int32_t *)&w; - } - } - if ((acc & (((uint32_t)1 << acc_len) - 1)) != 0) { - /* - * Extra bits in the last byte must be zero. 
- */ - return 0; - } - return in_len; -} - -/* see inner.h */ -size_t -Zf(trim_i8_encode)( - void *out, size_t max_out_len, - const int8_t *x, unsigned logn, unsigned bits) -{ - size_t n, u, out_len; - int minv, maxv; - uint8_t *buf; - uint32_t acc, mask; - unsigned acc_len; - - n = (size_t)1 << logn; - maxv = (1 << (bits - 1)) - 1; - minv = -maxv; - for (u = 0; u < n; u ++) { - if (x[u] < minv || x[u] > maxv) { - return 0; - } - } - out_len = ((n * bits) + 7) >> 3; - if (out == NULL) { - return out_len; - } - if (out_len > max_out_len) { - return 0; - } - buf = out; - acc = 0; - acc_len = 0; - mask = ((uint32_t)1 << bits) - 1; - for (u = 0; u < n; u ++) { - acc = (acc << bits) | ((uint8_t)x[u] & mask); - acc_len += bits; - while (acc_len >= 8) { - acc_len -= 8; - *buf ++ = (uint8_t)(acc >> acc_len); - } - } - if (acc_len > 0) { - *buf ++ = (uint8_t)(acc << (8 - acc_len)); - } - return out_len; -} - -/* see inner.h */ -size_t -Zf(trim_i8_decode)( - int8_t *x, unsigned logn, unsigned bits, - const void *in, size_t max_in_len) -{ - size_t n, in_len; - const uint8_t *buf; - size_t u; - uint32_t acc, mask1, mask2; - unsigned acc_len; - - n = (size_t)1 << logn; - in_len = ((n * bits) + 7) >> 3; - if (in_len > max_in_len) { - return 0; - } - buf = in; - u = 0; - acc = 0; - acc_len = 0; - mask1 = ((uint32_t)1 << bits) - 1; - mask2 = (uint32_t)1 << (bits - 1); - while (u < n) { - acc = (acc << 8) | *buf ++; - acc_len += 8; - while (acc_len >= bits && u < n) { - uint32_t w; - - acc_len -= bits; - w = (acc >> acc_len) & mask1; - w |= -(w & mask2); - if (w == -mask2) { - /* - * The -2^(bits-1) value is forbidden. - */ - return 0; - } - x[u ++] = (int8_t)*(int32_t *)&w; - } - } - if ((acc & (((uint32_t)1 << acc_len) - 1)) != 0) { - /* - * Extra bits in the last byte must be zero. 
- */ - return 0; - } - return in_len; -} - -/* see inner.h */ -size_t -Zf(comp_encode)( - void *out, size_t max_out_len, - const int16_t *x, unsigned logn) -{ - uint8_t *buf; - size_t n, u, v; - uint32_t acc; - unsigned acc_len; - - n = (size_t)1 << logn; - buf = out; - - /* - * Make sure that all values are within the -2047..+2047 range. - */ - for (u = 0; u < n; u ++) { - if (x[u] < -2047 || x[u] > +2047) { - return 0; - } - } - - acc = 0; - acc_len = 0; - v = 0; - for (u = 0; u < n; u ++) { - int t; - unsigned w; - - /* - * Get sign and absolute value of next integer; push the - * sign bit. - */ - acc <<= 1; - t = x[u]; - if (t < 0) { - t = -t; - acc |= 1; - } - w = (unsigned)t; - - /* - * Push the low 7 bits of the absolute value. - */ - acc <<= 7; - acc |= w & 127u; - w >>= 7; - - /* - * We pushed exactly 8 bits. - */ - acc_len += 8; - - /* - * Push as many zeros as necessary, then a one. Since the - * absolute value is at most 2047, w can only range up to - * 15 at this point, thus we will add at most 16 bits - * here. With the 8 bits above and possibly up to 7 bits - * from previous iterations, we may go up to 31 bits, which - * will fit in the accumulator, which is an uint32_t. - */ - acc <<= (w + 1); - acc |= 1; - acc_len += w + 1; - - /* - * Produce all full bytes. - */ - while (acc_len >= 8) { - acc_len -= 8; - if (buf != NULL) { - if (v >= max_out_len) { - return 0; - } - buf[v] = (uint8_t)(acc >> acc_len); - } - v ++; - } - } - - /* - * Flush remaining bits (if any). 
- */ - if (acc_len > 0) { - if (buf != NULL) { - if (v >= max_out_len) { - return 0; - } - buf[v] = (uint8_t)(acc << (8 - acc_len)); - } - v ++; - } - - return v; -} - -/* see inner.h */ -size_t -Zf(comp_decode)( - int16_t *x, unsigned logn, - const void *in, size_t max_in_len) -{ - const uint8_t *buf; - size_t n, u, v; - uint32_t acc; - unsigned acc_len; - - n = (size_t)1 << logn; - buf = in; - acc = 0; - acc_len = 0; - v = 0; - for (u = 0; u < n; u ++) { - unsigned b, s, m; - - /* - * Get next eight bits: sign and low seven bits of the - * absolute value. - */ - if (v >= max_in_len) { - return 0; - } - acc = (acc << 8) | (uint32_t)buf[v ++]; - b = acc >> acc_len; - s = b & 128; - m = b & 127; - - /* - * Get next bits until a 1 is reached. - */ - for (;;) { - if (acc_len == 0) { - if (v >= max_in_len) { - return 0; - } - acc = (acc << 8) | (uint32_t)buf[v ++]; - acc_len = 8; - } - acc_len --; - if (((acc >> acc_len) & 1) != 0) { - break; - } - m += 128; - if (m > 2047) { - return 0; - } - } - x[u] = (int16_t)(s ? -(int)m : (int)m); - } - return v; -} - -/* - * Key elements and signatures are polynomials with small integer - * coefficients. Here are some statistics gathered over many - * generated key pairs (10000 or more for each degree): - * - * log(n) n max(f,g) std(f,g) max(F,G) std(F,G) - * 1 2 129 56.31 143 60.02 - * 2 4 123 40.93 160 46.52 - * 3 8 97 28.97 159 38.01 - * 4 16 100 21.48 154 32.50 - * 5 32 71 15.41 151 29.36 - * 6 64 59 11.07 138 27.77 - * 7 128 39 7.91 144 27.00 - * 8 256 32 5.63 148 26.61 - * 9 512 22 4.00 137 26.46 - * 10 1024 15 2.84 146 26.41 - * - * We want a compact storage format for private key, and, as part of - * key generation, we are allowed to reject some keys which would - * otherwise be fine (this does not induce any noticeable vulnerability - * as long as we reject only a small proportion of possible keys). 
- * Hence, we enforce at key generation time maximum values for the - * elements of f, g, F and G, so that their encoding can be expressed - * in fixed-width values. Limits have been chosen so that generated - * keys are almost always within bounds, thus not impacting neither - * security or performance. - * - * IMPORTANT: the code assumes that all coefficients of f, g, F and G - * ultimately fit in the -127..+127 range. Thus, none of the elements - * of max_fg_bits[] and max_FG_bits[] shall be greater than 8. - */ - -const uint8_t Zf(max_fg_bits)[] = { - 0, /* unused */ - 8, - 8, - 8, - 8, - 8, - 7, - 7, - 6, - 6, - 5 -}; - -const uint8_t Zf(max_FG_bits)[] = { - 0, /* unused */ - 8, - 8, - 8, - 8, - 8, - 8, - 8, - 8, - 8, - 8 -}; - -/* - * When generating a new key pair, we can always reject keys which - * feature an abnormally large coefficient. This can also be done for - * signatures, albeit with some care: in case the signature process is - * used in a derandomized setup (explicitly seeded with the message and - * private key), we have to follow the specification faithfully, and the - * specification only enforces a limit on the L2 norm of the signature - * vector. The limit on the L2 norm implies that the absolute value of - * a coefficient of the signature cannot be more than the following: - * - * log(n) n max sig coeff (theoretical) - * 1 2 412 - * 2 4 583 - * 3 8 824 - * 4 16 1166 - * 5 32 1649 - * 6 64 2332 - * 7 128 3299 - * 8 256 4665 - * 9 512 6598 - * 10 1024 9331 - * - * However, the largest observed signature coefficients during our - * experiments was 1077 (in absolute value), hence we can assume that, - * with overwhelming probability, signature coefficients will fit - * in -2047..2047, i.e. 12 bits. 
- */ - -const uint8_t Zf(max_sig_bits)[] = { - 0, /* unused */ - 10, - 11, - 11, - 12, - 12, - 12, - 12, - 12, - 12, - 12 -}; diff --git a/crypto_sign/falcon-1024/m4-ct/common.c b/crypto_sign/falcon-1024/m4-ct/common.c deleted file mode 100644 index ef30028b..00000000 --- a/crypto_sign/falcon-1024/m4-ct/common.c +++ /dev/null @@ -1,297 +0,0 @@ -/* - * Support functions for signatures (hash-to-point, norm). - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include "inner.h" - -/* see inner.h */ -void -Zf(hash_to_point_vartime)( - inner_shake256_context *sc, - uint16_t *x, unsigned logn) -{ - /* - * This is the straightforward per-the-spec implementation. 
It - * is not constant-time, thus it might reveal information on the - * plaintext (at least, enough to check the plaintext against a - * list of potential plaintexts) in a scenario where the - * attacker does not have access to the signature value or to - * the public key, but knows the nonce (without knowledge of the - * nonce, the hashed output cannot be matched against potential - * plaintexts). - */ - size_t n; - - n = (size_t)1 << logn; - while (n > 0) { - uint8_t buf[2]; - uint32_t w; - - inner_shake256_extract(sc, (void *)buf, sizeof buf); - w = ((unsigned)buf[0] << 8) | (unsigned)buf[1]; - if (w < 61445) { - while (w >= 12289) { - w -= 12289; - } - *x ++ = (uint16_t)w; - n --; - } - } -} - -/* see inner.h */ -void -Zf(hash_to_point_ct)( - inner_shake256_context *sc, - uint16_t *x, unsigned logn, uint8_t *tmp) -{ - /* - * Each 16-bit sample is a value in 0..65535. The value is - * kept if it falls in 0..61444 (because 61445 = 5*12289) - * and rejected otherwise; thus, each sample has probability - * about 0.93758 of being selected. - * - * We want to oversample enough to be sure that we will - * have enough values with probability at least 1 - 2^(-256). - * Depending on degree N, this leads to the following - * required oversampling: - * - * logn n oversampling - * 1 2 65 - * 2 4 67 - * 3 8 71 - * 4 16 77 - * 5 32 86 - * 6 64 100 - * 7 128 122 - * 8 256 154 - * 9 512 205 - * 10 1024 287 - * - * If logn >= 7, then the provided temporary buffer is large - * enough. Otherwise, we use a stack buffer of 63 entries - * (i.e. 126 bytes) for the values that do not fit in tmp[]. - */ - - static const uint16_t overtab[] = { - 0, /* unused */ - 65, - 67, - 71, - 77, - 86, - 100, - 122, - 154, - 205, - 287 - }; - - unsigned n, n2, u, m, p, over; - uint16_t *tt1, tt2[63]; - - /* - * We first generate m 16-bit value. Values 0..n-1 go to x[]. - * Values n..2*n-1 go to tt1[]. Values 2*n and later go to tt2[]. 
- * We also reduce modulo q the values; rejected values are set - * to 0xFFFF. - */ - n = 1U << logn; - n2 = n << 1; - over = overtab[logn]; - m = n + over; - tt1 = (uint16_t *)tmp; - for (u = 0; u < m; u ++) { - uint8_t buf[2]; - uint32_t w, wr; - - inner_shake256_extract(sc, buf, sizeof buf); - w = ((uint32_t)buf[0] << 8) | (uint32_t)buf[1]; - wr = w - ((uint32_t)24578 & (((w - 24578) >> 31) - 1)); - wr = wr - ((uint32_t)24578 & (((wr - 24578) >> 31) - 1)); - wr = wr - ((uint32_t)12289 & (((wr - 12289) >> 31) - 1)); - wr |= ((w - 61445) >> 31) - 1; - if (u < n) { - x[u] = (uint16_t)wr; - } else if (u < n2) { - tt1[u - n] = (uint16_t)wr; - } else { - tt2[u - n2] = (uint16_t)wr; - } - } - - /* - * Now we must "squeeze out" the invalid values. We do this in - * a logarithmic sequence of passes; each pass computes where a - * value should go, and moves it down by 'p' slots if necessary, - * where 'p' uses an increasing powers-of-two scale. It can be - * shown that in all cases where the loop decides that a value - * has to be moved down by p slots, the destination slot is - * "free" (i.e. contains an invalid value). - */ - for (p = 1; p <= over; p <<= 1) { - unsigned v; - - /* - * In the loop below: - * - * - v contains the index of the final destination of - * the value; it is recomputed dynamically based on - * whether values are valid or not. - * - * - u is the index of the value we consider ("source"); - * its address is s. - * - * - The loop may swap the value with the one at index - * u-p. The address of the swap destination is d. - */ - v = 0; - for (u = 0; u < m; u ++) { - uint16_t *s, *d; - unsigned j, sv, dv, mk; - - if (u < n) { - s = &x[u]; - } else if (u < n2) { - s = &tt1[u - n]; - } else { - s = &tt2[u - n2]; - } - sv = *s; - - /* - * The value in sv should ultimately go to - * address v, i.e. jump back by u-v slots. - */ - j = u - v; - - /* - * We increment v for the next iteration, but - * only if the source value is valid. 
The mask - * 'mk' is -1 if the value is valid, 0 otherwise, - * so we _subtract_ mk. - */ - mk = (sv >> 15) - 1U; - v -= mk; - - /* - * In this loop we consider jumps by p slots; if - * u < p then there is nothing more to do. - */ - if (u < p) { - continue; - } - - /* - * Destination for the swap: value at address u-p. - */ - if ((u - p) < n) { - d = &x[u - p]; - } else if ((u - p) < n2) { - d = &tt1[(u - p) - n]; - } else { - d = &tt2[(u - p) - n2]; - } - dv = *d; - - /* - * The swap should be performed only if the source - * is valid AND the jump j has its 'p' bit set. - */ - mk &= -(((j & p) + 0x1FF) >> 9); - - *s = (uint16_t)(sv ^ (mk & (sv ^ dv))); - *d = (uint16_t)(dv ^ (mk & (sv ^ dv))); - } - } -} - -/* see inner.h */ -int -Zf(is_short)( - const int16_t *s1, const int16_t *s2, unsigned logn) -{ - /* - * We use the l2-norm. Code below uses only 32-bit operations to - * compute the square of the norm with saturation to 2^32-1 if - * the value exceeds 2^31-1. - */ - size_t n, u; - uint32_t s, ng; - - n = (size_t)1 << logn; - s = 0; - ng = 0; - for (u = 0; u < n; u ++) { - int32_t z; - - z = s1[u]; - s += (uint32_t)(z * z); - ng |= s; - z = s2[u]; - s += (uint32_t)(z * z); - ng |= s; - } - s |= -(ng >> 31); - - /* - * Acceptance bound on the l2-norm is: - * 1.2*1.55*sqrt(q)*sqrt(2*N) - * Value 7085 is floor((1.2^2)*(1.55^2)*2*1024). - */ - return s < (((uint32_t)7085 * (uint32_t)12289) >> (10 - logn)); -} - -/* see inner.h */ -int -Zf(is_short_half)( - uint32_t sqn, const int16_t *s2, unsigned logn) -{ - size_t n, u; - uint32_t ng; - - n = (size_t)1 << logn; - ng = -(sqn >> 31); - for (u = 0; u < n; u ++) { - int32_t z; - - z = s2[u]; - sqn += (uint32_t)(z * z); - ng |= sqn; - } - sqn |= -(ng >> 31); - - /* - * Acceptance bound on the l2-norm is: - * 1.2*1.55*sqrt(q)*sqrt(2*N) - * Value 7085 is floor((1.2^2)*(1.55^2)*2*1024). 
- */ - return sqn < (((uint32_t)7085 * (uint32_t)12289) >> (10 - logn)); -} diff --git a/crypto_sign/falcon-1024/m4-ct/config.h b/crypto_sign/falcon-1024/m4-ct/config.h deleted file mode 100644 index cd78727e..00000000 --- a/crypto_sign/falcon-1024/m4-ct/config.h +++ /dev/null @@ -1,205 +0,0 @@ -/* - * Manual configuration file for the Falcon implementation. Here can - * be set some compilation-time options. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#ifndef FALCON_CONFIG_H__ -#define FALCON_CONFIG_H__ - -/* - * Each option is a macro which should be defined to either 1 or 0. 
- * If any of the options below is left undefined, then a default value - * will be used by the code, possibly using compile-time autodetection - * from compiler-defined macros. - * - * Explicitly setting a parameter can be done by uncommenting/modifying - * its definition below, in this file, or equivalently by setting it as - * a compiler flag. - */ - -/* - * Use the native 'double' C type for floating-point computations. Exact - * reproducibility of all tests requires that type to faithfully follow - * IEEE-754 "round-to-nearest" rules. - * - * Native double support will use the CPU hardware and/or - * compiler-provided functions; the latter is typically NOT - * constant-time, while the former MAY be constant-time, or not. On - * recent x86 CPU in 64-bit mode, SSE2 opcodes are used and they provide - * constant-time operations for all the operations used in Falcon, - * except for some special cases of divisions and square roots, but it - * can be shown that theses cases imply only negligible leak of - * information that cannot be leveraged into a full attack. - * - * If neither FALCON_FPNATIVE nor FALCON_FPEMU is defined, then use of - * the native 'double' C type is the default behaviour unless - * FALCON_ASM_CORTEXM4 is defined to 1, in which case the emulated code - * will be used. - * -#define FALCON_FPNATIVE 1 - */ - -/* - * Use emulated floating-point implementation. - * - * Emulation uses only integer operations with uint32_t and uint64_t - * types. This is constant-time, provided that the underlying platform - * offers constant-time opcodes for the following operations: - * - * - Multiplication of two 32-bit unsigned integers into a 64-bit result. - * - Left-shift or right-shift of a 32-bit unsigned integer by a - * potentially secret shift count in the 0..31 range. - * - * Notably, the ARM Cortex M3 does not fulfill the first condition, - * while the Pentium IV does not fulfill the second. 
- * - * If neither FALCON_FPNATIVE nor FALCON_FPEMU is defined, then use of - * the native 'double' C type is the default behaviour unless - * FALCON_ASM_CORTEXM4 is defined to 1, in which case the emulated code - * will be used. - * -#define FALCON_FPEMU 1 - */ - -/* - * Enable use of assembly for ARM Cortex-M4 CPU. By default, such - * support will be used based on some autodection on the compiler - * version and target architecture. Define this variable to 1 to force - * use of the assembly code, or 0 to disable it regardless of the - * autodetection. - * - * When FALCON_ASM_CORTEXM4 is enabled (whether defined explicitly or - * autodetected), emulated floating-point code will be used, unless - * FALCON_FPNATIVE or FALCON_FPEMU is explicitly set to override the - * choice. Emulated code with ARM assembly is constant-time and provides - * better performance than emulated code with plain C. - * - * The assembly code for the M4 can also work on a Cortex-M3. If the - * compiler is instructed to target the M3 (e.g. '-mcpu=cortex-m3' with - * GCC) then FALCON_ASM_CORTEXM4 won't be autodetected, but it can be - * enabled explicitly. Take care, though, that the M3 multiplication - * opcode (multiplication of two 32-bit unsigned integers with a 64-bit - * result) is NOT constant-time. - * -#define FALCON_ASM_CORTEXM4 1 - */ - -#define FALCON_ASM_CORTEXM4 1 - -/* - * Enable use of AVX2 intrinsics. If enabled, then the code will compile - * only when targeting x86 with a compiler that supports AVX2 intrinsics - * (tested with GCC 7.4.0, Clang 6.0.0, and MSVC 2015, both in 32-bit - * and 64-bit modes), and run only on systems that offer the AVX2 - * opcodes. Some operations leverage AVX2 for better performance. - * -#define FALCON_AVX2 1 - */ - -/* - * Enable use of FMA intrinsics. This setting has any effect only if - * FALCON_AVX2 is also enabled. The FMA intrinsics are normally available - * on any x86 CPU that also has AVX2. 
Note that setting this option will - * slightly modify the values of expanded private keys, but will normally - * not change the values of non-expanded private keys, public keys or - * signatures, for a given keygen/sign seed (non-expanded private keys - * and signatures might theoretically change, but only with low probability, - * less than 2^(-40); produced signatures are still safe and interoperable). - * -#define FALCON_FMA 1 - */ - -/* - * Assert that the platform uses little-endian encoding. If enabled, - * then encoding and decoding of aligned multibyte values will be - * slightly faster (especially for hashing and random number - * generation). If not defined explicitly, then autodetection is - * applied. - * -#define FALCON_LE 1 - */ - -/* - * Assert that the platform tolerates accesses to unaligned multibyte - * values. If enabled, then some operations are slightly faster. Note - * that ARM Cortex M4 do _not_ fully tolerate unaligned accesses; for - * such systems, this option should not be enabled. If not defined - * explicitly, then autodetection is applied. - * -#define FALCON_UNALIGNED 1 - */ - -/* - * Use a PRNG based on ChaCha20 and seeded with SHAKE256, instead of - * SHAKE256 directly, for key pair generation purposes. This speeds up - * key pair generation, especially on platforms where SHAKE256 is - * comparatively slow: on the ARM Cortex M4, average key generation time - * is reduced by 19% with this setting; on a recent x86 Skylake, the - * reduction is smaller (less than 8%). - * - * However, this setting changes the private/public key pair obtained - * from a given seed, thus preventing reproducibility of the - * known-answer tests vectors. For compatibility with existing KAT - * vectors (e.g. in PQClean, pqm4 and NIST implementations), this - * setting is not enabled by default. - * -#define FALCON_KG_CHACHA20 1 - */ - -/* - * Use an explicit OS-provided source of randomness for seeding (for the - * Zf(get_seed)() function implementation). 
Three possible sources are - * defined: - * - * - getentropy() system call - * - /dev/urandom special file - * - CryptGenRandom() function call - * - * More than one source may be enabled, in which case they will be tried - * in the order above, until a success is reached. - * - * By default, sources are enabled at compile-time based on these - * conditions: - * - * - getentropy(): target is one of: Linux with Glibc-2.25+, FreeBSD 12+, - * or OpenBSD. - * - /dev/urandom: target is a Unix-like system (including Linux, - * FreeBSD, NetBSD, OpenBSD, DragonFly, macOS, Android, Solaris, AIX). - * - CryptGenRandom(): target is Windows (Win32 or Win64). - * - * On most small embedded systems, none will be enabled and Zf(get_seed)() - * will always return 0. Applications will need to provide their own seeds. - * -#define FALCON_RAND_GETENTROPY 1 -#define FALCON_RAND_URANDOM 1 -#define FALCON_RAND_WIN32 1 - */ - -#endif diff --git a/crypto_sign/falcon-1024/m4-ct/fft.c b/crypto_sign/falcon-1024/m4-ct/fft.c deleted file mode 100644 index b1904b24..00000000 --- a/crypto_sign/falcon-1024/m4-ct/fft.c +++ /dev/null @@ -1,1412 +0,0 @@ -/* - * FFT code. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include "inner.h" - -/* - * Rules for complex number macros: - * -------------------------------- - * - * Operand order is: destination, source1, source2... - * - * Each operand is a real and an imaginary part. - * - * All overlaps are allowed. - */ - -/* - * Addition of two complex numbers (d = a + b). - */ -#define FPC_ADD(d_re, d_im, a_re, a_im, b_re, b_im) do { \ - fpr fpct_re, fpct_im; \ - fpct_re = fpr_add(a_re, b_re); \ - fpct_im = fpr_add(a_im, b_im); \ - (d_re) = fpct_re; \ - (d_im) = fpct_im; \ - } while (0) - -/* - * Subtraction of two complex numbers (d = a - b). - */ -#define FPC_SUB(d_re, d_im, a_re, a_im, b_re, b_im) do { \ - fpr fpct_re, fpct_im; \ - fpct_re = fpr_sub(a_re, b_re); \ - fpct_im = fpr_sub(a_im, b_im); \ - (d_re) = fpct_re; \ - (d_im) = fpct_im; \ - } while (0) - -/* - * Multplication of two complex numbers (d = a * b). 
- */ -#define FPC_MUL(d_re, d_im, a_re, a_im, b_re, b_im) do { \ - fpr fpct_a_re, fpct_a_im; \ - fpr fpct_b_re, fpct_b_im; \ - fpr fpct_d_re, fpct_d_im; \ - fpct_a_re = (a_re); \ - fpct_a_im = (a_im); \ - fpct_b_re = (b_re); \ - fpct_b_im = (b_im); \ - fpct_d_re = fpr_sub( \ - fpr_mul(fpct_a_re, fpct_b_re), \ - fpr_mul(fpct_a_im, fpct_b_im)); \ - fpct_d_im = fpr_add( \ - fpr_mul(fpct_a_re, fpct_b_im), \ - fpr_mul(fpct_a_im, fpct_b_re)); \ - (d_re) = fpct_d_re; \ - (d_im) = fpct_d_im; \ - } while (0) - -/* - * Squaring of a complex number (d = a * a). - */ -#define FPC_SQR(d_re, d_im, a_re, a_im) do { \ - fpr fpct_a_re, fpct_a_im; \ - fpr fpct_d_re, fpct_d_im; \ - fpct_a_re = (a_re); \ - fpct_a_im = (a_im); \ - fpct_d_re = fpr_sub(fpr_sqr(fpct_a_re), fpr_sqr(fpct_a_im)); \ - fpct_d_im = fpr_double(fpr_mul(fpct_a_re, fpct_a_im)); \ - (d_re) = fpct_d_re; \ - (d_im) = fpct_d_im; \ - } while (0) - -/* - * Inversion of a complex number (d = 1 / a). - */ -#define FPC_INV(d_re, d_im, a_re, a_im) do { \ - fpr fpct_a_re, fpct_a_im; \ - fpr fpct_d_re, fpct_d_im; \ - fpr fpct_m; \ - fpct_a_re = (a_re); \ - fpct_a_im = (a_im); \ - fpct_m = fpr_add(fpr_sqr(fpct_a_re), fpr_sqr(fpct_a_im)); \ - fpct_m = fpr_inv(fpct_m); \ - fpct_d_re = fpr_mul(fpct_a_re, fpct_m); \ - fpct_d_im = fpr_mul(fpr_neg(fpct_a_im), fpct_m); \ - (d_re) = fpct_d_re; \ - (d_im) = fpct_d_im; \ - } while (0) - -/* - * Division of complex numbers (d = a / b). 
- */ -#define FPC_DIV(d_re, d_im, a_re, a_im, b_re, b_im) do { \ - fpr fpct_a_re, fpct_a_im; \ - fpr fpct_b_re, fpct_b_im; \ - fpr fpct_d_re, fpct_d_im; \ - fpr fpct_m; \ - fpct_a_re = (a_re); \ - fpct_a_im = (a_im); \ - fpct_b_re = (b_re); \ - fpct_b_im = (b_im); \ - fpct_m = fpr_add(fpr_sqr(fpct_b_re), fpr_sqr(fpct_b_im)); \ - fpct_m = fpr_inv(fpct_m); \ - fpct_b_re = fpr_mul(fpct_b_re, fpct_m); \ - fpct_b_im = fpr_mul(fpr_neg(fpct_b_im), fpct_m); \ - fpct_d_re = fpr_sub( \ - fpr_mul(fpct_a_re, fpct_b_re), \ - fpr_mul(fpct_a_im, fpct_b_im)); \ - fpct_d_im = fpr_add( \ - fpr_mul(fpct_a_re, fpct_b_im), \ - fpr_mul(fpct_a_im, fpct_b_re)); \ - (d_re) = fpct_d_re; \ - (d_im) = fpct_d_im; \ - } while (0) - -/* - * Let w = exp(i*pi/N); w is a primitive 2N-th root of 1. We define the - * values w_j = w^(2j+1) for all j from 0 to N-1: these are the roots - * of X^N+1 in the field of complex numbers. A crucial property is that - * w_{N-1-j} = conj(w_j) = 1/w_j for all j. - * - * FFT representation of a polynomial f (taken modulo X^N+1) is the - * set of values f(w_j). Since f is real, conj(f(w_j)) = f(conj(w_j)), - * thus f(w_{N-1-j}) = conj(f(w_j)). We thus store only half the values, - * for j = 0 to N/2-1; the other half can be recomputed easily when (if) - * needed. A consequence is that FFT representation has the same size - * as normal representation: N/2 complex numbers use N real numbers (each - * complex number is the combination of a real and an imaginary part). - * - * We use a specific ordering which makes computations easier. Let rev() - * be the bit-reversal function over log(N) bits. For j in 0..N/2-1, we - * store the real and imaginary parts of f(w_j) in slots: - * - * Re(f(w_j)) -> slot rev(j)/2 - * Im(f(w_j)) -> slot rev(j)/2+N/2 - * - * (Note that rev(j) is even for j < N/2.) 
- */ - -/* see inner.h */ -TARGET_AVX2 -void -Zf(FFT)(fpr *f, unsigned logn) -{ - /* - * FFT algorithm in bit-reversal order uses the following - * iterative algorithm: - * - * t = N - * for m = 1; m < N; m *= 2: - * ht = t/2 - * for i1 = 0; i1 < m; i1 ++: - * j1 = i1 * t - * s = GM[m + i1] - * for j = j1; j < (j1 + ht); j ++: - * x = f[j] - * y = s * f[j + ht] - * f[j] = x + y - * f[j + ht] = x - y - * t = ht - * - * GM[k] contains w^rev(k) for primitive root w = exp(i*pi/N). - * - * In the description above, f[] is supposed to contain complex - * numbers. In our in-memory representation, the real and - * imaginary parts of f[k] are in array slots k and k+N/2. - * - * We only keep the first half of the complex numbers. We can - * see that after the first iteration, the first and second halves - * of the array of complex numbers have separate lives, so we - * simply ignore the second part. - */ - - unsigned u; - size_t t, n, hn, m; - - /* - * First iteration: compute f[j] + i * f[j+N/2] for all j < N/2 - * (because GM[1] = w^rev(1) = w^(N/2) = i). - * In our chosen representation, this is a no-op: everything is - * already where it should be. - */ - - /* - * Subsequent iterations are truncated to use only the first - * half of values. 
- */ - n = (size_t)1 << logn; - hn = n >> 1; - t = hn; - for (u = 1, m = 2; u < logn; u ++, m <<= 1) { - size_t ht, hm, i1, j1; - - ht = t >> 1; - hm = m >> 1; - for (i1 = 0, j1 = 0; i1 < hm; i1 ++, j1 += t) { - size_t j, j2; - - j2 = j1 + ht; -#if FALCON_AVX2 // yyyAVX2+1 - if (ht >= 4) { - __m256d s_re, s_im; - - s_re = _mm256_set1_pd( - fpr_gm_tab[((m + i1) << 1) + 0].v); - s_im = _mm256_set1_pd( - fpr_gm_tab[((m + i1) << 1) + 1].v); - for (j = j1; j < j2; j += 4) { - __m256d x_re, x_im, y_re, y_im; - __m256d z_re, z_im; - - x_re = _mm256_loadu_pd(&f[j].v); - x_im = _mm256_loadu_pd(&f[j + hn].v); - z_re = _mm256_loadu_pd(&f[j+ht].v); - z_im = _mm256_loadu_pd(&f[j+ht + hn].v); - y_re = FMSUB(z_re, s_re, - _mm256_mul_pd(z_im, s_im)); - y_im = FMADD(z_re, s_im, - _mm256_mul_pd(z_im, s_re)); - _mm256_storeu_pd(&f[j].v, - _mm256_add_pd(x_re, y_re)); - _mm256_storeu_pd(&f[j + hn].v, - _mm256_add_pd(x_im, y_im)); - _mm256_storeu_pd(&f[j + ht].v, - _mm256_sub_pd(x_re, y_re)); - _mm256_storeu_pd(&f[j + ht + hn].v, - _mm256_sub_pd(x_im, y_im)); - } - } else { - fpr s_re, s_im; - - s_re = fpr_gm_tab[((m + i1) << 1) + 0]; - s_im = fpr_gm_tab[((m + i1) << 1) + 1]; - for (j = j1; j < j2; j ++) { - fpr x_re, x_im, y_re, y_im; - - x_re = f[j]; - x_im = f[j + hn]; - y_re = f[j + ht]; - y_im = f[j + ht + hn]; - FPC_MUL(y_re, y_im, - y_re, y_im, s_re, s_im); - FPC_ADD(f[j], f[j + hn], - x_re, x_im, y_re, y_im); - FPC_SUB(f[j + ht], f[j + ht + hn], - x_re, x_im, y_re, y_im); - } - } -#else // yyyAVX2+0 - fpr s_re, s_im; - - s_re = fpr_gm_tab[((m + i1) << 1) + 0]; - s_im = fpr_gm_tab[((m + i1) << 1) + 1]; - for (j = j1; j < j2; j ++) { - fpr x_re, x_im, y_re, y_im; - - x_re = f[j]; - x_im = f[j + hn]; - y_re = f[j + ht]; - y_im = f[j + ht + hn]; - FPC_MUL(y_re, y_im, y_re, y_im, s_re, s_im); - FPC_ADD(f[j], f[j + hn], - x_re, x_im, y_re, y_im); - FPC_SUB(f[j + ht], f[j + ht + hn], - x_re, x_im, y_re, y_im); - } -#endif // yyyAVX2- - } - t = ht; - } -} - -/* see inner.h */ 
-TARGET_AVX2 -void -Zf(iFFT)(fpr *f, unsigned logn) -{ - /* - * Inverse FFT algorithm in bit-reversal order uses the following - * iterative algorithm: - * - * t = 1 - * for m = N; m > 1; m /= 2: - * hm = m/2 - * dt = t*2 - * for i1 = 0; i1 < hm; i1 ++: - * j1 = i1 * dt - * s = iGM[hm + i1] - * for j = j1; j < (j1 + t); j ++: - * x = f[j] - * y = f[j + t] - * f[j] = x + y - * f[j + t] = s * (x - y) - * t = dt - * for i1 = 0; i1 < N; i1 ++: - * f[i1] = f[i1] / N - * - * iGM[k] contains (1/w)^rev(k) for primitive root w = exp(i*pi/N) - * (actually, iGM[k] = 1/GM[k] = conj(GM[k])). - * - * In the main loop (not counting the final division loop), in - * all iterations except the last, the first and second half of f[] - * (as an array of complex numbers) are separate. In our chosen - * representation, we do not keep the second half. - * - * The last iteration recombines the recomputed half with the - * implicit half, and should yield only real numbers since the - * target polynomial is real; moreover, s = i at that step. - * Thus, when considering x and y: - * y = conj(x) since the final f[j] must be real - * Therefore, f[j] is filled with 2*Re(x), and f[j + t] is - * filled with 2*Im(x). - * But we already have Re(x) and Im(x) in array slots j and j+t - * in our chosen representation. That last iteration is thus a - * simple doubling of the values in all the array. - * - * We make the last iteration a no-op by tweaking the final - * division into a division by N/2, not N. 
- */ - size_t u, n, hn, t, m; - - n = (size_t)1 << logn; - t = 1; - m = n; - hn = n >> 1; - for (u = logn; u > 1; u --) { - size_t hm, dt, i1, j1; - - hm = m >> 1; - dt = t << 1; - for (i1 = 0, j1 = 0; j1 < hn; i1 ++, j1 += dt) { - size_t j, j2; - - j2 = j1 + t; -#if FALCON_AVX2 // yyyAVX2+1 - if (t >= 4) { - __m256d s_re, s_im; - - s_re = _mm256_set1_pd( - fpr_gm_tab[((hm + i1) << 1) + 0].v); - s_im = _mm256_set1_pd( - fpr_gm_tab[((hm + i1) << 1) + 1].v); - for (j = j1; j < j2; j += 4) { - __m256d x_re, x_im, y_re, y_im; - __m256d z_re, z_im; - - x_re = _mm256_loadu_pd(&f[j].v); - x_im = _mm256_loadu_pd(&f[j + hn].v); - y_re = _mm256_loadu_pd(&f[j+t].v); - y_im = _mm256_loadu_pd(&f[j+t + hn].v); - _mm256_storeu_pd(&f[j].v, - _mm256_add_pd(x_re, y_re)); - _mm256_storeu_pd(&f[j + hn].v, - _mm256_add_pd(x_im, y_im)); - x_re = _mm256_sub_pd(y_re, x_re); - x_im = _mm256_sub_pd(x_im, y_im); - z_re = FMSUB(x_im, s_im, - _mm256_mul_pd(x_re, s_re)); - z_im = FMADD(x_re, s_im, - _mm256_mul_pd(x_im, s_re)); - _mm256_storeu_pd(&f[j+t].v, z_re); - _mm256_storeu_pd(&f[j+t + hn].v, z_im); - } - } else { - fpr s_re, s_im; - - s_re = fpr_gm_tab[((hm + i1) << 1)+0]; - s_im = fpr_neg(fpr_gm_tab[((hm + i1) << 1)+1]); - for (j = j1; j < j2; j ++) { - fpr x_re, x_im, y_re, y_im; - - x_re = f[j]; - x_im = f[j + hn]; - y_re = f[j + t]; - y_im = f[j + t + hn]; - FPC_ADD(f[j], f[j + hn], - x_re, x_im, y_re, y_im); - FPC_SUB(x_re, x_im, - x_re, x_im, y_re, y_im); - FPC_MUL(f[j + t], f[j + t + hn], - x_re, x_im, s_re, s_im); - } - } -#else // yyyAVX2+0 - fpr s_re, s_im; - - s_re = fpr_gm_tab[((hm + i1) << 1) + 0]; - s_im = fpr_neg(fpr_gm_tab[((hm + i1) << 1) + 1]); - for (j = j1; j < j2; j ++) { - fpr x_re, x_im, y_re, y_im; - - x_re = f[j]; - x_im = f[j + hn]; - y_re = f[j + t]; - y_im = f[j + t + hn]; - FPC_ADD(f[j], f[j + hn], - x_re, x_im, y_re, y_im); - FPC_SUB(x_re, x_im, x_re, x_im, y_re, y_im); - FPC_MUL(f[j + t], f[j + t + hn], - x_re, x_im, s_re, s_im); - } -#endif // yyyAVX2- - } 
- t = dt; - m = hm; - } - - /* - * Last iteration is a no-op, provided that we divide by N/2 - * instead of N. We need to make a special case for logn = 0. - */ - if (logn > 0) { - fpr ni; - - ni = fpr_p2_tab[logn]; - for (u = 0; u < n; u ++) { - f[u] = fpr_mul(f[u], ni); - } - } -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_add)( - fpr *restrict a, const fpr *restrict b, unsigned logn) -{ - size_t n, u; - - n = (size_t)1 << logn; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 4) { - for (u = 0; u < n; u += 4) { - _mm256_storeu_pd(&a[u].v, - _mm256_add_pd( - _mm256_loadu_pd(&a[u].v), - _mm256_loadu_pd(&b[u].v))); - } - } else { - for (u = 0; u < n; u ++) { - a[u] = fpr_add(a[u], b[u]); - } - } -#else // yyyAVX2+0 - for (u = 0; u < n; u ++) { - a[u] = fpr_add(a[u], b[u]); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_sub)( - fpr *restrict a, const fpr *restrict b, unsigned logn) -{ - size_t n, u; - - n = (size_t)1 << logn; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 4) { - for (u = 0; u < n; u += 4) { - _mm256_storeu_pd(&a[u].v, - _mm256_sub_pd( - _mm256_loadu_pd(&a[u].v), - _mm256_loadu_pd(&b[u].v))); - } - } else { - for (u = 0; u < n; u ++) { - a[u] = fpr_sub(a[u], b[u]); - } - } -#else // yyyAVX2+0 - for (u = 0; u < n; u ++) { - a[u] = fpr_sub(a[u], b[u]); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_neg)(fpr *a, unsigned logn) -{ - size_t n, u; - - n = (size_t)1 << logn; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 4) { - __m256d s; - - s = _mm256_set1_pd(-0.0); - for (u = 0; u < n; u += 4) { - _mm256_storeu_pd(&a[u].v, - _mm256_xor_pd(_mm256_loadu_pd(&a[u].v), s)); - } - } else { - for (u = 0; u < n; u ++) { - a[u] = fpr_neg(a[u]); - } - } -#else // yyyAVX2+0 - for (u = 0; u < n; u ++) { - a[u] = fpr_neg(a[u]); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_adj_fft)(fpr *a, unsigned logn) -{ - size_t n, u; - - n = (size_t)1 << logn; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { 
- __m256d s; - - s = _mm256_set1_pd(-0.0); - for (u = (n >> 1); u < n; u += 4) { - _mm256_storeu_pd(&a[u].v, - _mm256_xor_pd(_mm256_loadu_pd(&a[u].v), s)); - } - } else { - for (u = (n >> 1); u < n; u ++) { - a[u] = fpr_neg(a[u]); - } - } -#else // yyyAVX2+0 - for (u = (n >> 1); u < n; u ++) { - a[u] = fpr_neg(a[u]); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_mul_fft)( - fpr *restrict a, const fpr *restrict b, unsigned logn) -{ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - for (u = 0; u < hn; u += 4) { - __m256d a_re, a_im, b_re, b_im, c_re, c_im; - - a_re = _mm256_loadu_pd(&a[u].v); - a_im = _mm256_loadu_pd(&a[u + hn].v); - b_re = _mm256_loadu_pd(&b[u].v); - b_im = _mm256_loadu_pd(&b[u + hn].v); - c_re = FMSUB( - a_re, b_re, _mm256_mul_pd(a_im, b_im)); - c_im = FMADD( - a_re, b_im, _mm256_mul_pd(a_im, b_re)); - _mm256_storeu_pd(&a[u].v, c_re); - _mm256_storeu_pd(&a[u + hn].v, c_im); - } - } else { - for (u = 0; u < hn; u ++) { - fpr a_re, a_im, b_re, b_im; - - a_re = a[u]; - a_im = a[u + hn]; - b_re = b[u]; - b_im = b[u + hn]; - FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im); - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - fpr a_re, a_im, b_re, b_im; - - a_re = a[u]; - a_im = a[u + hn]; - b_re = b[u]; - b_im = b[u + hn]; - FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_muladj_fft)( - fpr *restrict a, const fpr *restrict b, unsigned logn) -{ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - for (u = 0; u < hn; u += 4) { - __m256d a_re, a_im, b_re, b_im, c_re, c_im; - - a_re = _mm256_loadu_pd(&a[u].v); - a_im = _mm256_loadu_pd(&a[u + hn].v); - b_re = _mm256_loadu_pd(&b[u].v); - b_im = _mm256_loadu_pd(&b[u + hn].v); - c_re = FMADD( - a_re, b_re, _mm256_mul_pd(a_im, b_im)); - c_im = FMSUB( - a_im, b_re, _mm256_mul_pd(a_re, 
b_im)); - _mm256_storeu_pd(&a[u].v, c_re); - _mm256_storeu_pd(&a[u + hn].v, c_im); - } - } else { - for (u = 0; u < hn; u ++) { - fpr a_re, a_im, b_re, b_im; - - a_re = a[u]; - a_im = a[u + hn]; - b_re = b[u]; - b_im = fpr_neg(b[u + hn]); - FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im); - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - fpr a_re, a_im, b_re, b_im; - - a_re = a[u]; - a_im = a[u + hn]; - b_re = b[u]; - b_im = fpr_neg(b[u + hn]); - FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_mulselfadj_fft)(fpr *a, unsigned logn) -{ - /* - * Since each coefficient is multiplied with its own conjugate, - * the result contains only real values. - */ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - __m256d zero; - - zero = _mm256_setzero_pd(); - for (u = 0; u < hn; u += 4) { - __m256d a_re, a_im; - - a_re = _mm256_loadu_pd(&a[u].v); - a_im = _mm256_loadu_pd(&a[u + hn].v); - _mm256_storeu_pd(&a[u].v, - FMADD(a_re, a_re, - _mm256_mul_pd(a_im, a_im))); - _mm256_storeu_pd(&a[u + hn].v, zero); - } - } else { - for (u = 0; u < hn; u ++) { - fpr a_re, a_im; - - a_re = a[u]; - a_im = a[u + hn]; - a[u] = fpr_add(fpr_sqr(a_re), fpr_sqr(a_im)); - a[u + hn] = fpr_zero; - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - fpr a_re, a_im; - - a_re = a[u]; - a_im = a[u + hn]; - a[u] = fpr_add(fpr_sqr(a_re), fpr_sqr(a_im)); - a[u + hn] = fpr_zero; - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_mulconst)(fpr *a, fpr x, unsigned logn) -{ - size_t n, u; - - n = (size_t)1 << logn; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 4) { - __m256d x4; - - x4 = _mm256_set1_pd(x.v); - for (u = 0; u < n; u += 4) { - _mm256_storeu_pd(&a[u].v, - _mm256_mul_pd(x4, _mm256_loadu_pd(&a[u].v))); - } - } else { - for (u = 0; u < n; u ++) { - a[u] = fpr_mul(a[u], x); - } - } -#else // yyyAVX2+0 - for (u = 0; u < n; u ++) { - 
a[u] = fpr_mul(a[u], x); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_div_fft)( - fpr *restrict a, const fpr *restrict b, unsigned logn) -{ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - __m256d one; - - one = _mm256_set1_pd(1.0); - for (u = 0; u < hn; u += 4) { - __m256d a_re, a_im, b_re, b_im, c_re, c_im, t; - - a_re = _mm256_loadu_pd(&a[u].v); - a_im = _mm256_loadu_pd(&a[u + hn].v); - b_re = _mm256_loadu_pd(&b[u].v); - b_im = _mm256_loadu_pd(&b[u + hn].v); - t = _mm256_div_pd(one, - FMADD(b_re, b_re, - _mm256_mul_pd(b_im, b_im))); - b_re = _mm256_mul_pd(b_re, t); - b_im = _mm256_mul_pd(b_im, t); - c_re = FMADD( - a_re, b_re, _mm256_mul_pd(a_im, b_im)); - c_im = FMSUB( - a_im, b_re, _mm256_mul_pd(a_re, b_im)); - _mm256_storeu_pd(&a[u].v, c_re); - _mm256_storeu_pd(&a[u + hn].v, c_im); - } - } else { - for (u = 0; u < hn; u ++) { - fpr a_re, a_im, b_re, b_im; - - a_re = a[u]; - a_im = a[u + hn]; - b_re = b[u]; - b_im = b[u + hn]; - FPC_DIV(a[u], a[u + hn], a_re, a_im, b_re, b_im); - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - fpr a_re, a_im, b_re, b_im; - - a_re = a[u]; - a_im = a[u + hn]; - b_re = b[u]; - b_im = b[u + hn]; - FPC_DIV(a[u], a[u + hn], a_re, a_im, b_re, b_im); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_invnorm2_fft)(fpr *restrict d, - const fpr *restrict a, const fpr *restrict b, unsigned logn) -{ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - __m256d one; - - one = _mm256_set1_pd(1.0); - for (u = 0; u < hn; u += 4) { - __m256d a_re, a_im, b_re, b_im, dv; - - a_re = _mm256_loadu_pd(&a[u].v); - a_im = _mm256_loadu_pd(&a[u + hn].v); - b_re = _mm256_loadu_pd(&b[u].v); - b_im = _mm256_loadu_pd(&b[u + hn].v); - dv = _mm256_div_pd(one, - _mm256_add_pd( - FMADD(a_re, a_re, - _mm256_mul_pd(a_im, a_im)), - FMADD(b_re, b_re, - _mm256_mul_pd(b_im, b_im)))); - 
_mm256_storeu_pd(&d[u].v, dv); - } - } else { - for (u = 0; u < hn; u ++) { - fpr a_re, a_im; - fpr b_re, b_im; - - a_re = a[u]; - a_im = a[u + hn]; - b_re = b[u]; - b_im = b[u + hn]; - d[u] = fpr_inv(fpr_add( - fpr_add(fpr_sqr(a_re), fpr_sqr(a_im)), - fpr_add(fpr_sqr(b_re), fpr_sqr(b_im)))); - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - fpr a_re, a_im; - fpr b_re, b_im; - - a_re = a[u]; - a_im = a[u + hn]; - b_re = b[u]; - b_im = b[u + hn]; - d[u] = fpr_inv(fpr_add( - fpr_add(fpr_sqr(a_re), fpr_sqr(a_im)), - fpr_add(fpr_sqr(b_re), fpr_sqr(b_im)))); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_add_muladj_fft)(fpr *restrict d, - const fpr *restrict F, const fpr *restrict G, - const fpr *restrict f, const fpr *restrict g, unsigned logn) -{ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - for (u = 0; u < hn; u += 4) { - __m256d F_re, F_im, G_re, G_im; - __m256d f_re, f_im, g_re, g_im; - __m256d a_re, a_im, b_re, b_im; - - F_re = _mm256_loadu_pd(&F[u].v); - F_im = _mm256_loadu_pd(&F[u + hn].v); - G_re = _mm256_loadu_pd(&G[u].v); - G_im = _mm256_loadu_pd(&G[u + hn].v); - f_re = _mm256_loadu_pd(&f[u].v); - f_im = _mm256_loadu_pd(&f[u + hn].v); - g_re = _mm256_loadu_pd(&g[u].v); - g_im = _mm256_loadu_pd(&g[u + hn].v); - - a_re = FMADD(F_re, f_re, - _mm256_mul_pd(F_im, f_im)); - a_im = FMSUB(F_im, f_re, - _mm256_mul_pd(F_re, f_im)); - b_re = FMADD(G_re, g_re, - _mm256_mul_pd(G_im, g_im)); - b_im = FMSUB(G_im, g_re, - _mm256_mul_pd(G_re, g_im)); - _mm256_storeu_pd(&d[u].v, - _mm256_add_pd(a_re, b_re)); - _mm256_storeu_pd(&d[u + hn].v, - _mm256_add_pd(a_im, b_im)); - } - } else { - for (u = 0; u < hn; u ++) { - fpr F_re, F_im, G_re, G_im; - fpr f_re, f_im, g_re, g_im; - fpr a_re, a_im, b_re, b_im; - - F_re = F[u]; - F_im = F[u + hn]; - G_re = G[u]; - G_im = G[u + hn]; - f_re = f[u]; - f_im = f[u + hn]; - g_re = g[u]; - g_im = g[u + hn]; - - FPC_MUL(a_re, a_im, F_re, F_im, 
f_re, fpr_neg(f_im)); - FPC_MUL(b_re, b_im, G_re, G_im, g_re, fpr_neg(g_im)); - d[u] = fpr_add(a_re, b_re); - d[u + hn] = fpr_add(a_im, b_im); - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - fpr F_re, F_im, G_re, G_im; - fpr f_re, f_im, g_re, g_im; - fpr a_re, a_im, b_re, b_im; - - F_re = F[u]; - F_im = F[u + hn]; - G_re = G[u]; - G_im = G[u + hn]; - f_re = f[u]; - f_im = f[u + hn]; - g_re = g[u]; - g_im = g[u + hn]; - - FPC_MUL(a_re, a_im, F_re, F_im, f_re, fpr_neg(f_im)); - FPC_MUL(b_re, b_im, G_re, G_im, g_re, fpr_neg(g_im)); - d[u] = fpr_add(a_re, b_re); - d[u + hn] = fpr_add(a_im, b_im); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_mul_autoadj_fft)( - fpr *restrict a, const fpr *restrict b, unsigned logn) -{ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - for (u = 0; u < hn; u += 4) { - __m256d a_re, a_im, bv; - - a_re = _mm256_loadu_pd(&a[u].v); - a_im = _mm256_loadu_pd(&a[u + hn].v); - bv = _mm256_loadu_pd(&b[u].v); - _mm256_storeu_pd(&a[u].v, - _mm256_mul_pd(a_re, bv)); - _mm256_storeu_pd(&a[u + hn].v, - _mm256_mul_pd(a_im, bv)); - } - } else { - for (u = 0; u < hn; u ++) { - a[u] = fpr_mul(a[u], b[u]); - a[u + hn] = fpr_mul(a[u + hn], b[u]); - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - a[u] = fpr_mul(a[u], b[u]); - a[u + hn] = fpr_mul(a[u + hn], b[u]); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_div_autoadj_fft)( - fpr *restrict a, const fpr *restrict b, unsigned logn) -{ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - __m256d one; - - one = _mm256_set1_pd(1.0); - for (u = 0; u < hn; u += 4) { - __m256d ib, a_re, a_im; - - ib = _mm256_div_pd(one, _mm256_loadu_pd(&b[u].v)); - a_re = _mm256_loadu_pd(&a[u].v); - a_im = _mm256_loadu_pd(&a[u + hn].v); - _mm256_storeu_pd(&a[u].v, _mm256_mul_pd(a_re, ib)); - _mm256_storeu_pd(&a[u + hn].v, 
_mm256_mul_pd(a_im, ib)); - } - } else { - for (u = 0; u < hn; u ++) { - fpr ib; - - ib = fpr_inv(b[u]); - a[u] = fpr_mul(a[u], ib); - a[u + hn] = fpr_mul(a[u + hn], ib); - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - fpr ib; - - ib = fpr_inv(b[u]); - a[u] = fpr_mul(a[u], ib); - a[u + hn] = fpr_mul(a[u + hn], ib); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_LDL_fft)( - const fpr *restrict g00, - fpr *restrict g01, fpr *restrict g11, unsigned logn) -{ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - __m256d one; - - one = _mm256_set1_pd(1.0); - for (u = 0; u < hn; u += 4) { - __m256d g00_re, g00_im, g01_re, g01_im, g11_re, g11_im; - __m256d t, mu_re, mu_im, xi_re, xi_im; - - g00_re = _mm256_loadu_pd(&g00[u].v); - g00_im = _mm256_loadu_pd(&g00[u + hn].v); - g01_re = _mm256_loadu_pd(&g01[u].v); - g01_im = _mm256_loadu_pd(&g01[u + hn].v); - g11_re = _mm256_loadu_pd(&g11[u].v); - g11_im = _mm256_loadu_pd(&g11[u + hn].v); - - t = _mm256_div_pd(one, - FMADD(g00_re, g00_re, - _mm256_mul_pd(g00_im, g00_im))); - g00_re = _mm256_mul_pd(g00_re, t); - g00_im = _mm256_mul_pd(g00_im, t); - mu_re = FMADD(g01_re, g00_re, - _mm256_mul_pd(g01_im, g00_im)); - mu_im = FMSUB(g01_re, g00_im, - _mm256_mul_pd(g01_im, g00_re)); - xi_re = FMSUB(mu_re, g01_re, - _mm256_mul_pd(mu_im, g01_im)); - xi_im = FMADD(mu_im, g01_re, - _mm256_mul_pd(mu_re, g01_im)); - _mm256_storeu_pd(&g11[u].v, - _mm256_sub_pd(g11_re, xi_re)); - _mm256_storeu_pd(&g11[u + hn].v, - _mm256_add_pd(g11_im, xi_im)); - _mm256_storeu_pd(&g01[u].v, mu_re); - _mm256_storeu_pd(&g01[u + hn].v, mu_im); - } - } else { - for (u = 0; u < hn; u ++) { - fpr g00_re, g00_im, g01_re, g01_im, g11_re, g11_im; - fpr mu_re, mu_im; - - g00_re = g00[u]; - g00_im = g00[u + hn]; - g01_re = g01[u]; - g01_im = g01[u + hn]; - g11_re = g11[u]; - g11_im = g11[u + hn]; - FPC_DIV(mu_re, mu_im, g01_re, g01_im, g00_re, g00_im); - FPC_MUL(g01_re, g01_im, 
- mu_re, mu_im, g01_re, fpr_neg(g01_im)); - FPC_SUB(g11[u], g11[u + hn], - g11_re, g11_im, g01_re, g01_im); - g01[u] = mu_re; - g01[u + hn] = fpr_neg(mu_im); - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - fpr g00_re, g00_im, g01_re, g01_im, g11_re, g11_im; - fpr mu_re, mu_im; - - g00_re = g00[u]; - g00_im = g00[u + hn]; - g01_re = g01[u]; - g01_im = g01[u + hn]; - g11_re = g11[u]; - g11_im = g11[u + hn]; - FPC_DIV(mu_re, mu_im, g01_re, g01_im, g00_re, g00_im); - FPC_MUL(g01_re, g01_im, mu_re, mu_im, g01_re, fpr_neg(g01_im)); - FPC_SUB(g11[u], g11[u + hn], g11_re, g11_im, g01_re, g01_im); - g01[u] = mu_re; - g01[u + hn] = fpr_neg(mu_im); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_LDLmv_fft)( - fpr *restrict d11, fpr *restrict l10, - const fpr *restrict g00, const fpr *restrict g01, - const fpr *restrict g11, unsigned logn) -{ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - __m256d one; - - one = _mm256_set1_pd(1.0); - for (u = 0; u < hn; u += 4) { - __m256d g00_re, g00_im, g01_re, g01_im, g11_re, g11_im; - __m256d t, mu_re, mu_im, xi_re, xi_im; - - g00_re = _mm256_loadu_pd(&g00[u].v); - g00_im = _mm256_loadu_pd(&g00[u + hn].v); - g01_re = _mm256_loadu_pd(&g01[u].v); - g01_im = _mm256_loadu_pd(&g01[u + hn].v); - g11_re = _mm256_loadu_pd(&g11[u].v); - g11_im = _mm256_loadu_pd(&g11[u + hn].v); - - t = _mm256_div_pd(one, - FMADD(g00_re, g00_re, - _mm256_mul_pd(g00_im, g00_im))); - g00_re = _mm256_mul_pd(g00_re, t); - g00_im = _mm256_mul_pd(g00_im, t); - mu_re = FMADD(g01_re, g00_re, - _mm256_mul_pd(g01_im, g00_im)); - mu_im = FMSUB(g01_re, g00_im, - _mm256_mul_pd(g01_im, g00_re)); - xi_re = FMSUB(mu_re, g01_re, - _mm256_mul_pd(mu_im, g01_im)); - xi_im = FMADD(mu_im, g01_re, - _mm256_mul_pd(mu_re, g01_im)); - _mm256_storeu_pd(&d11[u].v, - _mm256_sub_pd(g11_re, xi_re)); - _mm256_storeu_pd(&d11[u + hn].v, - _mm256_add_pd(g11_im, xi_im)); - _mm256_storeu_pd(&l10[u].v, 
mu_re); - _mm256_storeu_pd(&l10[u + hn].v, mu_im); - } - } else { - for (u = 0; u < hn; u ++) { - fpr g00_re, g00_im, g01_re, g01_im, g11_re, g11_im; - fpr mu_re, mu_im; - - g00_re = g00[u]; - g00_im = g00[u + hn]; - g01_re = g01[u]; - g01_im = g01[u + hn]; - g11_re = g11[u]; - g11_im = g11[u + hn]; - FPC_DIV(mu_re, mu_im, g01_re, g01_im, g00_re, g00_im); - FPC_MUL(g01_re, g01_im, - mu_re, mu_im, g01_re, fpr_neg(g01_im)); - FPC_SUB(d11[u], d11[u + hn], - g11_re, g11_im, g01_re, g01_im); - l10[u] = mu_re; - l10[u + hn] = fpr_neg(mu_im); - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - fpr g00_re, g00_im, g01_re, g01_im, g11_re, g11_im; - fpr mu_re, mu_im; - - g00_re = g00[u]; - g00_im = g00[u + hn]; - g01_re = g01[u]; - g01_im = g01[u + hn]; - g11_re = g11[u]; - g11_im = g11[u + hn]; - FPC_DIV(mu_re, mu_im, g01_re, g01_im, g00_re, g00_im); - FPC_MUL(g01_re, g01_im, mu_re, mu_im, g01_re, fpr_neg(g01_im)); - FPC_SUB(d11[u], d11[u + hn], g11_re, g11_im, g01_re, g01_im); - l10[u] = mu_re; - l10[u + hn] = fpr_neg(mu_im); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_split_fft)( - fpr *restrict f0, fpr *restrict f1, - const fpr *restrict f, unsigned logn) -{ - /* - * The FFT representation we use is in bit-reversed order - * (element i contains f(w^(rev(i))), where rev() is the - * bit-reversal function over the ring degree. This changes - * indexes with regards to the Falcon specification. 
- */ - size_t n, hn, qn, u; - - n = (size_t)1 << logn; - hn = n >> 1; - qn = hn >> 1; - -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - __m256d half, sv; - - half = _mm256_set1_pd(0.5); - sv = _mm256_set_pd(-0.0, 0.0, -0.0, 0.0); - for (u = 0; u < qn; u += 2) { - __m256d ab_re, ab_im, ff0, ff1, ff2, ff3, gmt; - - ab_re = _mm256_loadu_pd(&f[(u << 1)].v); - ab_im = _mm256_loadu_pd(&f[(u << 1) + hn].v); - ff0 = _mm256_mul_pd(_mm256_hadd_pd(ab_re, ab_im), half); - ff0 = _mm256_permute4x64_pd(ff0, 0xD8); - _mm_storeu_pd(&f0[u].v, - _mm256_extractf128_pd(ff0, 0)); - _mm_storeu_pd(&f0[u + qn].v, - _mm256_extractf128_pd(ff0, 1)); - - ff1 = _mm256_mul_pd(_mm256_hsub_pd(ab_re, ab_im), half); - gmt = _mm256_loadu_pd(&fpr_gm_tab[(u + hn) << 1].v); - ff2 = _mm256_shuffle_pd(ff1, ff1, 0x5); - ff3 = _mm256_hadd_pd( - _mm256_mul_pd(ff1, gmt), - _mm256_xor_pd(_mm256_mul_pd(ff2, gmt), sv)); - ff3 = _mm256_permute4x64_pd(ff3, 0xD8); - _mm_storeu_pd(&f1[u].v, - _mm256_extractf128_pd(ff3, 0)); - _mm_storeu_pd(&f1[u + qn].v, - _mm256_extractf128_pd(ff3, 1)); - } - } else { - f0[0] = f[0]; - f1[0] = f[hn]; - - for (u = 0; u < qn; u ++) { - fpr a_re, a_im, b_re, b_im; - fpr t_re, t_im; - - a_re = f[(u << 1) + 0]; - a_im = f[(u << 1) + 0 + hn]; - b_re = f[(u << 1) + 1]; - b_im = f[(u << 1) + 1 + hn]; - - FPC_ADD(t_re, t_im, a_re, a_im, b_re, b_im); - f0[u] = fpr_half(t_re); - f0[u + qn] = fpr_half(t_im); - - FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im); - FPC_MUL(t_re, t_im, t_re, t_im, - fpr_gm_tab[((u + hn) << 1) + 0], - fpr_neg(fpr_gm_tab[((u + hn) << 1) + 1])); - f1[u] = fpr_half(t_re); - f1[u + qn] = fpr_half(t_im); - } - } -#else // yyyAVX2+0 - /* - * We process complex values by pairs. For logn = 1, there is only - * one complex value (the other one is the implicit conjugate), - * so we add the two lines below because the loop will be - * skipped. 
- */ - f0[0] = f[0]; - f1[0] = f[hn]; - - for (u = 0; u < qn; u ++) { - fpr a_re, a_im, b_re, b_im; - fpr t_re, t_im; - - a_re = f[(u << 1) + 0]; - a_im = f[(u << 1) + 0 + hn]; - b_re = f[(u << 1) + 1]; - b_im = f[(u << 1) + 1 + hn]; - - FPC_ADD(t_re, t_im, a_re, a_im, b_re, b_im); - f0[u] = fpr_half(t_re); - f0[u + qn] = fpr_half(t_im); - - FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im); - FPC_MUL(t_re, t_im, t_re, t_im, - fpr_gm_tab[((u + hn) << 1) + 0], - fpr_neg(fpr_gm_tab[((u + hn) << 1) + 1])); - f1[u] = fpr_half(t_re); - f1[u + qn] = fpr_half(t_im); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_merge_fft)( - fpr *restrict f, - const fpr *restrict f0, const fpr *restrict f1, unsigned logn) -{ - size_t n, hn, qn, u; - - n = (size_t)1 << logn; - hn = n >> 1; - qn = hn >> 1; - -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 16) { - for (u = 0; u < qn; u += 4) { - __m256d a_re, a_im, b_re, b_im, c_re, c_im; - __m256d gm1, gm2, g_re, g_im; - __m256d t_re, t_im, u_re, u_im; - __m256d tu1_re, tu2_re, tu1_im, tu2_im; - - a_re = _mm256_loadu_pd(&f0[u].v); - a_im = _mm256_loadu_pd(&f0[u + qn].v); - c_re = _mm256_loadu_pd(&f1[u].v); - c_im = _mm256_loadu_pd(&f1[u + qn].v); - - gm1 = _mm256_loadu_pd(&fpr_gm_tab[(u + hn) << 1].v); - gm2 = _mm256_loadu_pd(&fpr_gm_tab[(u + 2 + hn) << 1].v); - g_re = _mm256_unpacklo_pd(gm1, gm2); - g_im = _mm256_unpackhi_pd(gm1, gm2); - g_re = _mm256_permute4x64_pd(g_re, 0xD8); - g_im = _mm256_permute4x64_pd(g_im, 0xD8); - - b_re = FMSUB( - c_re, g_re, _mm256_mul_pd(c_im, g_im)); - b_im = FMADD( - c_re, g_im, _mm256_mul_pd(c_im, g_re)); - - t_re = _mm256_add_pd(a_re, b_re); - t_im = _mm256_add_pd(a_im, b_im); - u_re = _mm256_sub_pd(a_re, b_re); - u_im = _mm256_sub_pd(a_im, b_im); - - tu1_re = _mm256_unpacklo_pd(t_re, u_re); - tu2_re = _mm256_unpackhi_pd(t_re, u_re); - tu1_im = _mm256_unpacklo_pd(t_im, u_im); - tu2_im = _mm256_unpackhi_pd(t_im, u_im); - _mm256_storeu_pd(&f[(u << 1)].v, - _mm256_permute2f128_pd(tu1_re, 
tu2_re, 0x20)); - _mm256_storeu_pd(&f[(u << 1) + 4].v, - _mm256_permute2f128_pd(tu1_re, tu2_re, 0x31)); - _mm256_storeu_pd(&f[(u << 1) + hn].v, - _mm256_permute2f128_pd(tu1_im, tu2_im, 0x20)); - _mm256_storeu_pd(&f[(u << 1) + 4 + hn].v, - _mm256_permute2f128_pd(tu1_im, tu2_im, 0x31)); - } - } else { - f[0] = f0[0]; - f[hn] = f1[0]; - - for (u = 0; u < qn; u ++) { - fpr a_re, a_im, b_re, b_im; - fpr t_re, t_im; - - a_re = f0[u]; - a_im = f0[u + qn]; - FPC_MUL(b_re, b_im, f1[u], f1[u + qn], - fpr_gm_tab[((u + hn) << 1) + 0], - fpr_gm_tab[((u + hn) << 1) + 1]); - FPC_ADD(t_re, t_im, a_re, a_im, b_re, b_im); - f[(u << 1) + 0] = t_re; - f[(u << 1) + 0 + hn] = t_im; - FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im); - f[(u << 1) + 1] = t_re; - f[(u << 1) + 1 + hn] = t_im; - } - } -#else // yyyAVX2+0 - /* - * An extra copy to handle the special case logn = 1. - */ - f[0] = f0[0]; - f[hn] = f1[0]; - - for (u = 0; u < qn; u ++) { - fpr a_re, a_im, b_re, b_im; - fpr t_re, t_im; - - a_re = f0[u]; - a_im = f0[u + qn]; - FPC_MUL(b_re, b_im, f1[u], f1[u + qn], - fpr_gm_tab[((u + hn) << 1) + 0], - fpr_gm_tab[((u + hn) << 1) + 1]); - FPC_ADD(t_re, t_im, a_re, a_im, b_re, b_im); - f[(u << 1) + 0] = t_re; - f[(u << 1) + 0 + hn] = t_im; - FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im); - f[(u << 1) + 1] = t_re; - f[(u << 1) + 1 + hn] = t_im; - } -#endif // yyyAVX2- -} diff --git a/crypto_sign/falcon-1024/m4-ct/fpr.c b/crypto_sign/falcon-1024/m4-ct/fpr.c deleted file mode 100644 index eb23a44b..00000000 --- a/crypto_sign/falcon-1024/m4-ct/fpr.c +++ /dev/null @@ -1,3460 +0,0 @@ -/* - * Floating-point operations. - * - * This file implements the non-inline functions declared in - * fpr.h, as well as the constants for FFT / iFFT. 
- * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include "inner.h" - -#if FALCON_FPEMU // yyyFPEMU+1 - -/* - * Normalize a provided unsigned integer to the 2^63..2^64-1 range by - * left-shifting it if necessary. The exponent e is adjusted accordingly - * (i.e. if the value was left-shifted by n bits, then n is subtracted - * from e). If source m is 0, then it remains 0, but e is altered. - * Both m and e must be simple variables (no expressions allowed). 
- */ -#define FPR_NORM64(m, e) do { \ - uint32_t nt; \ - \ - (e) -= 63; \ - \ - nt = (uint32_t)((m) >> 32); \ - nt = (nt | -nt) >> 31; \ - (m) ^= ((m) ^ ((m) << 32)) & ((uint64_t)nt - 1); \ - (e) += (int)(nt << 5); \ - \ - nt = (uint32_t)((m) >> 48); \ - nt = (nt | -nt) >> 31; \ - (m) ^= ((m) ^ ((m) << 16)) & ((uint64_t)nt - 1); \ - (e) += (int)(nt << 4); \ - \ - nt = (uint32_t)((m) >> 56); \ - nt = (nt | -nt) >> 31; \ - (m) ^= ((m) ^ ((m) << 8)) & ((uint64_t)nt - 1); \ - (e) += (int)(nt << 3); \ - \ - nt = (uint32_t)((m) >> 60); \ - nt = (nt | -nt) >> 31; \ - (m) ^= ((m) ^ ((m) << 4)) & ((uint64_t)nt - 1); \ - (e) += (int)(nt << 2); \ - \ - nt = (uint32_t)((m) >> 62); \ - nt = (nt | -nt) >> 31; \ - (m) ^= ((m) ^ ((m) << 2)) & ((uint64_t)nt - 1); \ - (e) += (int)(nt << 1); \ - \ - nt = (uint32_t)((m) >> 63); \ - (m) ^= ((m) ^ ((m) << 1)) & ((uint64_t)nt - 1); \ - (e) += (int)(nt); \ - } while (0) - -#if FALCON_ASM_CORTEXM4 // yyyASM_CORTEXM4+1 - -__attribute__((naked)) -fpr -fpr_scaled(int64_t i __attribute__((unused)), int sc __attribute__((unused))) -{ - __asm__ ( - "push { r4, r5, r6, lr }\n\t" - "\n\t" - "@ Input i is in r0:r1, and sc in r2.\n\t" - "@ Extract the sign bit, and compute the absolute value.\n\t" - "@ -> sign bit in r3, with value 0 or -1\n\t" - "asrs r3, r1, #31\n\t" - "eors r0, r3\n\t" - "eors r1, r3\n\t" - "subs r0, r3\n\t" - "sbcs r1, r3\n\t" - "\n\t" - "@ Scale exponent to account for the encoding; if the source is\n\t" - "@ zero or if the scaled exponent is negative, it is set to 32.\n\t" - "addw r2, r2, #1022\n\t" - "orrs r4, r0, r1\n\t" - "bics r4, r4, r2, asr #31\n\t" - "rsbs r5, r4, #0\n\t" - "orrs r4, r5\n\t" - "ands r2, r2, r4, asr #31\n\t" - "adds r2, #32\n\t" - "\n\t" - "@ Normalize value to a full 64-bit width, by shifting it left.\n\t" - "@ The shift count is subtracted from the exponent (in r2).\n\t" - "@ If the mantissa is 0, the exponent is set to 0.\n\t" - "\n\t" - "@ If top word is 0, replace with low word; otherwise, add 32 
to\n\t" - "@ the exponent.\n\t" - "rsbs r4, r1, #0\n\t" - "orrs r4, r1\n\t" - "eors r5, r0, r1\n\t" - "bics r5, r5, r4, asr #31\n\t" - "eors r1, r5\n\t" - "ands r0, r0, r4, asr #31\n\t" - "lsrs r4, r4, #31\n\t" - "adds r2, r2, r4, lsl #5\n\t" - "\n\t" - "@ Count leading zeros of r1 to finish the shift.\n\t" - "clz r4, r1\n\t" - "subs r2, r4\n\t" - "rsbs r5, r4, #32\n\t" - "lsls r1, r4\n\t" - "lsrs r5, r0, r5\n\t" - "lsls r0, r4\n\t" - "orrs r1, r5\n\t" - "\n\t" - "@ Clear the top bit; we know it's a 1 (unless the whole mantissa\n\t" - "@ was zero, but then it's still OK to clear it)\n\t" - "bfc r1, #31, #1\n\t" - "\n\t" - "@ Now shift right the value by 11 bits; this puts the value in\n\t" - "@ the 2^52..2^53-1 range. We also keep a copy of the pre-shift\n\t" - "@ low bits in r5.\n\t" - "movs r5, r0\n\t" - "lsrs r0, #11\n\t" - "orrs r0, r0, r1, lsl #21\n\t" - "lsrs r1, #11\n\t" - "\n\t" - "@ Also plug the exponent at the right place. This must be done\n\t" - "@ now so that, in case the rounding creates a carry, that carry\n\t" - "@ adds to the exponent, which would be exactly what we want at\n\t" - "@ that point.\n\t" - "orrs r1, r1, r2, lsl #20\n\t" - "\n\t" - "@ Rounding: we must add 1 to the mantissa in the following cases:\n\t" - "@ - bits 11 to 9 of r5 are '011', '110' or '111'\n\t" - "@ - bits 11 to 9 of r5 are '010' and one of the\n\t" - "@ bits 0 to 8 is non-zero\n\t" - "ubfx r6, r5, #0, #9\n\t" - "addw r6, r6, #511\n\t" - "orrs r5, r6\n\t" - "\n\t" - "ubfx r5, r5, #9, #3\n\t" - "movs r6, #0xC8\n\t" - "lsrs r6, r5\n\t" - "ands r6, #1\n\t" - "adds r0, r6\n\t" - "adcs r1, #0\n\t" - "\n\t" - "@ Put back the sign.\n\t" - "orrs r1, r1, r3, lsl #31\n\t" - "\n\t" - "pop { r4, r5, r6, pc}\n\t" - ); -} - -#else // yyyASM_CORTEXM4+0 - -fpr -fpr_scaled(int64_t i, int sc) -{ - /* - * To convert from int to float, we have to do the following: - * 1. Get the absolute value of the input, and its sign - * 2. Shift right or left the value as appropriate - * 3. 
Pack the result - * - * We can assume that the source integer is not -2^63. - */ - int s, e; - uint32_t t; - uint64_t m; - - /* - * Extract sign bit. - * We have: -i = 1 + ~i - */ - s = (int)((uint64_t)i >> 63); - i ^= -(int64_t)s; - i += s; - - /* - * For now we suppose that i != 0. - * Otherwise, we set m to i and left-shift it as much as needed - * to get a 1 in the top bit. We can do that in a logarithmic - * number of conditional shifts. - */ - m = (uint64_t)i; - e = 9 + sc; - FPR_NORM64(m, e); - - /* - * Now m is in the 2^63..2^64-1 range. We must divide it by 512; - * if one of the dropped bits is a 1, this should go into the - * "sticky bit". - */ - m |= ((uint32_t)m & 0x1FF) + 0x1FF; - m >>= 9; - - /* - * Corrective action: if i = 0 then all of the above was - * incorrect, and we clamp e and m down to zero. - */ - t = (uint32_t)((uint64_t)(i | -i) >> 63); - m &= -(uint64_t)t; - e &= -(int)t; - - /* - * Assemble back everything. The FPR() function will handle cases - * where e is too low. - */ - return FPR(s, e, m); -} - -#endif // yyyASM_CORTEXM4- - -#if FALCON_ASM_CORTEXM4 // yyyASM_CORTEXM4+1 - -// yyyPQCLEAN+0 -#if 0 -/* Debug code -- To get a printout of registers from a specific point - in ARM Cortex M4 assembly code, uncomment this code and add a - "bl DEBUG" call where wished for. 
*/ - -void -print_regs(uint32_t *rr, uint32_t flags) -{ - int i; - extern int printf(const char *fmt, ...); - - printf("\nRegs:\n"); - for (i = 0; i < 7; i ++) { - int j; - - j = i + 7; - printf(" %2d = %08X %2d = %08X\n", i, rr[i], j, rr[j]); - } - printf(" flags = %08X ", flags); - if ((flags >> 31) & 1) { - printf("N"); - } - if ((flags >> 30) & 1) { - printf("Z"); - } - if ((flags >> 29) & 1) { - printf("C"); - } - if ((flags >> 28) & 1) { - printf("V"); - } - if ((flags >> 27) & 1) { - printf("Q"); - } - printf("\n"); -} - -__attribute__((naked)) -void -DEBUG(void) -{ - __asm__ ( - "push { r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, lr }\n\t" - "mov r0, sp\n\t" - "mrs r1, apsr\n\t" - "bl print_regs\n\t" - "pop { r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, pc }\n\t" - ); -} -#endif -// yyyPQCLEAN- - -__attribute__((naked)) -fpr -fpr_add(fpr x __attribute__((unused)), fpr y __attribute__((unused))) -{ - __asm__ ( - "push { r4, r5, r6, r7, r8, r10, r11, lr }\n\t" - "\n\t" - "@ Make sure that the first operand (x) has the larger absolute\n\t" - "@ value. 
This guarantees that the exponent of y is less than\n\t" - "@ or equal to the exponent of x, and, if they are equal, then\n\t" - "@ the mantissa of y will not be greater than the mantissa of x.\n\t" - "@ However, if absolute values are equal and the sign of x is 1,\n\t" - "@ then we want to also swap the values.\n\t" - "ubfx r4, r1, #0, #31 @ top word without sign bit\n\t" - "ubfx r5, r3, #0, #31 @ top word without sign bit\n\t" - "subs r7, r0, r2 @ difference in r7:r4\n\t" - "sbcs r4, r5\n\t" - "orrs r7, r4\n\t" - "rsbs r5, r7, #0\n\t" - "orrs r7, r5 @ bit 31 of r7 is 0 iff difference is zero\n\t" - "bics r6, r1, r7\n\t" - "orrs r6, r4 @ bit 31 of r6 is 1 iff the swap must be done\n\t" - "\n\t" - "@ Conditional swap\n\t" - "eors r4, r0, r2\n\t" - "eors r5, r1, r3\n\t" - "ands r4, r4, r6, asr #31\n\t" - "ands r5, r5, r6, asr #31\n\t" - "eors r0, r4\n\t" - "eors r1, r5\n\t" - "eors r2, r4\n\t" - "eors r3, r5\n\t" - "\n\t" - "@ Extract mantissa of x into r0:r1, exponent in r4, sign in r5\n\t" - "ubfx r4, r1, #20, #11 @ Exponent in r4 (without sign)\n\t" - "addw r5, r4, #2047 @ Get a carry to test r4 for zero\n\t" - "lsrs r5, #11 @ r5 is the mantissa implicit high bit\n\t" - "bfc r1, #20, #11 @ Clear exponent bits (not the sign)\n\t" - "orrs r1, r1, r5, lsl #20 @ Set mantissa high bit\n\t" - "asrs r5, r1, #31 @ Get sign bit (sign-extended)\n\t" - "bfc r1, #31, #1 @ Clear the sign bit\n\t" - "\n\t" - "@ Extract mantissa of y into r2:r3, exponent in r6, sign in r7\n\t" - "ubfx r6, r3, #20, #11 @ Exponent in r6 (without sign)\n\t" - "addw r7, r6, #2047 @ Get a carry to test r6 for zero\n\t" - "lsrs r7, #11 @ r7 is the mantissa implicit high bit\n\t" - "bfc r3, #20, #11 @ Clear exponent bits (not the sign)\n\t" - "orrs r3, r3, r7, lsl #20 @ Set mantissa high bit\n\t" - "asrs r7, r3, #31 @ Get sign bit (sign-extended)\n\t" - "bfc r3, #31, #1 @ Clear the sign bit\n\t" - "\n\t" - "@ Scale mantissas up by three bits.\n\t" - "lsls r1, #3\n\t" - "orrs r1, r1, r0, lsr #29\n\t" - 
"lsls r0, #3\n\t" - "lsls r3, #3\n\t" - "orrs r3, r3, r2, lsr #29\n\t" - "lsls r2, #3\n\t" - "\n\t" - "@ x: exponent=r4, sign=r5, mantissa=r0:r1 (scaled up 3 bits)\n\t" - "@ y: exponent=r6, sign=r7, mantissa=r2:r3 (scaled up 3 bits)\n\t" - "\n\t" - "@ At that point, the exponent of x (in r4) is larger than that\n\t" - "@ of y (in r6). The difference is the amount of shifting that\n\t" - "@ should be done on y. If that amount is larger than 59 then\n\t" - "@ we clamp y to 0. We won't need y's exponent beyond that point,\n\t" - "@ so we store that shift count in r6.\n\t" - "subs r6, r4, r6\n\t" - "subs r8, r6, #60\n\t" - "ands r2, r2, r8, asr #31\n\t" - "ands r3, r3, r8, asr #31\n\t" - "\n\t" - "@ Shift right r2:r3 by r6 bits. The shift count is in the 0..59\n\t" - "@ range. r11 will be non-zero if and only if some non-zero bits\n\t" - "@ were dropped.\n\t" - "subs r8, r6, #32\n\t" - "bics r11, r2, r8, asr #31\n\t" - "ands r2, r2, r8, asr #31\n\t" - "bics r10, r3, r8, asr #31\n\t" - "orrs r2, r2, r10\n\t" - "ands r3, r3, r8, asr #31\n\t" - "ands r6, r6, #31\n\t" - "rsbs r8, r6, #32\n\t" - "lsls r10, r2, r8\n\t" - "orrs r11, r11, r10\n\t" - "lsrs r2, r2, r6\n\t" - "lsls r10, r3, r8\n\t" - "orrs r2, r2, r10\n\t" - "lsrs r3, r3, r6\n\t" - "\n\t" - "@ If r11 is non-zero then some non-zero bit was dropped and the\n\t" - "@ low bit of r2 must be forced to 1 ('sticky bit').\n\t" - "rsbs r6, r11, #0\n\t" - "orrs r6, r6, r11\n\t" - "orrs r2, r2, r6, lsr #31\n\t" - "\n\t" - "@ x: exponent=r4, sign=r5, mantissa=r0:r1 (scaled up 3 bits)\n\t" - "@ y: sign=r7, value=r2:r3 (scaled to same exponent as x)\n\t" - "\n\t" - "@ If x and y don't have the same sign, then we should negate r2:r3\n\t" - "@ (i.e. subtract the mantissa instead of adding it). Signs of x\n\t" - "@ and y are in r5 and r7, as full-width words. 
We won't need r7\n\t" - "@ afterwards.\n\t" - "eors r7, r5 @ r7 = -1 if y must be negated, 0 otherwise\n\t" - "eors r2, r7\n\t" - "eors r3, r7\n\t" - "subs r2, r7\n\t" - "sbcs r3, r7\n\t" - "\n\t" - "@ r2:r3 has been shifted, we can add to r0:r1.\n\t" - "adds r0, r2\n\t" - "adcs r1, r3\n\t" - "\n\t" - "@ result: exponent=r4, sign=r5, mantissa=r0:r1 (scaled up 3 bits)\n\t" - "\n\t" - "@ Normalize the result with some left-shifting to full 64-bit\n\t" - "@ width. Shift count goes to r2, and exponent (r4) is adjusted.\n\t" - "clz r2, r0\n\t" - "clz r3, r1\n\t" - "sbfx r6, r3, #5, #1\n\t" - "ands r2, r6\n\t" - "adds r2, r2, r3\n\t" - "subs r4, r4, r2\n\t" - "\n\t" - "@ Shift r0:r1 to the left by r2 bits.\n\t" - "subs r7, r2, #32\n\t" - "lsls r7, r0, r7\n\t" - "lsls r1, r1, r2\n\t" - "rsbs r6, r2, #32\n\t" - "orrs r1, r1, r7\n\t" - "lsrs r6, r0, r6\n\t" - "orrs r1, r1, r6\n\t" - "lsls r0, r0, r2\n\t" - "\n\t" - "@ The exponent of x was in r4. The left-shift operation has\n\t" - "@ subtracted some value from it, 8 in case the result has the\n\t" - "@ same exponent as x. However, the high bit of the mantissa will\n\t" - "@ add 1 to the exponent, so we only add back 7 (the exponent is\n\t" - "@ added in because rounding might have produced a carry, which\n\t" - "@ should then spill into the exponent).\n\t" - "adds r4, #7\n\t" - "\n\t" - "@ If the mantissa new mantissa is non-zero, then its bit 63 is\n\t" - "@ non-zero (thanks to the normalizing shift). Otherwise, that bit\n\t" - "@ is zero, and we should then set the exponent to zero as well.\n\t" - "ands r4, r4, r1, asr #31\n\t" - "\n\t" - "@ Shrink back the value to a 52-bit mantissa. 
This requires\n\t" - "@ right-shifting by 11 bits; we keep a copy of the pre-shift\n\t" - "@ low word in r3.\n\t" - "movs r3, r0\n\t" - "lsrs r0, #11\n\t" - "orrs r0, r0, r1, lsl #21\n\t" - "lsrs r1, #11\n\t" - "\n\t" - "@ Apply rounding.\n\t" - "ubfx r6, r3, #0, #9\n\t" - "addw r6, r6, #511\n\t" - "orrs r3, r6\n\t" - "ubfx r3, r3, #9, #3\n\t" - "movs r6, #0xC8\n\t" - "lsrs r6, r3\n\t" - "ands r6, #1\n\t" - "adds r0, r6\n\t" - "adcs r1, #0\n\t" - "\n\t" - "@Plug in the exponent with an addition.\n\t" - "adds r1, r1, r4, lsl #20\n\t" - "\n\t" - "@ If the new exponent is negative or zero, then it underflowed\n\t" - "@ and we must clear the whole mantissa and exponent.\n\t" - "rsbs r4, r4, #0\n\t" - "ands r0, r0, r4, asr #31\n\t" - "ands r1, r1, r4, asr #31\n\t" - "\n\t" - "@ Put back the sign. This is the sign of x: thanks to the\n\t" - "@ conditional swap at the start, this is always correct.\n\t" - "bfi r1, r5, #31, #1\n\t" - "\n\t" - "pop { r4, r5, r6, r7, r8, r10, r11, pc }\n\t" - ); -} - -#else // yyyASM_CORTEXM4+0 - -fpr -fpr_add(fpr x, fpr y) -{ - uint64_t m, xu, yu, za; - uint32_t cs; - int ex, ey, sx, sy, cc; - - /* - * Make sure that the first operand (x) has the larger absolute - * value. This guarantees that the exponent of y is less than - * or equal to the exponent of x, and, if they are equal, then - * the mantissa of y will not be greater than the mantissa of x. - * - * After this swap, the result will have the sign x, except in - * the following edge case: abs(x) = abs(y), and x and y have - * opposite sign bits; in that case, the result shall be +0 - * even if the sign bit of x is 1. To handle this case properly, - * we do the swap is abs(x) = abs(y) AND the sign of x is 1. - */ - m = ((uint64_t)1 << 63) - 1; - za = (x & m) - (y & m); - cs = (uint32_t)(za >> 63) - | ((1U - (uint32_t)(-za >> 63)) & (uint32_t)(x >> 63)); - m = (x ^ y) & -(uint64_t)cs; - x ^= m; - y ^= m; - - /* - * Extract sign bits, exponents and mantissas. 
The mantissas are - * scaled up to 2^55..2^56-1, and the exponent is unbiased. If - * an operand is zero, its mantissa is set to 0 at this step, and - * its exponent will be -1078. - */ - ex = (int)(x >> 52); - sx = ex >> 11; - ex &= 0x7FF; - m = (uint64_t)(uint32_t)((ex + 0x7FF) >> 11) << 52; - xu = ((x & (((uint64_t)1 << 52) - 1)) | m) << 3; - ex -= 1078; - ey = (int)(y >> 52); - sy = ey >> 11; - ey &= 0x7FF; - m = (uint64_t)(uint32_t)((ey + 0x7FF) >> 11) << 52; - yu = ((y & (((uint64_t)1 << 52) - 1)) | m) << 3; - ey -= 1078; - - /* - * x has the larger exponent; hence, we only need to right-shift y. - * If the shift count is larger than 59 bits then we clamp the - * value to zero. - */ - cc = ex - ey; - yu &= -(uint64_t)((uint32_t)(cc - 60) >> 31); - cc &= 63; - - /* - * The lowest bit of yu is "sticky". - */ - m = fpr_ulsh(1, cc) - 1; - yu |= (yu & m) + m; - yu = fpr_ursh(yu, cc); - - /* - * If the operands have the same sign, then we add the mantissas; - * otherwise, we subtract the mantissas. - */ - xu += yu - ((yu << 1) & -(uint64_t)(sx ^ sy)); - - /* - * The result may be smaller, or slightly larger. We normalize - * it to the 2^63..2^64-1 range (if xu is zero, then it stays - * at zero). - */ - FPR_NORM64(xu, ex); - - /* - * Scale down the value to 2^54..s^55-1, handling the last bit - * as sticky. - */ - xu |= ((uint32_t)xu & 0x1FF) + 0x1FF; - xu >>= 9; - ex += 9; - - /* - * In general, the result has the sign of x. However, if the - * result is exactly zero, then the following situations may - * be encountered: - * x > 0, y = -x -> result should be +0 - * x < 0, y = -x -> result should be +0 - * x = +0, y = +0 -> result should be +0 - * x = -0, y = +0 -> result should be +0 - * x = +0, y = -0 -> result should be +0 - * x = -0, y = -0 -> result should be -0 - * - * But at the conditional swap step at the start of the - * function, we ensured that if abs(x) = abs(y) and the - * sign of x was 1, then x and y were swapped. 
Thus, the - * two following cases cannot actually happen: - * x < 0, y = -x - * x = -0, y = +0 - * In all other cases, the sign bit of x is conserved, which - * is what the FPR() function does. The FPR() function also - * properly clamps values to zero when the exponent is too - * low, but does not alter the sign in that case. - */ - return FPR(sx, ex, xu); -} - -#endif // yyyASM_CORTEXM4- - -#if FALCON_ASM_CORTEXM4 // yyyASM_CORTEXM4+1 - -__attribute__((naked)) -fpr -fpr_mul(fpr x __attribute__((unused)), fpr y __attribute__((unused))) -{ - __asm__ ( - "push { r4, r5, r6, r7, r8, r10, r11, lr }\n\t" - "\n\t" - "@ Extract mantissas: x.m = r4:r5, y.m = r6:r7\n\t" - "@ r4 and r6 contain only 25 bits each.\n\t" - "bics r4, r0, #0xFE000000\n\t" - "lsls r5, r1, #7\n\t" - "orrs r5, r5, r0, lsr #25\n\t" - "orrs r5, r5, #0x08000000\n\t" - "bics r5, r5, #0xF0000000\n\t" - "bics r6, r2, #0xFE000000\n\t" - "lsls r7, r3, #7\n\t" - "orrs r7, r7, r2, lsr #25\n\t" - "orrs r7, r7, #0x08000000\n\t" - "bics r7, r7, #0xF0000000\n\t" - "\n\t" - "@ Perform product. Values are in the 2^52..2^53-1 range, so\n\t" - "@ the product is at most 106-bit long. Of the low 50 bits,\n\t" - "@ we only want to know if they are all zeros or not. 
Here,\n\t" - "@ we get the top 56 bits in r10:r11, and r8 will be non-zero\n\t" - "@ if and only if at least one of the low 50 bits is non-zero.\n\t" - "umull r8, r10, r4, r6 @ x0*y0\n\t" - "lsls r10, #7\n\t" - "orrs r10, r10, r8, lsr #25\n\t" - "eors r11, r11\n\t" - "umlal r10, r11, r4, r7 @ x0*y1\n\t" - "umlal r10, r11, r5, r6 @ x1*y0\n\t" - "orrs r8, r8, r10, lsl #7\n\t" - "lsrs r10, #25\n\t" - "orrs r10, r10, r11, lsl #7\n\t" - "eors r11, r11\n\t" - "umlal r10, r11, r5, r7 @ x1*y1\n\t" - "\n\t" - "@ Now r0, r2, r4, r5, r6 and r7 are free.\n\t" - "@ If any of the low 50 bits was non-zero, then we force the\n\t" - "@ low bit of r10 to 1.\n\t" - "rsbs r4, r8, #0\n\t" - "orrs r8, r8, r4\n\t" - "orrs r10, r10, r8, lsr #31\n\t" - "\n\t" - "@ r8 is free.\n\t" - "@ r10:r11 contains the product in the 2^54..2^56-1 range. We\n\t" - "@ normalize it to 2^54..2^55-1 (into r6:r7) with a conditional\n\t" - "@ shift (low bit is sticky). r5 contains -1 if the shift was done,\n\t" - "@ 0 otherwise.\n\t" - "ands r6, r10, #1\n\t" - "lsrs r5, r11, #23\n\t" - "rsbs r5, r5, #0\n\t" - "orrs r6, r6, r10, lsr #1\n\t" - "orrs r6, r6, r11, lsl #31\n\t" - "lsrs r7, r11, #1\n\t" - "eors r10, r10, r6\n\t" - "eors r11, r11, r7\n\t" - "bics r10, r10, r5\n\t" - "bics r11, r11, r5\n\t" - "eors r6, r6, r10\n\t" - "eors r7, r7, r11\n\t" - "\n\t" - "@ Compute aggregate exponent: ex + ey - 1023 + w\n\t" - "@ (where w = 1 if the conditional shift was done, 0 otherwise)\n\t" - "@ But we subtract 1 because the injection of the mantissa high\n\t" - "@ bit will increment the exponent by 1.\n\t" - "lsls r0, r1, #1\n\t" - "lsls r2, r3, #1\n\t" - "lsrs r0, #21\n\t" - "addw r4, r0, #0x7FF @ save ex + 2047 in r4\n\t" - "lsrs r2, #21\n\t" - "addw r8, r2, #0x7FF @ save ey + 2047 in r8\n\t" - "adds r2, r0\n\t" - "subw r2, r2, #1024\n\t" - "subs r2, r5\n\t" - "\n\t" - "@ r5 is free.\n\t" - "@ Also, if either of the source exponents is 0, or the result\n\t" - "@ exponent is 0 or negative, then the result is zero 
and the\n\t" - "@ mantissa and the exponent shall be clamped to zero. Since\n\t" - "@ r2 contains the result exponent minus 1, we test on r2\n\t" - "@ being strictly negative.\n\t" - "ands r4, r8 @ if bit 11 = 0 then one of the exponents was 0\n\t" - "mvns r5, r2\n\t" - "ands r5, r5, r4, lsl #20\n\t" - "ands r2, r2, r5, asr #31\n\t" - "ands r6, r6, r5, asr #31\n\t" - "ands r7, r7, r5, asr #31\n\t" - "\n\t" - "@ Sign is the XOR of the sign of the operands. This is true in\n\t" - "@ all cases, including very small results (exponent underflow)\n\t" - "@ and zeros.\n\t" - "eors r1, r3\n\t" - "bfc r1, #0, #31\n\t" - "\n\t" - "@ Plug in the exponent.\n\t" - "bfi r1, r2, #20, #11\n\t" - "\n\t" - "@ r2 and r3 are free.\n\t" - "@ Shift back to the normal 53-bit mantissa, with rounding.\n\t" - "@ Mantissa goes into r0:r1. For r1, we must use an addition\n\t" - "@ because the rounding may have triggered a carry, that should\n\t" - "@ be added to the exponent.\n\t" - "movs r4, r6\n\t" - "lsrs r0, r6, #2\n\t" - "orrs r0, r0, r7, lsl #30\n\t" - "adds r1, r1, r7, lsr #2\n\t" - "ands r4, #0x7\n\t" - "movs r3, #0xC8\n\t" - "lsrs r3, r4\n\t" - "ands r3, #1\n\t" - "adds r0, r3\n\t" - "adcs r1, #0\n\t" - "\n\t" - "pop { r4, r5, r6, r7, r8, r10, r11, pc }\n\t" - ); -} - -#else // yyyASM_CORTEXM4+0 - -fpr -fpr_mul(fpr x, fpr y) -{ - uint64_t xu, yu, w, zu, zv; - uint32_t x0, x1, y0, y1, z0, z1, z2; - int ex, ey, d, e, s; - - /* - * Extract absolute values as scaled unsigned integers. We - * don't extract exponents yet. - */ - xu = (x & (((uint64_t)1 << 52) - 1)) | ((uint64_t)1 << 52); - yu = (y & (((uint64_t)1 << 52) - 1)) | ((uint64_t)1 << 52); - - /* - * We have two 53-bit integers to multiply; we need to split - * each into a lower half and a upper half. Moreover, we - * prefer to have lower halves to be of 25 bits each, for - * reasons explained later on. 
- */ - x0 = (uint32_t)xu & 0x01FFFFFF; - x1 = (uint32_t)(xu >> 25); - y0 = (uint32_t)yu & 0x01FFFFFF; - y1 = (uint32_t)(yu >> 25); - w = (uint64_t)x0 * (uint64_t)y0; - z0 = (uint32_t)w & 0x01FFFFFF; - z1 = (uint32_t)(w >> 25); - w = (uint64_t)x0 * (uint64_t)y1; - z1 += (uint32_t)w & 0x01FFFFFF; - z2 = (uint32_t)(w >> 25); - w = (uint64_t)x1 * (uint64_t)y0; - z1 += (uint32_t)w & 0x01FFFFFF; - z2 += (uint32_t)(w >> 25); - zu = (uint64_t)x1 * (uint64_t)y1; - z2 += (z1 >> 25); - z1 &= 0x01FFFFFF; - zu += z2; - - /* - * Since xu and yu are both in the 2^52..2^53-1 range, the - * product is in the 2^104..2^106-1 range. We first reassemble - * it and round it into the 2^54..2^56-1 range; the bottom bit - * is made "sticky". Since the low limbs z0 and z1 are 25 bits - * each, we just take the upper part (zu), and consider z0 and - * z1 only for purposes of stickiness. - * (This is the reason why we chose 25-bit limbs above.) - */ - zu |= ((z0 | z1) + 0x01FFFFFF) >> 25; - - /* - * We normalize zu to the 2^54..s^55-1 range: it could be one - * bit too large at this point. This is done with a conditional - * right-shift that takes into account the sticky bit. - */ - zv = (zu >> 1) | (zu & 1); - w = zu >> 55; - zu ^= (zu ^ zv) & -w; - - /* - * Get the aggregate scaling factor: - * - * - Each exponent is biased by 1023. - * - * - Integral mantissas are scaled by 2^52, hence an - * extra 52 bias for each exponent. - * - * - However, we right-shifted z by 50 bits, and then - * by 0 or 1 extra bit (depending on the value of w). - * - * In total, we must add the exponents, then subtract - * 2 * (1023 + 52), then add 50 + w. - */ - ex = (int)((x >> 52) & 0x7FF); - ey = (int)((y >> 52) & 0x7FF); - e = ex + ey - 2100 + (int)w; - - /* - * Sign bit is the XOR of the operand sign bits. - */ - s = (int)((x ^ y) >> 63); - - /* - * Corrective actions for zeros: if either of the operands is - * zero, then the computations above were wrong. Test for zero - * is whether ex or ey is zero. 
We just have to set the mantissa - * (zu) to zero, the FPR() function will normalize e. - */ - d = ((ex + 0x7FF) & (ey + 0x7FF)) >> 11; - zu &= -(uint64_t)d; - - /* - * FPR() packs the result and applies proper rounding. - */ - return FPR(s, e, zu); -} - -#endif // yyyASM_CORTEXM4- - -#if FALCON_ASM_CORTEXM4 // yyyASM_CORTEXM4+1 - -__attribute__((naked)) -fpr -fpr_div(fpr x __attribute__((unused)), fpr y __attribute__((unused))) -{ - __asm__ ( - "push { r4, r5, r6, r7, r8, r10, r11, lr }\n\t" - - "@ Extract mantissas of x and y, in r0:r4 and r2:r5, respectively.\n\t" - "@ We don't touch r1 and r3 as they contain the exponents and\n\t" - "@ signs, which we'll need later on.\n\t" - "ubfx r4, r1, #0, #20\n\t" - "ubfx r5, r3, #0, #20\n\t" - "orrs r4, r4, #0x00100000\n\t" - "orrs r5, r5, #0x00100000\n\t" - "\n\t" - "@ Perform bit-by-bit division. We want a 56-bit result in r8:r10\n\t" - "@ (low bit is 0). Bits come from the carry flag and are\n\t" - "@ injected with rrx, i.e. in position 31; we thus get bits in\n\t" - "@ the reverse order. 
Bits accumulate in r8; after the first 24\n\t" - "@ bits, we move the quotient bits to r10.\n\t" - "eors r8, r8\n\t" - "\n\t" - -#define DIVSTEP \ - "subs r6, r0, r2\n\t" \ - "sbcs r7, r4, r5\n\t" \ - "rrx r8, r8\n\t" \ - "ands r6, r2, r8, asr #31\n\t" \ - "ands r7, r5, r8, asr #31\n\t" \ - "subs r0, r6\n\t" \ - "sbcs r4, r7\n\t" \ - "adds r0, r0, r0\n\t" \ - "adcs r4, r4, r4\n\t" - -#define DIVSTEP4 DIVSTEP DIVSTEP DIVSTEP DIVSTEP -#define DIVSTEP8 DIVSTEP4 DIVSTEP4 - - DIVSTEP8 - DIVSTEP8 - DIVSTEP8 - - "\n\t" - "@ We have the first 24 bits of the quotient, move them to r10.\n\t" - "rbit r10, r8\n\t" - "\n\t" - - DIVSTEP8 - DIVSTEP8 - DIVSTEP8 - DIVSTEP4 DIVSTEP DIVSTEP DIVSTEP - -#undef DIVSTEP -#undef DIVSTEP4 -#undef DIVSTEP8 - - "\n\t" - "@ Lowest bit will be set if remainder is non-zero at this point\n\t" - "@ (this is the 'sticky' bit).\n\t" - "subs r0, #1\n\t" - "sbcs r4, #0\n\t" - "rrx r8, r8\n\t" - "\n\t" - "@ We now have the next (low) 32 bits of the quotient.\n\t" - "rbit r8, r8\n\t" - "\n\t" - "@ Since both operands had their top bit set, we know that the\n\t" - "@ result at this point is in 2^54..2^56-1. We scale it down\n\t" - "@ to 2^54..2^55-1 with a conditional shift. We also write the\n\t" - "@ result in r4:r5. 
If the shift is done, r6 will contain -1.\n\t" - "ands r4, r8, #1\n\t" - "lsrs r6, r10, #23\n\t" - "rsbs r6, r6, #0\n\t" - "orrs r4, r4, r8, lsr #1\n\t" - "orrs r4, r4, r10, lsl #31\n\t" - "lsrs r5, r10, #1\n\t" - "eors r8, r8, r4\n\t" - "eors r10, r10, r5\n\t" - "bics r8, r8, r6\n\t" - "bics r10, r10, r6\n\t" - "eors r4, r4, r8\n\t" - "eors r5, r5, r10\n\t" - "\n\t" - "@ Compute aggregate exponent: ex - ey + 1022 + w\n\t" - "@ (where w = 1 if the conditional shift was done, 0 otherwise)\n\t" - "@ But we subtract 1 because the injection of the mantissa high\n\t" - "@ bit will increment the exponent by 1.\n\t" - "lsls r0, r1, #1\n\t" - "lsls r2, r3, #1\n\t" - "lsrs r0, r0, #21\n\t" - "addw r7, r0, #0x7FF @ save ex + 2047 in r7\n\t" - "subs r0, r0, r2, lsr #21\n\t" - "addw r0, r0, #1021\n\t" - "subs r0, r6\n\t" - "\n\t" - "@ If the x operand was zero, then the computation was wrong and\n\t" - "@ the result is zero. Also, if the result exponent is zero or\n\t" - "@ negative, then the mantissa shall be clamped to zero. Since r0\n\t" - "@ contains the result exponent minus 1, we test on r0 being\n\t" - "@ strictly negative.\n\t" - "mvns r2, r0\n\t" - "ands r2, r2, r7, lsl #20\n\t" - "ands r0, r0, r2, asr #31\n\t" - "ands r4, r4, r2, asr #31\n\t" - "ands r5, r5, r2, asr #31\n\t" - "\n\t" - "@ Sign is the XOR of the sign of the operands. This is true in\n\t" - "@ all cases, including very small results (exponent underflow)\n\t" - "@ and zeros.\n\t" - "eors r1, r3\n\t" - "bfc r1, #0, #31\n\t" - "\n\t" - "@ Plug in the exponent.\n\t" - "bfi r1, r0, #20, #11\n\t" - "\n\t" - "@ Shift back to the normal 53-bit mantissa, with rounding.\n\t" - "@ Mantissa goes into r0:r1. 
For r1, we must use an addition\n\t" - "@ because the rounding may have triggered a carry, that should\n\t" - "@ be added to the exponent.\n\t" - "movs r6, r4\n\t" - "lsrs r0, r4, #2\n\t" - "orrs r0, r0, r5, lsl #30\n\t" - "adds r1, r1, r5, lsr #2\n\t" - "ands r6, #0x7\n\t" - "movs r3, #0xC8\n\t" - "lsrs r3, r6\n\t" - "ands r3, #1\n\t" - "adds r0, r3\n\t" - "adcs r1, #0\n\t" - "\n\t" - "pop { r4, r5, r6, r7, r8, r10, r11, pc }\n\t" - ); -} - -#else // yyyASM_CORTEXM4+0 - -fpr -fpr_div(fpr x, fpr y) -{ - uint64_t xu, yu, q, q2, w; - int i, ex, ey, e, d, s; - - /* - * Extract mantissas of x and y (unsigned). - */ - xu = (x & (((uint64_t)1 << 52) - 1)) | ((uint64_t)1 << 52); - yu = (y & (((uint64_t)1 << 52) - 1)) | ((uint64_t)1 << 52); - - /* - * Perform bit-by-bit division of xu by yu. We run it for 55 bits. - */ - q = 0; - for (i = 0; i < 55; i ++) { - /* - * If yu is less than or equal xu, then subtract it and - * push a 1 in the quotient; otherwise, leave xu unchanged - * and push a 0. - */ - uint64_t b; - - b = ((xu - yu) >> 63) - 1; - xu -= b & yu; - q |= b & 1; - xu <<= 1; - q <<= 1; - } - - /* - * We got 55 bits in the quotient, followed by an extra zero. We - * want that 56th bit to be "sticky": it should be a 1 if and - * only if the remainder (xu) is non-zero. - */ - q |= (xu | -xu) >> 63; - - /* - * Quotient is at most 2^56-1. Its top bit may be zero, but in - * that case the next-to-top bit will be a one, since the - * initial xu and yu were both in the 2^52..2^53-1 range. - * We perform a conditional shift to normalize q to the - * 2^54..2^55-1 range (with the bottom bit being sticky). - */ - q2 = (q >> 1) | (q & 1); - w = q >> 55; - q ^= (q ^ q2) & -w; - - /* - * Extract exponents to compute the scaling factor: - * - * - Each exponent is biased and we scaled them up by - * 52 bits; but these biases will cancel out. - * - * - The division loop produced a 55-bit shifted result, - * so we must scale it down by 55 bits. 
- * - * - If w = 1, we right-shifted the integer by 1 bit, - * hence we must add 1 to the scaling. - */ - ex = (int)((x >> 52) & 0x7FF); - ey = (int)((y >> 52) & 0x7FF); - e = ex - ey - 55 + (int)w; - - /* - * Sign is the XOR of the signs of the operands. - */ - s = (int)((x ^ y) >> 63); - - /* - * Corrective actions for zeros: if x = 0, then the computation - * is wrong, and we must clamp e and q to 0. We do not care - * about the case y = 0 (as per assumptions in this module, - * the caller does not perform divisions by zero). - */ - d = (ex + 0x7FF) >> 11; - s &= d; - e &= -d; - q &= -(uint64_t)d; - - /* - * FPR() packs the result and applies proper rounding. - */ - return FPR(s, e, q); -} - -#endif // yyyASM_CORTEXM4- - -#if FALCON_ASM_CORTEXM4 // yyyASM_CORTEXM4+1 - -__attribute__((naked)) -fpr -fpr_sqrt(fpr x __attribute__((unused))) -{ - __asm__ ( - "push { r4, r5, r6, r7, r8, r10, r11, lr }\n\t" - "\n\t" - "@ Extract mantissa (r0:r1) and exponent (r2). We assume that the\n\t" - "@ sign is positive. 
If the source is zero, then the mantissa is\n\t" - "@ set to 0.\n\t" - "lsrs r2, r1, #20\n\t" - "bfc r1, #20, #12\n\t" - "addw r3, r2, #0x7FF\n\t" - "subw r2, r2, #1023\n\t" - "lsrs r3, r3, #11\n\t" - "orrs r1, r1, r3, lsl #20\n\t" - "\n\t" - "@ If the exponent is odd, then multiply mantissa by 2 and subtract\n\t" - "@ 1 from the exponent.\n\t" - "ands r3, r2, #1\n\t" - "subs r2, r2, r3\n\t" - "rsbs r3, r3, #0\n\t" - "ands r4, r1, r3\n\t" - "ands r3, r0\n\t" - "adds r0, r3\n\t" - "adcs r1, r4\n\t" - "\n\t" - "@ Left-shift the mantissa by 9 bits to put it in the\n\t" - "@ 2^61..2^63-1 range (unless it is exactly 0).\n\t" - "lsls r1, r1, #9\n\t" - "orrs r1, r1, r0, lsr #23\n\t" - "lsls r0, r0, #9\n\t" - "\n\t" - "@ Compute the square root bit-by-bit.\n\t" - "@ There are 54 iterations; first 30 can work on top word only.\n\t" - "@ q = r3 (bit-reversed)\n\t" - "@ s = r5\n\t" - "eors r3, r3\n\t" - "eors r5, r5\n\t" - -#define SQRT_STEP_HI(bit) \ - "orrs r6, r5, #(1 << (" #bit "))\n\t" \ - "subs r7, r1, r6\n\t" \ - "rrx r3, r3\n\t" \ - "ands r6, r6, r3, asr #31\n\t" \ - "subs r1, r1, r6\n\t" \ - "lsrs r6, r3, #31\n\t" \ - "orrs r5, r5, r6, lsl #((" #bit ") + 1)\n\t" \ - "adds r0, r0\n\t" \ - "adcs r1, r1\n\t" - -#define SQRT_STEP_HIx5(b) \ - SQRT_STEP_HI((b)+4) \ - SQRT_STEP_HI((b)+3) \ - SQRT_STEP_HI((b)+2) \ - SQRT_STEP_HI((b)+1) \ - SQRT_STEP_HI(b) - - SQRT_STEP_HIx5(25) - SQRT_STEP_HIx5(20) - SQRT_STEP_HIx5(15) - SQRT_STEP_HIx5(10) - SQRT_STEP_HIx5(5) - SQRT_STEP_HIx5(0) - -#undef SQRT_STEP_HI -#undef SQRT_STEP_HIx5 - - "@ Top 30 bits of the result must be reversed: they were\n\t" - "@ accumulated with rrx (hence from the top bit).\n\t" - "rbit r3, r3\n\t" - "\n\t" - "@ For the next 24 iterations, we must use two-word operations.\n\t" - "@ bits of q now accumulate in r4\n\t" - "@ s is in r6:r5\n\t" - "eors r4, r4\n\t" - "eors r6, r6\n\t" - "\n\t" - "@ First iteration is special because the potential bit goes into\n\t" - "@ r5, not r6.\n\t" - "orrs r7, r6, #(1 << 
31)\n\t" - "subs r8, r0, r7\n\t" - "sbcs r10, r1, r5\n\t" - "rrx r4, r4\n\t" - "ands r7, r7, r4, asr #31\n\t" - "ands r8, r5, r4, asr #31\n\t" - "subs r0, r0, r7\n\t" - "sbcs r1, r1, r8\n\t" - "lsrs r7, r4, #31\n\t" - "orrs r5, r5, r4, lsr #31\n\t" - "adds r0, r0\n\t" - "adcs r1, r1\n\t" - -#define SQRT_STEP_LO(bit) \ - "orrs r7, r6, #(1 << (" #bit "))\n\t" \ - "subs r8, r0, r7\n\t" \ - "sbcs r10, r1, r5\n\t" \ - "rrx r4, r4\n\t" \ - "ands r7, r7, r4, asr #31\n\t" \ - "ands r8, r5, r4, asr #31\n\t" \ - "subs r0, r0, r7\n\t" \ - "sbcs r1, r1, r8\n\t" \ - "lsrs r7, r4, #31\n\t" \ - "orrs r6, r6, r7, lsl #((" #bit ") + 1)\n\t" \ - "adds r0, r0\n\t" \ - "adcs r1, r1\n\t" - -#define SQRT_STEP_LOx4(b) \ - SQRT_STEP_LO((b)+3) \ - SQRT_STEP_LO((b)+2) \ - SQRT_STEP_LO((b)+1) \ - SQRT_STEP_LO(b) - - SQRT_STEP_LO(30) - SQRT_STEP_LO(29) - SQRT_STEP_LO(28) - SQRT_STEP_LOx4(24) - SQRT_STEP_LOx4(20) - SQRT_STEP_LOx4(16) - SQRT_STEP_LOx4(12) - SQRT_STEP_LOx4(8) - -#undef SQRT_STEP_LO -#undef SQRT_STEP_LOx4 - - "@ Put low 24 bits in the right order.\n\t" - "rbit r4, r4\n\t" - "\n\t" - "@ We have a 54-bit result; compute the 55-th bit as the 'sticky'\n\t" - "@ bit: it is non-zero if and only if r0:r1 is non-zero. We put the\n\t" - "@ three low bits (including the sticky bit) in r5.\n\t" - "orrs r0, r1\n\t" - "rsbs r1, r0, #0\n\t" - "orrs r0, r1\n\t" - "lsls r5, r4, #1\n\t" - "orrs r5, r5, r0, lsr #31\n\t" - "ands r5, #0x7\n\t" - "\n\t" - "@ Compute the rounding: r6 is set to 0 or 1, and will be added\n\t" - "@ to the mantissa.\n\t" - "movs r6, #0xC8\n\t" - "lsrs r6, r5\n\t" - "ands r6, #1\n\t" - "\n\t" - "@ Put the mantissa (53 bits, in the 2^52..2^53-1 range) in r0:r1\n\t" - "@ (rounding not applied yet).\n\t" - "lsrs r0, r4, #1\n\t" - "orrs r0, r0, r3, lsl #23\n\t" - "lsrs r1, r3, #9\n\t" - "\n\t" - "@ Compute new exponent. This is half the old one (then reencoded\n\t" - "@ by adding 1023). Exception: if the mantissa is zero, then the\n\t" - "@ encoded exponent is set to 0. 
At that point, if the mantissa\n\t" - "@ is non-zero, then its high bit (bit 52, i.e. bit 20 of r1) is\n\t" - "@ non-zero. Note that the exponent cannot go out of range.\n\t" - "lsrs r2, r2, #1\n\t" - "addw r2, r2, #1023\n\t" - "lsrs r5, r1, #20\n\t" - "rsbs r5, r5, #0\n\t" - "ands r2, r5\n\t" - "\n\t" - "@ Place exponent. This overwrites the high bit of the mantissa.\n\t" - "bfi r1, r2, #20, #11\n\t" - "\n\t" - "@ Apply rounding. This may create a carry that will spill into\n\t" - "@ the exponent, which is exactly what should be done in that case\n\t" - "@ (i.e. increment the exponent).\n\t" - "adds r0, r0, r6\n\t" - "adcs r1, r1, #0\n\t" - "\n\t" - "pop { r4, r5, r6, r7, r8, r10, r11, pc }\n\t" - ); -} - -#else // yyyASM_CORTEXM4+0 - -fpr -fpr_sqrt(fpr x) -{ - uint64_t xu, q, s, r; - int ex, e; - - /* - * Extract the mantissa and the exponent. We don't care about - * the sign: by assumption, the operand is nonnegative. - * We want the "true" exponent corresponding to a mantissa - * in the 1..2 range. - */ - xu = (x & (((uint64_t)1 << 52) - 1)) | ((uint64_t)1 << 52); - ex = (int)((x >> 52) & 0x7FF); - e = ex - 1023; - - /* - * If the exponent is odd, double the mantissa and decrement - * the exponent. The exponent is then halved to account for - * the square root. - */ - xu += xu & -(uint64_t)(e & 1); - e >>= 1; - - /* - * Double the mantissa. - */ - xu <<= 1; - - /* - * We now have a mantissa in the 2^53..2^55-1 range. It - * represents a value between 1 (inclusive) and 4 (exclusive) - * in fixed point notation (with 53 fractional bits). We - * compute the square root bit by bit. - */ - q = 0; - s = 0; - r = (uint64_t)1 << 53; - for (int i = 0; i < 54; i ++) { - uint64_t t, b; - - t = s + r; - b = ((xu - t) >> 63) - 1; - s += (r << 1) & b; - xu -= t & b; - q += r & b; - xu <<= 1; - r >>= 1; - } - - /* - * Now, q is a rounded-low 54-bit value, with a leading 1, - * 52 fractional digits, and an additional guard bit. 
We add - * an extra sticky bit to account for what remains of the operand. - */ - q <<= 1; - q |= (xu | -xu) >> 63; - - /* - * Result q is in the 2^54..2^55-1 range; we bias the exponent - * by 54 bits (the value e at that point contains the "true" - * exponent, but q is now considered an integer, i.e. scaled - * up. - */ - e -= 54; - - /* - * Corrective action for an operand of value zero. - */ - q &= -(uint64_t)((ex + 0x7FF) >> 11); - - /* - * Apply rounding and back result. - */ - return FPR(0, e, q); -} - -#endif // yyyASM_CORTEXM4- - -uint64_t -fpr_expm_p63(fpr x, fpr ccs) -{ - /* - * Polynomial approximation of exp(-x) is taken from FACCT: - * https://eprint.iacr.org/2018/1234 - * Specifically, values are extracted from the implementation - * referenced from the FACCT article, and available at: - * https://github.com/raykzhao/gaussian - * Here, the coefficients have been scaled up by 2^63 and - * converted to integers. - * - * Tests over more than 24 billions of random inputs in the - * 0..log(2) range have never shown a deviation larger than - * 2^(-50) from the true mathematical value. - */ - static const uint64_t C[] = { - 0x00000004741183A3u, - 0x00000036548CFC06u, - 0x0000024FDCBF140Au, - 0x0000171D939DE045u, - 0x0000D00CF58F6F84u, - 0x000680681CF796E3u, - 0x002D82D8305B0FEAu, - 0x011111110E066FD0u, - 0x0555555555070F00u, - 0x155555555581FF00u, - 0x400000000002B400u, - 0x7FFFFFFFFFFF4800u, - 0x8000000000000000u - }; - - uint64_t z, y; - unsigned u; - uint32_t z0, z1, y0, y1; - uint64_t a, b; - - y = C[0]; - z = (uint64_t)fpr_trunc(fpr_mul(x, fpr_ptwo63)) << 1; - for (u = 1; u < (sizeof C) / sizeof(C[0]); u ++) { - /* - * Compute product z * y over 128 bits, but keep only - * the top 64 bits. 
- * - * TODO: On some architectures/compilers we could use - * some intrinsics (__umulh() on MSVC) or other compiler - * extensions (unsigned __int128 on GCC / Clang) for - * improved speed; however, most 64-bit architectures - * also have appropriate IEEE754 floating-point support, - * which is better. - */ - uint64_t c; - - z0 = (uint32_t)z; - z1 = (uint32_t)(z >> 32); - y0 = (uint32_t)y; - y1 = (uint32_t)(y >> 32); - a = ((uint64_t)z0 * (uint64_t)y1) - + (((uint64_t)z0 * (uint64_t)y0) >> 32); - b = ((uint64_t)z1 * (uint64_t)y0); - c = (a >> 32) + (b >> 32); - c += (((uint64_t)(uint32_t)a + (uint64_t)(uint32_t)b) >> 32); - c += (uint64_t)z1 * (uint64_t)y1; - y = C[u] - c; - } - - /* - * The scaling factor must be applied at the end. Since y is now - * in fixed-point notation, we have to convert the factor to the - * same format, and do an extra integer multiplication. - */ - z = (uint64_t)fpr_trunc(fpr_mul(ccs, fpr_ptwo63)) << 1; - z0 = (uint32_t)z; - z1 = (uint32_t)(z >> 32); - y0 = (uint32_t)y; - y1 = (uint32_t)(y >> 32); - a = ((uint64_t)z0 * (uint64_t)y1) - + (((uint64_t)z0 * (uint64_t)y0) >> 32); - b = ((uint64_t)z1 * (uint64_t)y0); - y = (a >> 32) + (b >> 32); - y += (((uint64_t)(uint32_t)a + (uint64_t)(uint32_t)b) >> 32); - y += (uint64_t)z1 * (uint64_t)y1; - - return y; -} - -const fpr fpr_gm_tab[] = { - 0, 0, - 9223372036854775808U, 4607182418800017408U, - 4604544271217802189U, 4604544271217802189U, - 13827916308072577997U, 4604544271217802189U, - 4606496786581982534U, 4600565431771507043U, - 13823937468626282851U, 4606496786581982534U, - 4600565431771507043U, 4606496786581982534U, - 13829868823436758342U, 4600565431771507043U, - 4607009347991985328U, 4596196889902818827U, - 13819568926757594635U, 4607009347991985328U, - 4603179351334086856U, 4605664432017547683U, - 13829036468872323491U, 4603179351334086856U, - 4605664432017547683U, 4603179351334086856U, - 13826551388188862664U, 4605664432017547683U, - 4596196889902818827U, 4607009347991985328U, - 
13830381384846761136U, 4596196889902818827U, - 4607139046673687846U, 4591727299969791020U, - 13815099336824566828U, 4607139046673687846U, - 4603889326261607894U, 4605137878724712257U, - 13828509915579488065U, 4603889326261607894U, - 4606118860100255153U, 4602163548591158843U, - 13825535585445934651U, 4606118860100255153U, - 4598900923775164166U, 4606794571824115162U, - 13830166608678890970U, 4598900923775164166U, - 4606794571824115162U, 4598900923775164166U, - 13822272960629939974U, 4606794571824115162U, - 4602163548591158843U, 4606118860100255153U, - 13829490896955030961U, 4602163548591158843U, - 4605137878724712257U, 4603889326261607894U, - 13827261363116383702U, 4605137878724712257U, - 4591727299969791020U, 4607139046673687846U, - 13830511083528463654U, 4591727299969791020U, - 4607171569234046334U, 4587232218149935124U, - 13810604255004710932U, 4607171569234046334U, - 4604224084862889120U, 4604849113969373103U, - 13828221150824148911U, 4604224084862889120U, - 4606317631232591731U, 4601373767755717824U, - 13824745804610493632U, 4606317631232591731U, - 4599740487990714333U, 4606655894547498725U, - 13830027931402274533U, 4599740487990714333U, - 4606912484326125783U, 4597922303871901467U, - 13821294340726677275U, 4606912484326125783U, - 4602805845399633902U, 4605900952042040894U, - 13829272988896816702U, 4602805845399633902U, - 4605409869824231233U, 4603540801876750389U, - 13826912838731526197U, 4605409869824231233U, - 4594454542771183930U, 4607084929468638487U, - 13830456966323414295U, 4594454542771183930U, - 4607084929468638487U, 4594454542771183930U, - 13817826579625959738U, 4607084929468638487U, - 4603540801876750389U, 4605409869824231233U, - 13828781906679007041U, 4603540801876750389U, - 4605900952042040894U, 4602805845399633902U, - 13826177882254409710U, 4605900952042040894U, - 4597922303871901467U, 4606912484326125783U, - 13830284521180901591U, 4597922303871901467U, - 4606655894547498725U, 4599740487990714333U, - 13823112524845490141U, 4606655894547498725U, - 
4601373767755717824U, 4606317631232591731U, - 13829689668087367539U, 4601373767755717824U, - 4604849113969373103U, 4604224084862889120U, - 13827596121717664928U, 4604849113969373103U, - 4587232218149935124U, 4607171569234046334U, - 13830543606088822142U, 4587232218149935124U, - 4607179706000002317U, 4582730748936808062U, - 13806102785791583870U, 4607179706000002317U, - 4604386048625945823U, 4604698657331085206U, - 13828070694185861014U, 4604386048625945823U, - 4606409688975526202U, 4600971798440897930U, - 13824343835295673738U, 4606409688975526202U, - 4600154912527631775U, 4606578871587619388U, - 13829950908442395196U, 4600154912527631775U, - 4606963563043808649U, 4597061974398750563U, - 13820434011253526371U, 4606963563043808649U, - 4602994049708411683U, 4605784983948558848U, - 13829157020803334656U, 4602994049708411683U, - 4605539368864982914U, 4603361638657888991U, - 13826733675512664799U, 4605539368864982914U, - 4595327571478659014U, 4607049811591515049U, - 13830421848446290857U, 4595327571478659014U, - 4607114680469659603U, 4593485039402578702U, - 13816857076257354510U, 4607114680469659603U, - 4603716733069447353U, 4605276012900672507U, - 13828648049755448315U, 4603716733069447353U, - 4606012266443150634U, 4602550884377336506U, - 13825922921232112314U, 4606012266443150634U, - 4598476289818621559U, 4606856142606846307U, - 13830228179461622115U, 4598476289818621559U, - 4606727809065869586U, 4599322407794599425U, - 13822694444649375233U, 4606727809065869586U, - 4601771097584682078U, 4606220668805321205U, - 13829592705660097013U, 4601771097584682078U, - 4604995550503212910U, 4604058477489546729U, - 13827430514344322537U, 4604995550503212910U, - 4589965306122607094U, 4607158013403433018U, - 13830530050258208826U, 4589965306122607094U, - 4607158013403433018U, 4589965306122607094U, - 13813337342977382902U, 4607158013403433018U, - 4604058477489546729U, 4604995550503212910U, - 13828367587357988718U, 4604058477489546729U, - 4606220668805321205U, 4601771097584682078U, - 
13825143134439457886U, 4606220668805321205U, - 4599322407794599425U, 4606727809065869586U, - 13830099845920645394U, 4599322407794599425U, - 4606856142606846307U, 4598476289818621559U, - 13821848326673397367U, 4606856142606846307U, - 4602550884377336506U, 4606012266443150634U, - 13829384303297926442U, 4602550884377336506U, - 4605276012900672507U, 4603716733069447353U, - 13827088769924223161U, 4605276012900672507U, - 4593485039402578702U, 4607114680469659603U, - 13830486717324435411U, 4593485039402578702U, - 4607049811591515049U, 4595327571478659014U, - 13818699608333434822U, 4607049811591515049U, - 4603361638657888991U, 4605539368864982914U, - 13828911405719758722U, 4603361638657888991U, - 4605784983948558848U, 4602994049708411683U, - 13826366086563187491U, 4605784983948558848U, - 4597061974398750563U, 4606963563043808649U, - 13830335599898584457U, 4597061974398750563U, - 4606578871587619388U, 4600154912527631775U, - 13823526949382407583U, 4606578871587619388U, - 4600971798440897930U, 4606409688975526202U, - 13829781725830302010U, 4600971798440897930U, - 4604698657331085206U, 4604386048625945823U, - 13827758085480721631U, 4604698657331085206U, - 4582730748936808062U, 4607179706000002317U, - 13830551742854778125U, 4582730748936808062U, - 4607181740574479067U, 4578227681973159812U, - 13801599718827935620U, 4607181740574479067U, - 4604465633578481725U, 4604621949701367983U, - 13827993986556143791U, 4604465633578481725U, - 4606453861145241227U, 4600769149537129431U, - 13824141186391905239U, 4606453861145241227U, - 4600360675823176935U, 4606538458821337243U, - 13829910495676113051U, 4600360675823176935U, - 4606987119037722413U, 4596629994023683153U, - 13820002030878458961U, 4606987119037722413U, - 4603087070374583113U, 4605725276488455441U, - 13829097313343231249U, 4603087070374583113U, - 4605602459698789090U, 4603270878689749849U, - 13826642915544525657U, 4605602459698789090U, - 4595762727260045105U, 4607030246558998647U, - 13830402283413774455U, 4595762727260045105U, - 
4607127537664763515U, 4592606767730311893U, - 13815978804585087701U, 4607127537664763515U, - 4603803453461190356U, 4605207475328619533U, - 13828579512183395341U, 4603803453461190356U, - 4606066157444814153U, 4602357870542944470U, - 13825729907397720278U, 4606066157444814153U, - 4598688984595225406U, 4606826008603986804U, - 13830198045458762612U, 4598688984595225406U, - 4606761837001494797U, 4599112075441176914U, - 13822484112295952722U, 4606761837001494797U, - 4601967947786150793U, 4606170366472647579U, - 13829542403327423387U, 4601967947786150793U, - 4605067233569943231U, 4603974338538572089U, - 13827346375393347897U, 4605067233569943231U, - 4590846768565625881U, 4607149205763218185U, - 13830521242617993993U, 4590846768565625881U, - 4607165468267934125U, 4588998070480937184U, - 13812370107335712992U, 4607165468267934125U, - 4604141730443515286U, 4604922840319727473U, - 13828294877174503281U, 4604141730443515286U, - 4606269759522929756U, 4601573027631668967U, - 13824945064486444775U, 4606269759522929756U, - 4599531889160152938U, 4606692493141721470U, - 13830064529996497278U, 4599531889160152938U, - 4606884969294623682U, 4598262871476403630U, - 13821634908331179438U, 4606884969294623682U, - 4602710690099904183U, 4605957195211051218U, - 13829329232065827026U, 4602710690099904183U, - 4605343481119364930U, 4603629178146150899U, - 13827001215000926707U, 4605343481119364930U, - 4594016801320007031U, 4607100477024622401U, - 13830472513879398209U, 4594016801320007031U, - 4607068040143112603U, 4594891488091520602U, - 13818263524946296410U, 4607068040143112603U, - 4603451617570386922U, 4605475169017376660U, - 13828847205872152468U, 4603451617570386922U, - 4605843545406134034U, 4602900303344142735U, - 13826272340198918543U, 4605843545406134034U, - 4597492765973365521U, 4606938683557690074U, - 13830310720412465882U, 4597492765973365521U, - 4606618018794815019U, 4599948172872067014U, - 13823320209726842822U, 4606618018794815019U, - 4601173347964633034U, 4606364276725003740U, - 
13829736313579779548U, 4601173347964633034U, - 4604774382555066977U, 4604305528345395596U, - 13827677565200171404U, 4604774382555066977U, - 4585465300892538317U, 4607176315382986589U, - 13830548352237762397U, 4585465300892538317U, - 4607176315382986589U, 4585465300892538317U, - 13808837337747314125U, 4607176315382986589U, - 4604305528345395596U, 4604774382555066977U, - 13828146419409842785U, 4604305528345395596U, - 4606364276725003740U, 4601173347964633034U, - 13824545384819408842U, 4606364276725003740U, - 4599948172872067014U, 4606618018794815019U, - 13829990055649590827U, 4599948172872067014U, - 4606938683557690074U, 4597492765973365521U, - 13820864802828141329U, 4606938683557690074U, - 4602900303344142735U, 4605843545406134034U, - 13829215582260909842U, 4602900303344142735U, - 4605475169017376660U, 4603451617570386922U, - 13826823654425162730U, 4605475169017376660U, - 4594891488091520602U, 4607068040143112603U, - 13830440076997888411U, 4594891488091520602U, - 4607100477024622401U, 4594016801320007031U, - 13817388838174782839U, 4607100477024622401U, - 4603629178146150899U, 4605343481119364930U, - 13828715517974140738U, 4603629178146150899U, - 4605957195211051218U, 4602710690099904183U, - 13826082726954679991U, 4605957195211051218U, - 4598262871476403630U, 4606884969294623682U, - 13830257006149399490U, 4598262871476403630U, - 4606692493141721470U, 4599531889160152938U, - 13822903926014928746U, 4606692493141721470U, - 4601573027631668967U, 4606269759522929756U, - 13829641796377705564U, 4601573027631668967U, - 4604922840319727473U, 4604141730443515286U, - 13827513767298291094U, 4604922840319727473U, - 4588998070480937184U, 4607165468267934125U, - 13830537505122709933U, 4588998070480937184U, - 4607149205763218185U, 4590846768565625881U, - 13814218805420401689U, 4607149205763218185U, - 4603974338538572089U, 4605067233569943231U, - 13828439270424719039U, 4603974338538572089U, - 4606170366472647579U, 4601967947786150793U, - 13825339984640926601U, 4606170366472647579U, - 
4599112075441176914U, 4606761837001494797U, - 13830133873856270605U, 4599112075441176914U, - 4606826008603986804U, 4598688984595225406U, - 13822061021450001214U, 4606826008603986804U, - 4602357870542944470U, 4606066157444814153U, - 13829438194299589961U, 4602357870542944470U, - 4605207475328619533U, 4603803453461190356U, - 13827175490315966164U, 4605207475328619533U, - 4592606767730311893U, 4607127537664763515U, - 13830499574519539323U, 4592606767730311893U, - 4607030246558998647U, 4595762727260045105U, - 13819134764114820913U, 4607030246558998647U, - 4603270878689749849U, 4605602459698789090U, - 13828974496553564898U, 4603270878689749849U, - 4605725276488455441U, 4603087070374583113U, - 13826459107229358921U, 4605725276488455441U, - 4596629994023683153U, 4606987119037722413U, - 13830359155892498221U, 4596629994023683153U, - 4606538458821337243U, 4600360675823176935U, - 13823732712677952743U, 4606538458821337243U, - 4600769149537129431U, 4606453861145241227U, - 13829825898000017035U, 4600769149537129431U, - 4604621949701367983U, 4604465633578481725U, - 13827837670433257533U, 4604621949701367983U, - 4578227681973159812U, 4607181740574479067U, - 13830553777429254875U, 4578227681973159812U, - 4607182249242036882U, 4573724215515480177U, - 13797096252370255985U, 4607182249242036882U, - 4604505071555817232U, 4604583231088591477U, - 13827955267943367285U, 4604505071555817232U, - 4606475480113671417U, 4600667422348321968U, - 13824039459203097776U, 4606475480113671417U, - 4600463181646572228U, 4606517779747998088U, - 13829889816602773896U, 4600463181646572228U, - 4606998399608725124U, 4596413578358834022U, - 13819785615213609830U, 4606998399608725124U, - 4603133304188877240U, 4605694995810664660U, - 13829067032665440468U, 4603133304188877240U, - 4605633586259814045U, 4603225210076562971U, - 13826597246931338779U, 4605633586259814045U, - 4595979936813835462U, 4607019963775302583U, - 13830392000630078391U, 4595979936813835462U, - 4607133460805585796U, 4592167175087283203U, - 
13815539211942059011U, 4607133460805585796U, - 4603846496621587377U, 4605172808754305228U, - 13828544845609081036U, 4603846496621587377U, - 4606092657816072624U, 4602260871257280788U, - 13825632908112056596U, 4606092657816072624U, - 4598795050632330097U, 4606810452769876110U, - 13830182489624651918U, 4598795050632330097U, - 4606778366364612594U, 4599006600037663623U, - 13822378636892439431U, 4606778366364612594U, - 4602065906208722008U, 4606144763310860551U, - 13829516800165636359U, 4602065906208722008U, - 4605102686554936490U, 4603931940768740167U, - 13827303977623515975U, 4605102686554936490U, - 4591287158938884897U, 4607144295058764886U, - 13830516331913540694U, 4591287158938884897U, - 4607168688050493276U, 4588115294056142819U, - 13811487330910918627U, 4607168688050493276U, - 4604183020748362039U, 4604886103475043762U, - 13828258140329819570U, 4604183020748362039U, - 4606293848208650998U, 4601473544562720001U, - 13824845581417495809U, 4606293848208650998U, - 4599636300858866724U, 4606674353838411301U, - 13830046390693187109U, 4599636300858866724U, - 4606898891031025132U, 4598136582470364665U, - 13821508619325140473U, 4606898891031025132U, - 4602758354025980442U, 4605929219593405673U, - 13829301256448181481U, 4602758354025980442U, - 4605376811039722786U, 4603585091850767959U, - 13826957128705543767U, 4605376811039722786U, - 4594235767444503503U, 4607092871118901179U, - 13830464907973676987U, 4594235767444503503U, - 4607076652372832968U, 4594673119063280916U, - 13818045155918056724U, 4607076652372832968U, - 4603496309891590679U, 4605442656228245717U, - 13828814693083021525U, 4603496309891590679U, - 4605872393621214213U, 4602853162432841185U, - 13826225199287616993U, 4605872393621214213U, - 4597707695679609371U, 4606925748668145757U, - 13830297785522921565U, 4597707695679609371U, - 4606637115963965612U, 4599844446633109139U, - 13823216483487884947U, 4606637115963965612U, - 4601273700967202825U, 4606341107699334546U, - 13829713144554110354U, 4601273700967202825U, - 
4604811873195349477U, 4604264921241055824U, - 13827636958095831632U, 4604811873195349477U, - 4586348876009622851U, 4607174111710118367U, - 13830546148564894175U, 4586348876009622851U, - 4607178180169683960U, 4584498631466405633U, - 13807870668321181441U, 4607178180169683960U, - 4604345904647073908U, 4604736643460027021U, - 13828108680314802829U, 4604345904647073908U, - 4606387137437298591U, 4601072712526242277U, - 13824444749381018085U, 4606387137437298591U, - 4600051662802353687U, 4606598603759044570U, - 13829970640613820378U, 4600051662802353687U, - 4606951288507767453U, 4597277522845151878U, - 13820649559699927686U, 4606951288507767453U, - 4602947266358709886U, 4605814408482919348U, - 13829186445337695156U, 4602947266358709886U, - 4605507406967535927U, 4603406726595779752U, - 13826778763450555560U, 4605507406967535927U, - 4595109641634432498U, 4607059093103722971U, - 13830431129958498779U, 4595109641634432498U, - 4607107746899444102U, 4593797652641645341U, - 13817169689496421149U, 4607107746899444102U, - 4603673059103075106U, 4605309881318010327U, - 13828681918172786135U, 4603673059103075106U, - 4605984877841711338U, 4602646891659203088U, - 13826018928513978896U, 4605984877841711338U, - 4598369669086960528U, 4606870719641066940U, - 13830242756495842748U, 4598369669086960528U, - 4606710311774494716U, 4599427256825614420U, - 13822799293680390228U, 4606710311774494716U, - 4601672213217083403U, 4606245366082353408U, - 13829617402937129216U, 4601672213217083403U, - 4604959323120302796U, 4604100215502905499U, - 13827472252357681307U, 4604959323120302796U, - 4589524267239410099U, 4607161910007591876U, - 13830533946862367684U, 4589524267239410099U, - 4607153778602162496U, 4590406145430462614U, - 13813778182285238422U, 4607153778602162496U, - 4604016517974851588U, 4605031521104517324U, - 13828403557959293132U, 4604016517974851588U, - 4606195668621671667U, 4601869677011524443U, - 13825241713866300251U, 4606195668621671667U, - 4599217346014614711U, 4606744984357082948U, - 
13830117021211858756U, 4599217346014614711U, - 4606841238740778884U, 4598582729657176439U, - 13821954766511952247U, 4606841238740778884U, - 4602454542796181607U, 4606039359984203741U, - 13829411396838979549U, 4602454542796181607U, - 4605241877142478242U, 4603760198400967492U, - 13827132235255743300U, 4605241877142478242U, - 4593046061348462537U, 4607121277474223905U, - 13830493314328999713U, 4593046061348462537U, - 4607040195955932526U, 4595545269419264690U, - 13818917306274040498U, 4607040195955932526U, - 4603316355454250015U, 4605571053506370248U, - 13828943090361146056U, 4603316355454250015U, - 4605755272910869620U, 4603040651631881451U, - 13826412688486657259U, 4605755272910869620U, - 4596846128749438754U, 4606975506703684317U, - 13830347543558460125U, 4596846128749438754U, - 4606558823023444576U, 4600257918160607478U, - 13823629955015383286U, 4606558823023444576U, - 4600870609507958271U, 4606431930490633905U, - 13829803967345409713U, 4600870609507958271U, - 4604660425598397818U, 4604425958770613225U, - 13827797995625389033U, 4604660425598397818U, - 4580962600092897021U, 4607180892816495009U, - 13830552929671270817U, 4580962600092897021U, - 4607180892816495009U, 4580962600092897021U, - 13804334636947672829U, 4607180892816495009U, - 4604425958770613225U, 4604660425598397818U, - 13828032462453173626U, 4604425958770613225U, - 4606431930490633905U, 4600870609507958271U, - 13824242646362734079U, 4606431930490633905U, - 4600257918160607478U, 4606558823023444576U, - 13829930859878220384U, 4600257918160607478U, - 4606975506703684317U, 4596846128749438754U, - 13820218165604214562U, 4606975506703684317U, - 4603040651631881451U, 4605755272910869620U, - 13829127309765645428U, 4603040651631881451U, - 4605571053506370248U, 4603316355454250015U, - 13826688392309025823U, 4605571053506370248U, - 4595545269419264690U, 4607040195955932526U, - 13830412232810708334U, 4595545269419264690U, - 4607121277474223905U, 4593046061348462537U, - 13816418098203238345U, 4607121277474223905U, - 
4603760198400967492U, 4605241877142478242U, - 13828613913997254050U, 4603760198400967492U, - 4606039359984203741U, 4602454542796181607U, - 13825826579650957415U, 4606039359984203741U, - 4598582729657176439U, 4606841238740778884U, - 13830213275595554692U, 4598582729657176439U, - 4606744984357082948U, 4599217346014614711U, - 13822589382869390519U, 4606744984357082948U, - 4601869677011524443U, 4606195668621671667U, - 13829567705476447475U, 4601869677011524443U, - 4605031521104517324U, 4604016517974851588U, - 13827388554829627396U, 4605031521104517324U, - 4590406145430462614U, 4607153778602162496U, - 13830525815456938304U, 4590406145430462614U, - 4607161910007591876U, 4589524267239410099U, - 13812896304094185907U, 4607161910007591876U, - 4604100215502905499U, 4604959323120302796U, - 13828331359975078604U, 4604100215502905499U, - 4606245366082353408U, 4601672213217083403U, - 13825044250071859211U, 4606245366082353408U, - 4599427256825614420U, 4606710311774494716U, - 13830082348629270524U, 4599427256825614420U, - 4606870719641066940U, 4598369669086960528U, - 13821741705941736336U, 4606870719641066940U, - 4602646891659203088U, 4605984877841711338U, - 13829356914696487146U, 4602646891659203088U, - 4605309881318010327U, 4603673059103075106U, - 13827045095957850914U, 4605309881318010327U, - 4593797652641645341U, 4607107746899444102U, - 13830479783754219910U, 4593797652641645341U, - 4607059093103722971U, 4595109641634432498U, - 13818481678489208306U, 4607059093103722971U, - 4603406726595779752U, 4605507406967535927U, - 13828879443822311735U, 4603406726595779752U, - 4605814408482919348U, 4602947266358709886U, - 13826319303213485694U, 4605814408482919348U, - 4597277522845151878U, 4606951288507767453U, - 13830323325362543261U, 4597277522845151878U, - 4606598603759044570U, 4600051662802353687U, - 13823423699657129495U, 4606598603759044570U, - 4601072712526242277U, 4606387137437298591U, - 13829759174292074399U, 4601072712526242277U, - 4604736643460027021U, 4604345904647073908U, - 
13827717941501849716U, 4604736643460027021U, - 4584498631466405633U, 4607178180169683960U, - 13830550217024459768U, 4584498631466405633U, - 4607174111710118367U, 4586348876009622851U, - 13809720912864398659U, 4607174111710118367U, - 4604264921241055824U, 4604811873195349477U, - 13828183910050125285U, 4604264921241055824U, - 4606341107699334546U, 4601273700967202825U, - 13824645737821978633U, 4606341107699334546U, - 4599844446633109139U, 4606637115963965612U, - 13830009152818741420U, 4599844446633109139U, - 4606925748668145757U, 4597707695679609371U, - 13821079732534385179U, 4606925748668145757U, - 4602853162432841185U, 4605872393621214213U, - 13829244430475990021U, 4602853162432841185U, - 4605442656228245717U, 4603496309891590679U, - 13826868346746366487U, 4605442656228245717U, - 4594673119063280916U, 4607076652372832968U, - 13830448689227608776U, 4594673119063280916U, - 4607092871118901179U, 4594235767444503503U, - 13817607804299279311U, 4607092871118901179U, - 4603585091850767959U, 4605376811039722786U, - 13828748847894498594U, 4603585091850767959U, - 4605929219593405673U, 4602758354025980442U, - 13826130390880756250U, 4605929219593405673U, - 4598136582470364665U, 4606898891031025132U, - 13830270927885800940U, 4598136582470364665U, - 4606674353838411301U, 4599636300858866724U, - 13823008337713642532U, 4606674353838411301U, - 4601473544562720001U, 4606293848208650998U, - 13829665885063426806U, 4601473544562720001U, - 4604886103475043762U, 4604183020748362039U, - 13827555057603137847U, 4604886103475043762U, - 4588115294056142819U, 4607168688050493276U, - 13830540724905269084U, 4588115294056142819U, - 4607144295058764886U, 4591287158938884897U, - 13814659195793660705U, 4607144295058764886U, - 4603931940768740167U, 4605102686554936490U, - 13828474723409712298U, 4603931940768740167U, - 4606144763310860551U, 4602065906208722008U, - 13825437943063497816U, 4606144763310860551U, - 4599006600037663623U, 4606778366364612594U, - 13830150403219388402U, 4599006600037663623U, - 
4606810452769876110U, 4598795050632330097U, - 13822167087487105905U, 4606810452769876110U, - 4602260871257280788U, 4606092657816072624U, - 13829464694670848432U, 4602260871257280788U, - 4605172808754305228U, 4603846496621587377U, - 13827218533476363185U, 4605172808754305228U, - 4592167175087283203U, 4607133460805585796U, - 13830505497660361604U, 4592167175087283203U, - 4607019963775302583U, 4595979936813835462U, - 13819351973668611270U, 4607019963775302583U, - 4603225210076562971U, 4605633586259814045U, - 13829005623114589853U, 4603225210076562971U, - 4605694995810664660U, 4603133304188877240U, - 13826505341043653048U, 4605694995810664660U, - 4596413578358834022U, 4606998399608725124U, - 13830370436463500932U, 4596413578358834022U, - 4606517779747998088U, 4600463181646572228U, - 13823835218501348036U, 4606517779747998088U, - 4600667422348321968U, 4606475480113671417U, - 13829847516968447225U, 4600667422348321968U, - 4604583231088591477U, 4604505071555817232U, - 13827877108410593040U, 4604583231088591477U, - 4573724215515480177U, 4607182249242036882U, - 13830554286096812690U, 4573724215515480177U, - 4607182376410422530U, 4569220649180767418U, - 13792592686035543226U, 4607182376410422530U, - 4604524701268679793U, 4604563781218984604U, - 13827935818073760412U, 4604524701268679793U, - 4606486172460753999U, 4600616459743653188U, - 13823988496598428996U, 4606486172460753999U, - 4600514338912178239U, 4606507322377452870U, - 13829879359232228678U, 4600514338912178239U, - 4607003915349878877U, 4596305267720071930U, - 13819677304574847738U, 4607003915349878877U, - 4603156351203636159U, 4605679749231851918U, - 13829051786086627726U, 4603156351203636159U, - 4605649044311923410U, 4603202304363743346U, - 13826574341218519154U, 4605649044311923410U, - 4596088445927168004U, 4607014697483910382U, - 13830386734338686190U, 4596088445927168004U, - 4607136295912168606U, 4591947271803021404U, - 13815319308657797212U, 4607136295912168606U, - 4603867938232615808U, 4605155376589456981U, - 
13828527413444232789U, 4603867938232615808U, - 4606105796280968177U, 4602212250118051877U, - 13825584286972827685U, 4606105796280968177U, - 4598848011564831930U, 4606802552898869248U, - 13830174589753645056U, 4598848011564831930U, - 4606786509620734768U, 4598953786765296928U, - 13822325823620072736U, 4606786509620734768U, - 4602114767134999006U, 4606131849150971908U, - 13829503886005747716U, 4602114767134999006U, - 4605120315324767624U, 4603910660507251362U, - 13827282697362027170U, 4605120315324767624U, - 4591507261658050721U, 4607141713064252300U, - 13830513749919028108U, 4591507261658050721U, - 4607170170974224083U, 4587673791460508439U, - 13811045828315284247U, 4607170170974224083U, - 4604203581176243359U, 4604867640218014515U, - 13828239677072790323U, 4604203581176243359U, - 4606305777984577632U, 4601423692641949331U, - 13824795729496725139U, 4606305777984577632U, - 4599688422741010356U, 4606665164148251002U, - 13830037201003026810U, 4599688422741010356U, - 4606905728766014348U, 4598029484874872834U, - 13821401521729648642U, 4606905728766014348U, - 4602782121393764535U, 4605915122243179241U, - 13829287159097955049U, 4602782121393764535U, - 4605393374401988274U, 4603562972219549215U, - 13826935009074325023U, 4605393374401988274U, - 4594345179472540681U, 4607088942243446236U, - 13830460979098222044U, 4594345179472540681U, - 4607080832832247697U, 4594563856311064231U, - 13817935893165840039U, 4607080832832247697U, - 4603518581031047189U, 4605426297151190466U, - 13828798334005966274U, 4603518581031047189U, - 4605886709123365959U, 4602829525820289164U, - 13826201562675064972U, 4605886709123365959U, - 4597815040470278984U, 4606919157647773535U, - 13830291194502549343U, 4597815040470278984U, - 4606646545123403481U, 4599792496117920694U, - 13823164532972696502U, 4606646545123403481U, - 4601323770373937522U, 4606329407841126011U, - 13829701444695901819U, 4601323770373937522U, - 4604830524903495634U, 4604244531615310815U, - 13827616568470086623U, 4604830524903495634U, - 
4586790578280679046U, 4607172882816799076U, - 13830544919671574884U, 4586790578280679046U, - 4607178985458280057U, 4583614727651146525U, - 13806986764505922333U, 4607178985458280057U, - 4604366005771528720U, 4604717681185626434U, - 13828089718040402242U, 4604366005771528720U, - 4606398451906509788U, 4601022290077223616U, - 13824394326931999424U, 4606398451906509788U, - 4600103317933788342U, 4606588777269136769U, - 13829960814123912577U, 4600103317933788342U, - 4606957467106717424U, 4597169786279785693U, - 13820541823134561501U, 4606957467106717424U, - 4602970680601913687U, 4605799732098147061U, - 13829171768952922869U, 4602970680601913687U, - 4605523422498301790U, 4603384207141321914U, - 13826756243996097722U, 4605523422498301790U, - 4595218635031890910U, 4607054494135176056U, - 13830426530989951864U, 4595218635031890910U, - 4607111255739239816U, 4593688012422887515U, - 13817060049277663323U, 4607111255739239816U, - 4603694922063032361U, 4605292980606880364U, - 13828665017461656172U, 4603694922063032361U, - 4605998608960791335U, 4602598930031891166U, - 13825970966886666974U, 4605998608960791335U, - 4598423001813699022U, 4606863472012527185U, - 13830235508867302993U, 4598423001813699022U, - 4606719100629313491U, 4599374859150636784U, - 13822746896005412592U, 4606719100629313491U, - 4601721693286060937U, 4606233055365547081U, - 13829605092220322889U, 4601721693286060937U, - 4604977468824438271U, 4604079374282302598U, - 13827451411137078406U, 4604977468824438271U, - 4589744810590291021U, 4607160003989618959U, - 13830532040844394767U, 4589744810590291021U, - 4607155938267770208U, 4590185751760970393U, - 13813557788615746201U, 4607155938267770208U, - 4604037525321326463U, 4605013567986435066U, - 13828385604841210874U, 4604037525321326463U, - 4606208206518262803U, 4601820425647934753U, - 13825192462502710561U, 4606208206518262803U, - 4599269903251194481U, 4606736437002195879U, - 13830108473856971687U, 4599269903251194481U, - 4606848731493011465U, 4598529532600161144U, - 
13821901569454936952U, 4606848731493011465U, - 4602502755147763107U, 4606025850160239809U, - 13829397887015015617U, 4602502755147763107U, - 4605258978359093269U, 4603738491917026584U, - 13827110528771802392U, 4605258978359093269U, - 4593265590854265407U, 4607118021058468598U, - 13830490057913244406U, 4593265590854265407U, - 4607045045516813836U, 4595436449949385485U, - 13818808486804161293U, 4607045045516813836U, - 4603339021357904144U, 4605555245917486022U, - 13828927282772261830U, 4603339021357904144U, - 4605770164172969910U, 4603017373458244943U, - 13826389410313020751U, 4605770164172969910U, - 4596954088216812973U, 4606969576261663845U, - 13830341613116439653U, 4596954088216812973U, - 4606568886807728474U, 4600206446098256018U, - 13823578482953031826U, 4606568886807728474U, - 4600921238092511730U, 4606420848538580260U, - 13829792885393356068U, 4600921238092511730U, - 4604679572075463103U, 4604406033021674239U, - 13827778069876450047U, 4604679572075463103U, - 4581846703643734566U, 4607180341788068727U, - 13830552378642844535U, 4581846703643734566U, - 4607181359080094673U, 4579996072175835083U, - 13803368109030610891U, 4607181359080094673U, - 4604445825685214043U, 4604641218080103285U, - 13828013254934879093U, 4604445825685214043U, - 4606442934727379583U, 4600819913163773071U, - 13824191950018548879U, 4606442934727379583U, - 4600309328230211502U, 4606548680329491866U, - 13829920717184267674U, 4600309328230211502U, - 4606981354314050484U, 4596738097012783531U, - 13820110133867559339U, 4606981354314050484U, - 4603063884010218172U, 4605740310302420207U, - 13829112347157196015U, 4603063884010218172U, - 4605586791482848547U, 4603293641160266722U, - 13826665678015042530U, 4605586791482848547U, - 4595654028864046335U, 4607035262954517034U, - 13830407299809292842U, 4595654028864046335U, - 4607124449686274900U, 4592826452951465409U, - 13816198489806241217U, 4607124449686274900U, - 4603781852316960384U, 4605224709411790590U, - 13828596746266566398U, 4603781852316960384U, - 
4606052795787882823U, 4602406247776385022U, - 13825778284631160830U, 4606052795787882823U, - 4598635880488956483U, 4606833664420673202U, - 13830205701275449010U, 4598635880488956483U, - 4606753451050079834U, 4599164736579548843U, - 13822536773434324651U, 4606753451050079834U, - 4601918851211878557U, 4606183055233559255U, - 13829555092088335063U, 4601918851211878557U, - 4605049409688478101U, 4603995455647851249U, - 13827367492502627057U, 4605049409688478101U, - 4590626485056654602U, 4607151534426937478U, - 13830523571281713286U, 4590626485056654602U, - 4607163731439411601U, 4589303678145802340U, - 13812675715000578148U, 4607163731439411601U, - 4604121000955189926U, 4604941113561600762U, - 13828313150416376570U, 4604121000955189926U, - 4606257600839867033U, 4601622657843474729U, - 13824994694698250537U, 4606257600839867033U, - 4599479600326345459U, 4606701442584137310U, - 13830073479438913118U, 4599479600326345459U, - 4606877885424248132U, 4598316292140394014U, - 13821688328995169822U, 4606877885424248132U, - 4602686793990243041U, 4605971073215153165U, - 13829343110069928973U, 4602686793990243041U, - 4605326714874986465U, 4603651144395358093U, - 13827023181250133901U, 4605326714874986465U, - 4593907249284540294U, 4607104153983298999U, - 13830476190838074807U, 4593907249284540294U, - 4607063608453868552U, 4595000592312171144U, - 13818372629166946952U, 4607063608453868552U, - 4603429196809300824U, 4605491322423429598U, - 13828863359278205406U, 4603429196809300824U, - 4605829012964735987U, 4602923807199184054U, - 13826295844053959862U, 4605829012964735987U, - 4597385183080791534U, 4606945027305114062U, - 13830317064159889870U, 4597385183080791534U, - 4606608350964852124U, 4599999947619525579U, - 13823371984474301387U, 4606608350964852124U, - 4601123065313358619U, 4606375745674388705U, - 13829747782529164513U, 4601123065313358619U, - 4604755543975806820U, 4604325745441780828U, - 13827697782296556636U, 4604755543975806820U, - 4585023436363055487U, 4607177290141793710U, - 
13830549326996569518U, 4585023436363055487U, - 4607175255902437396U, 4585907115494236537U, - 13809279152349012345U, 4607175255902437396U, - 4604285253548209224U, 4604793159020491611U, - 13828165195875267419U, 4604285253548209224U, - 4606352730697093817U, 4601223560006786057U, - 13824595596861561865U, 4606352730697093817U, - 4599896339047301634U, 4606627607157935956U, - 13829999644012711764U, 4599896339047301634U, - 4606932257325205256U, 4597600270510262682U, - 13820972307365038490U, 4606932257325205256U, - 4602876755014813164U, 4605858005670328613U, - 13829230042525104421U, 4602876755014813164U, - 4605458946901419122U, 4603473988668005304U, - 13826846025522781112U, 4605458946901419122U, - 4594782329999411347U, 4607072388129742377U, - 13830444424984518185U, 4594782329999411347U, - 4607096716058023245U, 4594126307716900071U, - 13817498344571675879U, 4607096716058023245U, - 4603607160562208225U, 4605360179893335444U, - 13828732216748111252U, 4603607160562208225U, - 4605943243960030558U, 4602734543519989142U, - 13826106580374764950U, 4605943243960030558U, - 4598209407597805010U, 4606891971185517504U, - 13830264008040293312U, 4598209407597805010U, - 4606683463531482757U, 4599584122834874440U, - 13822956159689650248U, 4606683463531482757U, - 4601523323048804569U, 4606281842017099424U, - 13829653878871875232U, 4601523323048804569U, - 4604904503566677638U, 4604162403772767740U, - 13827534440627543548U, 4604904503566677638U, - 4588556721781247689U, 4607167120476811757U, - 13830539157331587565U, 4588556721781247689U, - 4607146792632922887U, 4591066993883984169U, - 13814439030738759977U, 4607146792632922887U, - 4603953166845776383U, 4605084992581147553U, - 13828457029435923361U, 4603953166845776383U, - 4606157602458368090U, 4602016966272225497U, - 13825389003127001305U, 4606157602458368090U, - 4599059363095165615U, 4606770142132396069U, - 13830142178987171877U, 4599059363095165615U, - 4606818271362779153U, 4598742041476147134U, - 13822114078330922942U, 4606818271362779153U, - 
4602309411551204896U, 4606079444829232727U, - 13829451481684008535U, 4602309411551204896U, - 4605190175055178825U, 4603825001630339212U, - 13827197038485115020U, 4605190175055178825U, - 4592387007752762956U, 4607130541380624519U, - 13830502578235400327U, 4592387007752762956U, - 4607025146816593591U, 4595871363584150300U, - 13819243400438926108U, 4607025146816593591U, - 4603248068256948438U, 4605618058006716661U, - 13828990094861492469U, 4603248068256948438U, - 4605710171610479304U, 4603110210506737381U, - 13826482247361513189U, 4605710171610479304U, - 4596521820799644122U, 4606992800820440327U, - 13830364837675216135U, 4596521820799644122U, - 4606528158595189433U, 4600411960456200676U, - 13823783997310976484U, 4606528158595189433U, - 4600718319105833937U, 4606464709641375231U, - 13829836746496151039U, 4600718319105833937U, - 4604602620643553229U, 4604485382263976838U, - 13827857419118752646U, 4604602620643553229U, - 4576459225186735875U, 4607182037296057423U, - 13830554074150833231U, 4576459225186735875U, - 4607182037296057423U, 4576459225186735875U, - 13799831262041511683U, 4607182037296057423U, - 4604485382263976838U, 4604602620643553229U, - 13827974657498329037U, 4604485382263976838U, - 4606464709641375231U, 4600718319105833937U, - 13824090355960609745U, 4606464709641375231U, - 4600411960456200676U, 4606528158595189433U, - 13829900195449965241U, 4600411960456200676U, - 4606992800820440327U, 4596521820799644122U, - 13819893857654419930U, 4606992800820440327U, - 4603110210506737381U, 4605710171610479304U, - 13829082208465255112U, 4603110210506737381U, - 4605618058006716661U, 4603248068256948438U, - 13826620105111724246U, 4605618058006716661U, - 4595871363584150300U, 4607025146816593591U, - 13830397183671369399U, 4595871363584150300U, - 4607130541380624519U, 4592387007752762956U, - 13815759044607538764U, 4607130541380624519U, - 4603825001630339212U, 4605190175055178825U, - 13828562211909954633U, 4603825001630339212U, - 4606079444829232727U, 4602309411551204896U, - 
13825681448405980704U, 4606079444829232727U, - 4598742041476147134U, 4606818271362779153U, - 13830190308217554961U, 4598742041476147134U, - 4606770142132396069U, 4599059363095165615U, - 13822431399949941423U, 4606770142132396069U, - 4602016966272225497U, 4606157602458368090U, - 13829529639313143898U, 4602016966272225497U, - 4605084992581147553U, 4603953166845776383U, - 13827325203700552191U, 4605084992581147553U, - 4591066993883984169U, 4607146792632922887U, - 13830518829487698695U, 4591066993883984169U, - 4607167120476811757U, 4588556721781247689U, - 13811928758636023497U, 4607167120476811757U, - 4604162403772767740U, 4604904503566677638U, - 13828276540421453446U, 4604162403772767740U, - 4606281842017099424U, 4601523323048804569U, - 13824895359903580377U, 4606281842017099424U, - 4599584122834874440U, 4606683463531482757U, - 13830055500386258565U, 4599584122834874440U, - 4606891971185517504U, 4598209407597805010U, - 13821581444452580818U, 4606891971185517504U, - 4602734543519989142U, 4605943243960030558U, - 13829315280814806366U, 4602734543519989142U, - 4605360179893335444U, 4603607160562208225U, - 13826979197416984033U, 4605360179893335444U, - 4594126307716900071U, 4607096716058023245U, - 13830468752912799053U, 4594126307716900071U, - 4607072388129742377U, 4594782329999411347U, - 13818154366854187155U, 4607072388129742377U, - 4603473988668005304U, 4605458946901419122U, - 13828830983756194930U, 4603473988668005304U, - 4605858005670328613U, 4602876755014813164U, - 13826248791869588972U, 4605858005670328613U, - 4597600270510262682U, 4606932257325205256U, - 13830304294179981064U, 4597600270510262682U, - 4606627607157935956U, 4599896339047301634U, - 13823268375902077442U, 4606627607157935956U, - 4601223560006786057U, 4606352730697093817U, - 13829724767551869625U, 4601223560006786057U, - 4604793159020491611U, 4604285253548209224U, - 13827657290402985032U, 4604793159020491611U, - 4585907115494236537U, 4607175255902437396U, - 13830547292757213204U, 4585907115494236537U, - 
4607177290141793710U, 4585023436363055487U, - 13808395473217831295U, 4607177290141793710U, - 4604325745441780828U, 4604755543975806820U, - 13828127580830582628U, 4604325745441780828U, - 4606375745674388705U, 4601123065313358619U, - 13824495102168134427U, 4606375745674388705U, - 4599999947619525579U, 4606608350964852124U, - 13829980387819627932U, 4599999947619525579U, - 4606945027305114062U, 4597385183080791534U, - 13820757219935567342U, 4606945027305114062U, - 4602923807199184054U, 4605829012964735987U, - 13829201049819511795U, 4602923807199184054U, - 4605491322423429598U, 4603429196809300824U, - 13826801233664076632U, 4605491322423429598U, - 4595000592312171144U, 4607063608453868552U, - 13830435645308644360U, 4595000592312171144U, - 4607104153983298999U, 4593907249284540294U, - 13817279286139316102U, 4607104153983298999U, - 4603651144395358093U, 4605326714874986465U, - 13828698751729762273U, 4603651144395358093U, - 4605971073215153165U, 4602686793990243041U, - 13826058830845018849U, 4605971073215153165U, - 4598316292140394014U, 4606877885424248132U, - 13830249922279023940U, 4598316292140394014U, - 4606701442584137310U, 4599479600326345459U, - 13822851637181121267U, 4606701442584137310U, - 4601622657843474729U, 4606257600839867033U, - 13829629637694642841U, 4601622657843474729U, - 4604941113561600762U, 4604121000955189926U, - 13827493037809965734U, 4604941113561600762U, - 4589303678145802340U, 4607163731439411601U, - 13830535768294187409U, 4589303678145802340U, - 4607151534426937478U, 4590626485056654602U, - 13813998521911430410U, 4607151534426937478U, - 4603995455647851249U, 4605049409688478101U, - 13828421446543253909U, 4603995455647851249U, - 4606183055233559255U, 4601918851211878557U, - 13825290888066654365U, 4606183055233559255U, - 4599164736579548843U, 4606753451050079834U, - 13830125487904855642U, 4599164736579548843U, - 4606833664420673202U, 4598635880488956483U, - 13822007917343732291U, 4606833664420673202U, - 4602406247776385022U, 4606052795787882823U, - 
13829424832642658631U, 4602406247776385022U, - 4605224709411790590U, 4603781852316960384U, - 13827153889171736192U, 4605224709411790590U, - 4592826452951465409U, 4607124449686274900U, - 13830496486541050708U, 4592826452951465409U, - 4607035262954517034U, 4595654028864046335U, - 13819026065718822143U, 4607035262954517034U, - 4603293641160266722U, 4605586791482848547U, - 13828958828337624355U, 4603293641160266722U, - 4605740310302420207U, 4603063884010218172U, - 13826435920864993980U, 4605740310302420207U, - 4596738097012783531U, 4606981354314050484U, - 13830353391168826292U, 4596738097012783531U, - 4606548680329491866U, 4600309328230211502U, - 13823681365084987310U, 4606548680329491866U, - 4600819913163773071U, 4606442934727379583U, - 13829814971582155391U, 4600819913163773071U, - 4604641218080103285U, 4604445825685214043U, - 13827817862539989851U, 4604641218080103285U, - 4579996072175835083U, 4607181359080094673U, - 13830553395934870481U, 4579996072175835083U, - 4607180341788068727U, 4581846703643734566U, - 13805218740498510374U, 4607180341788068727U, - 4604406033021674239U, 4604679572075463103U, - 13828051608930238911U, 4604406033021674239U, - 4606420848538580260U, 4600921238092511730U, - 13824293274947287538U, 4606420848538580260U, - 4600206446098256018U, 4606568886807728474U, - 13829940923662504282U, 4600206446098256018U, - 4606969576261663845U, 4596954088216812973U, - 13820326125071588781U, 4606969576261663845U, - 4603017373458244943U, 4605770164172969910U, - 13829142201027745718U, 4603017373458244943U, - 4605555245917486022U, 4603339021357904144U, - 13826711058212679952U, 4605555245917486022U, - 4595436449949385485U, 4607045045516813836U, - 13830417082371589644U, 4595436449949385485U, - 4607118021058468598U, 4593265590854265407U, - 13816637627709041215U, 4607118021058468598U, - 4603738491917026584U, 4605258978359093269U, - 13828631015213869077U, 4603738491917026584U, - 4606025850160239809U, 4602502755147763107U, - 13825874792002538915U, 4606025850160239809U, - 
4598529532600161144U, 4606848731493011465U, - 13830220768347787273U, 4598529532600161144U, - 4606736437002195879U, 4599269903251194481U, - 13822641940105970289U, 4606736437002195879U, - 4601820425647934753U, 4606208206518262803U, - 13829580243373038611U, 4601820425647934753U, - 4605013567986435066U, 4604037525321326463U, - 13827409562176102271U, 4605013567986435066U, - 4590185751760970393U, 4607155938267770208U, - 13830527975122546016U, 4590185751760970393U, - 4607160003989618959U, 4589744810590291021U, - 13813116847445066829U, 4607160003989618959U, - 4604079374282302598U, 4604977468824438271U, - 13828349505679214079U, 4604079374282302598U, - 4606233055365547081U, 4601721693286060937U, - 13825093730140836745U, 4606233055365547081U, - 4599374859150636784U, 4606719100629313491U, - 13830091137484089299U, 4599374859150636784U, - 4606863472012527185U, 4598423001813699022U, - 13821795038668474830U, 4606863472012527185U, - 4602598930031891166U, 4605998608960791335U, - 13829370645815567143U, 4602598930031891166U, - 4605292980606880364U, 4603694922063032361U, - 13827066958917808169U, 4605292980606880364U, - 4593688012422887515U, 4607111255739239816U, - 13830483292594015624U, 4593688012422887515U, - 4607054494135176056U, 4595218635031890910U, - 13818590671886666718U, 4607054494135176056U, - 4603384207141321914U, 4605523422498301790U, - 13828895459353077598U, 4603384207141321914U, - 4605799732098147061U, 4602970680601913687U, - 13826342717456689495U, 4605799732098147061U, - 4597169786279785693U, 4606957467106717424U, - 13830329503961493232U, 4597169786279785693U, - 4606588777269136769U, 4600103317933788342U, - 13823475354788564150U, 4606588777269136769U, - 4601022290077223616U, 4606398451906509788U, - 13829770488761285596U, 4601022290077223616U, - 4604717681185626434U, 4604366005771528720U, - 13827738042626304528U, 4604717681185626434U, - 4583614727651146525U, 4607178985458280057U, - 13830551022313055865U, 4583614727651146525U, - 4607172882816799076U, 4586790578280679046U, - 
13810162615135454854U, 4607172882816799076U, - 4604244531615310815U, 4604830524903495634U, - 13828202561758271442U, 4604244531615310815U, - 4606329407841126011U, 4601323770373937522U, - 13824695807228713330U, 4606329407841126011U, - 4599792496117920694U, 4606646545123403481U, - 13830018581978179289U, 4599792496117920694U, - 4606919157647773535U, 4597815040470278984U, - 13821187077325054792U, 4606919157647773535U, - 4602829525820289164U, 4605886709123365959U, - 13829258745978141767U, 4602829525820289164U, - 4605426297151190466U, 4603518581031047189U, - 13826890617885822997U, 4605426297151190466U, - 4594563856311064231U, 4607080832832247697U, - 13830452869687023505U, 4594563856311064231U, - 4607088942243446236U, 4594345179472540681U, - 13817717216327316489U, 4607088942243446236U, - 4603562972219549215U, 4605393374401988274U, - 13828765411256764082U, 4603562972219549215U, - 4605915122243179241U, 4602782121393764535U, - 13826154158248540343U, 4605915122243179241U, - 4598029484874872834U, 4606905728766014348U, - 13830277765620790156U, 4598029484874872834U, - 4606665164148251002U, 4599688422741010356U, - 13823060459595786164U, 4606665164148251002U, - 4601423692641949331U, 4606305777984577632U, - 13829677814839353440U, 4601423692641949331U, - 4604867640218014515U, 4604203581176243359U, - 13827575618031019167U, 4604867640218014515U, - 4587673791460508439U, 4607170170974224083U, - 13830542207828999891U, 4587673791460508439U, - 4607141713064252300U, 4591507261658050721U, - 13814879298512826529U, 4607141713064252300U, - 4603910660507251362U, 4605120315324767624U, - 13828492352179543432U, 4603910660507251362U, - 4606131849150971908U, 4602114767134999006U, - 13825486803989774814U, 4606131849150971908U, - 4598953786765296928U, 4606786509620734768U, - 13830158546475510576U, 4598953786765296928U, - 4606802552898869248U, 4598848011564831930U, - 13822220048419607738U, 4606802552898869248U, - 4602212250118051877U, 4606105796280968177U, - 13829477833135743985U, 4602212250118051877U, - 
4605155376589456981U, 4603867938232615808U, - 13827239975087391616U, 4605155376589456981U, - 4591947271803021404U, 4607136295912168606U, - 13830508332766944414U, 4591947271803021404U, - 4607014697483910382U, 4596088445927168004U, - 13819460482781943812U, 4607014697483910382U, - 4603202304363743346U, 4605649044311923410U, - 13829021081166699218U, 4603202304363743346U, - 4605679749231851918U, 4603156351203636159U, - 13826528388058411967U, 4605679749231851918U, - 4596305267720071930U, 4607003915349878877U, - 13830375952204654685U, 4596305267720071930U, - 4606507322377452870U, 4600514338912178239U, - 13823886375766954047U, 4606507322377452870U, - 4600616459743653188U, 4606486172460753999U, - 13829858209315529807U, 4600616459743653188U, - 4604563781218984604U, 4604524701268679793U, - 13827896738123455601U, 4604563781218984604U, - 4569220649180767418U, 4607182376410422530U, - 13830554413265198338U, 4569220649180767418U -}; - -const fpr fpr_p2_tab[] = { - 4611686018427387904U, - 4607182418800017408U, - 4602678819172646912U, - 4598175219545276416U, - 4593671619917905920U, - 4589168020290535424U, - 4584664420663164928U, - 4580160821035794432U, - 4575657221408423936U, - 4571153621781053440U, - 4566650022153682944U -}; - -#elif FALCON_FPNATIVE // yyyFPEMU+0 yyyFPNATIVE+1 - -const fpr fpr_gm_tab[] = { - {0}, {0}, /* unused */ - {-0.000000000000000000000000000}, { 1.000000000000000000000000000}, - { 0.707106781186547524400844362}, { 0.707106781186547524400844362}, - {-0.707106781186547524400844362}, { 0.707106781186547524400844362}, - { 0.923879532511286756128183189}, { 0.382683432365089771728459984}, - {-0.382683432365089771728459984}, { 0.923879532511286756128183189}, - { 0.382683432365089771728459984}, { 0.923879532511286756128183189}, - {-0.923879532511286756128183189}, { 0.382683432365089771728459984}, - { 0.980785280403230449126182236}, { 0.195090322016128267848284868}, - {-0.195090322016128267848284868}, { 0.980785280403230449126182236}, - { 
0.555570233019602224742830814}, { 0.831469612302545237078788378}, - {-0.831469612302545237078788378}, { 0.555570233019602224742830814}, - { 0.831469612302545237078788378}, { 0.555570233019602224742830814}, - {-0.555570233019602224742830814}, { 0.831469612302545237078788378}, - { 0.195090322016128267848284868}, { 0.980785280403230449126182236}, - {-0.980785280403230449126182236}, { 0.195090322016128267848284868}, - { 0.995184726672196886244836953}, { 0.098017140329560601994195564}, - {-0.098017140329560601994195564}, { 0.995184726672196886244836953}, - { 0.634393284163645498215171613}, { 0.773010453362736960810906610}, - {-0.773010453362736960810906610}, { 0.634393284163645498215171613}, - { 0.881921264348355029712756864}, { 0.471396736825997648556387626}, - {-0.471396736825997648556387626}, { 0.881921264348355029712756864}, - { 0.290284677254462367636192376}, { 0.956940335732208864935797887}, - {-0.956940335732208864935797887}, { 0.290284677254462367636192376}, - { 0.956940335732208864935797887}, { 0.290284677254462367636192376}, - {-0.290284677254462367636192376}, { 0.956940335732208864935797887}, - { 0.471396736825997648556387626}, { 0.881921264348355029712756864}, - {-0.881921264348355029712756864}, { 0.471396736825997648556387626}, - { 0.773010453362736960810906610}, { 0.634393284163645498215171613}, - {-0.634393284163645498215171613}, { 0.773010453362736960810906610}, - { 0.098017140329560601994195564}, { 0.995184726672196886244836953}, - {-0.995184726672196886244836953}, { 0.098017140329560601994195564}, - { 0.998795456205172392714771605}, { 0.049067674327418014254954977}, - {-0.049067674327418014254954977}, { 0.998795456205172392714771605}, - { 0.671558954847018400625376850}, { 0.740951125354959091175616897}, - {-0.740951125354959091175616897}, { 0.671558954847018400625376850}, - { 0.903989293123443331586200297}, { 0.427555093430282094320966857}, - {-0.427555093430282094320966857}, { 0.903989293123443331586200297}, - { 0.336889853392220050689253213}, { 
0.941544065183020778412509403}, - {-0.941544065183020778412509403}, { 0.336889853392220050689253213}, - { 0.970031253194543992603984207}, { 0.242980179903263889948274162}, - {-0.242980179903263889948274162}, { 0.970031253194543992603984207}, - { 0.514102744193221726593693839}, { 0.857728610000272069902269984}, - {-0.857728610000272069902269984}, { 0.514102744193221726593693839}, - { 0.803207531480644909806676513}, { 0.595699304492433343467036529}, - {-0.595699304492433343467036529}, { 0.803207531480644909806676513}, - { 0.146730474455361751658850130}, { 0.989176509964780973451673738}, - {-0.989176509964780973451673738}, { 0.146730474455361751658850130}, - { 0.989176509964780973451673738}, { 0.146730474455361751658850130}, - {-0.146730474455361751658850130}, { 0.989176509964780973451673738}, - { 0.595699304492433343467036529}, { 0.803207531480644909806676513}, - {-0.803207531480644909806676513}, { 0.595699304492433343467036529}, - { 0.857728610000272069902269984}, { 0.514102744193221726593693839}, - {-0.514102744193221726593693839}, { 0.857728610000272069902269984}, - { 0.242980179903263889948274162}, { 0.970031253194543992603984207}, - {-0.970031253194543992603984207}, { 0.242980179903263889948274162}, - { 0.941544065183020778412509403}, { 0.336889853392220050689253213}, - {-0.336889853392220050689253213}, { 0.941544065183020778412509403}, - { 0.427555093430282094320966857}, { 0.903989293123443331586200297}, - {-0.903989293123443331586200297}, { 0.427555093430282094320966857}, - { 0.740951125354959091175616897}, { 0.671558954847018400625376850}, - {-0.671558954847018400625376850}, { 0.740951125354959091175616897}, - { 0.049067674327418014254954977}, { 0.998795456205172392714771605}, - {-0.998795456205172392714771605}, { 0.049067674327418014254954977}, - { 0.999698818696204220115765650}, { 0.024541228522912288031734529}, - {-0.024541228522912288031734529}, { 0.999698818696204220115765650}, - { 0.689540544737066924616730630}, { 0.724247082951466920941069243}, - 
{-0.724247082951466920941069243}, { 0.689540544737066924616730630}, - { 0.914209755703530654635014829}, { 0.405241314004989870908481306}, - {-0.405241314004989870908481306}, { 0.914209755703530654635014829}, - { 0.359895036534988148775104572}, { 0.932992798834738887711660256}, - {-0.932992798834738887711660256}, { 0.359895036534988148775104572}, - { 0.975702130038528544460395766}, { 0.219101240156869797227737547}, - {-0.219101240156869797227737547}, { 0.975702130038528544460395766}, - { 0.534997619887097210663076905}, { 0.844853565249707073259571205}, - {-0.844853565249707073259571205}, { 0.534997619887097210663076905}, - { 0.817584813151583696504920884}, { 0.575808191417845300745972454}, - {-0.575808191417845300745972454}, { 0.817584813151583696504920884}, - { 0.170961888760301226363642357}, { 0.985277642388941244774018433}, - {-0.985277642388941244774018433}, { 0.170961888760301226363642357}, - { 0.992479534598709998156767252}, { 0.122410675199216198498704474}, - {-0.122410675199216198498704474}, { 0.992479534598709998156767252}, - { 0.615231590580626845484913563}, { 0.788346427626606262009164705}, - {-0.788346427626606262009164705}, { 0.615231590580626845484913563}, - { 0.870086991108711418652292404}, { 0.492898192229784036873026689}, - {-0.492898192229784036873026689}, { 0.870086991108711418652292404}, - { 0.266712757474898386325286515}, { 0.963776065795439866686464356}, - {-0.963776065795439866686464356}, { 0.266712757474898386325286515}, - { 0.949528180593036667195936074}, { 0.313681740398891476656478846}, - {-0.313681740398891476656478846}, { 0.949528180593036667195936074}, - { 0.449611329654606600046294579}, { 0.893224301195515320342416447}, - {-0.893224301195515320342416447}, { 0.449611329654606600046294579}, - { 0.757208846506484547575464054}, { 0.653172842953776764084203014}, - {-0.653172842953776764084203014}, { 0.757208846506484547575464054}, - { 0.073564563599667423529465622}, { 0.997290456678690216135597140}, - {-0.997290456678690216135597140}, { 
0.073564563599667423529465622}, - { 0.997290456678690216135597140}, { 0.073564563599667423529465622}, - {-0.073564563599667423529465622}, { 0.997290456678690216135597140}, - { 0.653172842953776764084203014}, { 0.757208846506484547575464054}, - {-0.757208846506484547575464054}, { 0.653172842953776764084203014}, - { 0.893224301195515320342416447}, { 0.449611329654606600046294579}, - {-0.449611329654606600046294579}, { 0.893224301195515320342416447}, - { 0.313681740398891476656478846}, { 0.949528180593036667195936074}, - {-0.949528180593036667195936074}, { 0.313681740398891476656478846}, - { 0.963776065795439866686464356}, { 0.266712757474898386325286515}, - {-0.266712757474898386325286515}, { 0.963776065795439866686464356}, - { 0.492898192229784036873026689}, { 0.870086991108711418652292404}, - {-0.870086991108711418652292404}, { 0.492898192229784036873026689}, - { 0.788346427626606262009164705}, { 0.615231590580626845484913563}, - {-0.615231590580626845484913563}, { 0.788346427626606262009164705}, - { 0.122410675199216198498704474}, { 0.992479534598709998156767252}, - {-0.992479534598709998156767252}, { 0.122410675199216198498704474}, - { 0.985277642388941244774018433}, { 0.170961888760301226363642357}, - {-0.170961888760301226363642357}, { 0.985277642388941244774018433}, - { 0.575808191417845300745972454}, { 0.817584813151583696504920884}, - {-0.817584813151583696504920884}, { 0.575808191417845300745972454}, - { 0.844853565249707073259571205}, { 0.534997619887097210663076905}, - {-0.534997619887097210663076905}, { 0.844853565249707073259571205}, - { 0.219101240156869797227737547}, { 0.975702130038528544460395766}, - {-0.975702130038528544460395766}, { 0.219101240156869797227737547}, - { 0.932992798834738887711660256}, { 0.359895036534988148775104572}, - {-0.359895036534988148775104572}, { 0.932992798834738887711660256}, - { 0.405241314004989870908481306}, { 0.914209755703530654635014829}, - {-0.914209755703530654635014829}, { 0.405241314004989870908481306}, - { 
0.724247082951466920941069243}, { 0.689540544737066924616730630}, - {-0.689540544737066924616730630}, { 0.724247082951466920941069243}, - { 0.024541228522912288031734529}, { 0.999698818696204220115765650}, - {-0.999698818696204220115765650}, { 0.024541228522912288031734529}, - { 0.999924701839144540921646491}, { 0.012271538285719926079408262}, - {-0.012271538285719926079408262}, { 0.999924701839144540921646491}, - { 0.698376249408972853554813503}, { 0.715730825283818654125532623}, - {-0.715730825283818654125532623}, { 0.698376249408972853554813503}, - { 0.919113851690057743908477789}, { 0.393992040061048108596188661}, - {-0.393992040061048108596188661}, { 0.919113851690057743908477789}, - { 0.371317193951837543411934967}, { 0.928506080473215565937167396}, - {-0.928506080473215565937167396}, { 0.371317193951837543411934967}, - { 0.978317370719627633106240097}, { 0.207111376192218549708116020}, - {-0.207111376192218549708116020}, { 0.978317370719627633106240097}, - { 0.545324988422046422313987347}, { 0.838224705554838043186996856}, - {-0.838224705554838043186996856}, { 0.545324988422046422313987347}, - { 0.824589302785025264474803737}, { 0.565731810783613197389765011}, - {-0.565731810783613197389765011}, { 0.824589302785025264474803737}, - { 0.183039887955140958516532578}, { 0.983105487431216327180301155}, - {-0.983105487431216327180301155}, { 0.183039887955140958516532578}, - { 0.993906970002356041546922813}, { 0.110222207293883058807899140}, - {-0.110222207293883058807899140}, { 0.993906970002356041546922813}, - { 0.624859488142386377084072816}, { 0.780737228572094478301588484}, - {-0.780737228572094478301588484}, { 0.624859488142386377084072816}, - { 0.876070094195406607095844268}, { 0.482183772079122748517344481}, - {-0.482183772079122748517344481}, { 0.876070094195406607095844268}, - { 0.278519689385053105207848526}, { 0.960430519415565811199035138}, - {-0.960430519415565811199035138}, { 0.278519689385053105207848526}, - { 0.953306040354193836916740383}, { 
0.302005949319228067003463232}, - {-0.302005949319228067003463232}, { 0.953306040354193836916740383}, - { 0.460538710958240023633181487}, { 0.887639620402853947760181617}, - {-0.887639620402853947760181617}, { 0.460538710958240023633181487}, - { 0.765167265622458925888815999}, { 0.643831542889791465068086063}, - {-0.643831542889791465068086063}, { 0.765167265622458925888815999}, - { 0.085797312344439890461556332}, { 0.996312612182778012627226190}, - {-0.996312612182778012627226190}, { 0.085797312344439890461556332}, - { 0.998118112900149207125155861}, { 0.061320736302208577782614593}, - {-0.061320736302208577782614593}, { 0.998118112900149207125155861}, - { 0.662415777590171761113069817}, { 0.749136394523459325469203257}, - {-0.749136394523459325469203257}, { 0.662415777590171761113069817}, - { 0.898674465693953843041976744}, { 0.438616238538527637647025738}, - {-0.438616238538527637647025738}, { 0.898674465693953843041976744}, - { 0.325310292162262934135954708}, { 0.945607325380521325730945387}, - {-0.945607325380521325730945387}, { 0.325310292162262934135954708}, - { 0.966976471044852109087220226}, { 0.254865659604514571553980779}, - {-0.254865659604514571553980779}, { 0.966976471044852109087220226}, - { 0.503538383725717558691867071}, { 0.863972856121586737918147054}, - {-0.863972856121586737918147054}, { 0.503538383725717558691867071}, - { 0.795836904608883536262791915}, { 0.605511041404325513920626941}, - {-0.605511041404325513920626941}, { 0.795836904608883536262791915}, - { 0.134580708507126186316358409}, { 0.990902635427780025108237011}, - {-0.990902635427780025108237011}, { 0.134580708507126186316358409}, - { 0.987301418157858382399815802}, { 0.158858143333861441684385360}, - {-0.158858143333861441684385360}, { 0.987301418157858382399815802}, - { 0.585797857456438860328080838}, { 0.810457198252594791726703434}, - {-0.810457198252594791726703434}, { 0.585797857456438860328080838}, - { 0.851355193105265142261290312}, { 0.524589682678468906215098464}, - 
{-0.524589682678468906215098464}, { 0.851355193105265142261290312}, - { 0.231058108280671119643236018}, { 0.972939952205560145467720114}, - {-0.972939952205560145467720114}, { 0.231058108280671119643236018}, - { 0.937339011912574923201899593}, { 0.348418680249434568419308588}, - {-0.348418680249434568419308588}, { 0.937339011912574923201899593}, - { 0.416429560097637182562598911}, { 0.909167983090522376563884788}, - {-0.909167983090522376563884788}, { 0.416429560097637182562598911}, - { 0.732654271672412834615546649}, { 0.680600997795453050594430464}, - {-0.680600997795453050594430464}, { 0.732654271672412834615546649}, - { 0.036807222941358832324332691}, { 0.999322384588349500896221011}, - {-0.999322384588349500896221011}, { 0.036807222941358832324332691}, - { 0.999322384588349500896221011}, { 0.036807222941358832324332691}, - {-0.036807222941358832324332691}, { 0.999322384588349500896221011}, - { 0.680600997795453050594430464}, { 0.732654271672412834615546649}, - {-0.732654271672412834615546649}, { 0.680600997795453050594430464}, - { 0.909167983090522376563884788}, { 0.416429560097637182562598911}, - {-0.416429560097637182562598911}, { 0.909167983090522376563884788}, - { 0.348418680249434568419308588}, { 0.937339011912574923201899593}, - {-0.937339011912574923201899593}, { 0.348418680249434568419308588}, - { 0.972939952205560145467720114}, { 0.231058108280671119643236018}, - {-0.231058108280671119643236018}, { 0.972939952205560145467720114}, - { 0.524589682678468906215098464}, { 0.851355193105265142261290312}, - {-0.851355193105265142261290312}, { 0.524589682678468906215098464}, - { 0.810457198252594791726703434}, { 0.585797857456438860328080838}, - {-0.585797857456438860328080838}, { 0.810457198252594791726703434}, - { 0.158858143333861441684385360}, { 0.987301418157858382399815802}, - {-0.987301418157858382399815802}, { 0.158858143333861441684385360}, - { 0.990902635427780025108237011}, { 0.134580708507126186316358409}, - {-0.134580708507126186316358409}, { 
0.990902635427780025108237011}, - { 0.605511041404325513920626941}, { 0.795836904608883536262791915}, - {-0.795836904608883536262791915}, { 0.605511041404325513920626941}, - { 0.863972856121586737918147054}, { 0.503538383725717558691867071}, - {-0.503538383725717558691867071}, { 0.863972856121586737918147054}, - { 0.254865659604514571553980779}, { 0.966976471044852109087220226}, - {-0.966976471044852109087220226}, { 0.254865659604514571553980779}, - { 0.945607325380521325730945387}, { 0.325310292162262934135954708}, - {-0.325310292162262934135954708}, { 0.945607325380521325730945387}, - { 0.438616238538527637647025738}, { 0.898674465693953843041976744}, - {-0.898674465693953843041976744}, { 0.438616238538527637647025738}, - { 0.749136394523459325469203257}, { 0.662415777590171761113069817}, - {-0.662415777590171761113069817}, { 0.749136394523459325469203257}, - { 0.061320736302208577782614593}, { 0.998118112900149207125155861}, - {-0.998118112900149207125155861}, { 0.061320736302208577782614593}, - { 0.996312612182778012627226190}, { 0.085797312344439890461556332}, - {-0.085797312344439890461556332}, { 0.996312612182778012627226190}, - { 0.643831542889791465068086063}, { 0.765167265622458925888815999}, - {-0.765167265622458925888815999}, { 0.643831542889791465068086063}, - { 0.887639620402853947760181617}, { 0.460538710958240023633181487}, - {-0.460538710958240023633181487}, { 0.887639620402853947760181617}, - { 0.302005949319228067003463232}, { 0.953306040354193836916740383}, - {-0.953306040354193836916740383}, { 0.302005949319228067003463232}, - { 0.960430519415565811199035138}, { 0.278519689385053105207848526}, - {-0.278519689385053105207848526}, { 0.960430519415565811199035138}, - { 0.482183772079122748517344481}, { 0.876070094195406607095844268}, - {-0.876070094195406607095844268}, { 0.482183772079122748517344481}, - { 0.780737228572094478301588484}, { 0.624859488142386377084072816}, - {-0.624859488142386377084072816}, { 0.780737228572094478301588484}, - { 
0.110222207293883058807899140}, { 0.993906970002356041546922813}, - {-0.993906970002356041546922813}, { 0.110222207293883058807899140}, - { 0.983105487431216327180301155}, { 0.183039887955140958516532578}, - {-0.183039887955140958516532578}, { 0.983105487431216327180301155}, - { 0.565731810783613197389765011}, { 0.824589302785025264474803737}, - {-0.824589302785025264474803737}, { 0.565731810783613197389765011}, - { 0.838224705554838043186996856}, { 0.545324988422046422313987347}, - {-0.545324988422046422313987347}, { 0.838224705554838043186996856}, - { 0.207111376192218549708116020}, { 0.978317370719627633106240097}, - {-0.978317370719627633106240097}, { 0.207111376192218549708116020}, - { 0.928506080473215565937167396}, { 0.371317193951837543411934967}, - {-0.371317193951837543411934967}, { 0.928506080473215565937167396}, - { 0.393992040061048108596188661}, { 0.919113851690057743908477789}, - {-0.919113851690057743908477789}, { 0.393992040061048108596188661}, - { 0.715730825283818654125532623}, { 0.698376249408972853554813503}, - {-0.698376249408972853554813503}, { 0.715730825283818654125532623}, - { 0.012271538285719926079408262}, { 0.999924701839144540921646491}, - {-0.999924701839144540921646491}, { 0.012271538285719926079408262}, - { 0.999981175282601142656990438}, { 0.006135884649154475359640235}, - {-0.006135884649154475359640235}, { 0.999981175282601142656990438}, - { 0.702754744457225302452914421}, { 0.711432195745216441522130290}, - {-0.711432195745216441522130290}, { 0.702754744457225302452914421}, - { 0.921514039342041943465396332}, { 0.388345046698826291624993541}, - {-0.388345046698826291624993541}, { 0.921514039342041943465396332}, - { 0.377007410216418256726567823}, { 0.926210242138311341974793388}, - {-0.926210242138311341974793388}, { 0.377007410216418256726567823}, - { 0.979569765685440534439326110}, { 0.201104634842091911558443546}, - {-0.201104634842091911558443546}, { 0.979569765685440534439326110}, - { 0.550457972936604802977289893}, { 
0.834862874986380056304401383}, - {-0.834862874986380056304401383}, { 0.550457972936604802977289893}, - { 0.828045045257755752067527592}, { 0.560661576197336023839710223}, - {-0.560661576197336023839710223}, { 0.828045045257755752067527592}, - { 0.189068664149806212754997837}, { 0.981963869109555264072848154}, - {-0.981963869109555264072848154}, { 0.189068664149806212754997837}, - { 0.994564570734255452119106243}, { 0.104121633872054579120943880}, - {-0.104121633872054579120943880}, { 0.994564570734255452119106243}, - { 0.629638238914927025372981341}, { 0.776888465673232450040827983}, - {-0.776888465673232450040827983}, { 0.629638238914927025372981341}, - { 0.879012226428633477831323711}, { 0.476799230063322133342158117}, - {-0.476799230063322133342158117}, { 0.879012226428633477831323711}, - { 0.284407537211271843618310615}, { 0.958703474895871555374645792}, - {-0.958703474895871555374645792}, { 0.284407537211271843618310615}, - { 0.955141168305770721498157712}, { 0.296150888243623824121786128}, - {-0.296150888243623824121786128}, { 0.955141168305770721498157712}, - { 0.465976495767966177902756065}, { 0.884797098430937780104007041}, - {-0.884797098430937780104007041}, { 0.465976495767966177902756065}, - { 0.769103337645579639346626069}, { 0.639124444863775743801488193}, - {-0.639124444863775743801488193}, { 0.769103337645579639346626069}, - { 0.091908956497132728624990979}, { 0.995767414467659793982495643}, - {-0.995767414467659793982495643}, { 0.091908956497132728624990979}, - { 0.998475580573294752208559038}, { 0.055195244349689939809447526}, - {-0.055195244349689939809447526}, { 0.998475580573294752208559038}, - { 0.666999922303637506650154222}, { 0.745057785441465962407907310}, - {-0.745057785441465962407907310}, { 0.666999922303637506650154222}, - { 0.901348847046022014570746093}, { 0.433093818853151968484222638}, - {-0.433093818853151968484222638}, { 0.901348847046022014570746093}, - { 0.331106305759876401737190737}, { 0.943593458161960361495301445}, - 
{-0.943593458161960361495301445}, { 0.331106305759876401737190737}, - { 0.968522094274417316221088329}, { 0.248927605745720168110682816}, - {-0.248927605745720168110682816}, { 0.968522094274417316221088329}, - { 0.508830142543107036931749324}, { 0.860866938637767279344583877}, - {-0.860866938637767279344583877}, { 0.508830142543107036931749324}, - { 0.799537269107905033500246232}, { 0.600616479383868926653875896}, - {-0.600616479383868926653875896}, { 0.799537269107905033500246232}, - { 0.140658239332849230714788846}, { 0.990058210262297105505906464}, - {-0.990058210262297105505906464}, { 0.140658239332849230714788846}, - { 0.988257567730749491404792538}, { 0.152797185258443427720336613}, - {-0.152797185258443427720336613}, { 0.988257567730749491404792538}, - { 0.590759701858874228423887908}, { 0.806847553543799272206514313}, - {-0.806847553543799272206514313}, { 0.590759701858874228423887908}, - { 0.854557988365400520767862276}, { 0.519355990165589587361829932}, - {-0.519355990165589587361829932}, { 0.854557988365400520767862276}, - { 0.237023605994367206867735915}, { 0.971503890986251775537099622}, - {-0.971503890986251775537099622}, { 0.237023605994367206867735915}, - { 0.939459223602189911962669246}, { 0.342660717311994397592781983}, - {-0.342660717311994397592781983}, { 0.939459223602189911962669246}, - { 0.422000270799799685941287941}, { 0.906595704514915365332960588}, - {-0.906595704514915365332960588}, { 0.422000270799799685941287941}, - { 0.736816568877369875090132520}, { 0.676092703575315960360419228}, - {-0.676092703575315960360419228}, { 0.736816568877369875090132520}, - { 0.042938256934940823077124540}, { 0.999077727752645382888781997}, - {-0.999077727752645382888781997}, { 0.042938256934940823077124540}, - { 0.999529417501093163079703322}, { 0.030674803176636625934021028}, - {-0.030674803176636625934021028}, { 0.999529417501093163079703322}, - { 0.685083667772700381362052545}, { 0.728464390448225196492035438}, - {-0.728464390448225196492035438}, { 
0.685083667772700381362052545}, - { 0.911706032005429851404397325}, { 0.410843171057903942183466675}, - {-0.410843171057903942183466675}, { 0.911706032005429851404397325}, - { 0.354163525420490382357395796}, { 0.935183509938947577642207480}, - {-0.935183509938947577642207480}, { 0.354163525420490382357395796}, - { 0.974339382785575860518721668}, { 0.225083911359792835991642120}, - {-0.225083911359792835991642120}, { 0.974339382785575860518721668}, - { 0.529803624686294668216054671}, { 0.848120344803297251279133563}, - {-0.848120344803297251279133563}, { 0.529803624686294668216054671}, - { 0.814036329705948361654516690}, { 0.580813958095764545075595272}, - {-0.580813958095764545075595272}, { 0.814036329705948361654516690}, - { 0.164913120489969921418189113}, { 0.986308097244598647863297524}, - {-0.986308097244598647863297524}, { 0.164913120489969921418189113}, - { 0.991709753669099522860049931}, { 0.128498110793793172624415589}, - {-0.128498110793793172624415589}, { 0.991709753669099522860049931}, - { 0.610382806276309452716352152}, { 0.792106577300212351782342879}, - {-0.792106577300212351782342879}, { 0.610382806276309452716352152}, - { 0.867046245515692651480195629}, { 0.498227666972781852410983869}, - {-0.498227666972781852410983869}, { 0.867046245515692651480195629}, - { 0.260794117915275518280186509}, { 0.965394441697689374550843858}, - {-0.965394441697689374550843858}, { 0.260794117915275518280186509}, - { 0.947585591017741134653387321}, { 0.319502030816015677901518272}, - {-0.319502030816015677901518272}, { 0.947585591017741134653387321}, - { 0.444122144570429231642069418}, { 0.895966249756185155914560282}, - {-0.895966249756185155914560282}, { 0.444122144570429231642069418}, - { 0.753186799043612482483430486}, { 0.657806693297078656931182264}, - {-0.657806693297078656931182264}, { 0.753186799043612482483430486}, - { 0.067443919563664057897972422}, { 0.997723066644191609848546728}, - {-0.997723066644191609848546728}, { 0.067443919563664057897972422}, - { 
0.996820299291165714972629398}, { 0.079682437971430121147120656}, - {-0.079682437971430121147120656}, { 0.996820299291165714972629398}, - { 0.648514401022112445084560551}, { 0.761202385484261814029709836}, - {-0.761202385484261814029709836}, { 0.648514401022112445084560551}, - { 0.890448723244757889952150560}, { 0.455083587126343823535869268}, - {-0.455083587126343823535869268}, { 0.890448723244757889952150560}, - { 0.307849640041534893682063646}, { 0.951435020969008369549175569}, - {-0.951435020969008369549175569}, { 0.307849640041534893682063646}, - { 0.962121404269041595429604316}, { 0.272621355449948984493347477}, - {-0.272621355449948984493347477}, { 0.962121404269041595429604316}, - { 0.487550160148435954641485027}, { 0.873094978418290098636085973}, - {-0.873094978418290098636085973}, { 0.487550160148435954641485027}, - { 0.784556597155575233023892575}, { 0.620057211763289178646268191}, - {-0.620057211763289178646268191}, { 0.784556597155575233023892575}, - { 0.116318630911904767252544319}, { 0.993211949234794533104601012}, - {-0.993211949234794533104601012}, { 0.116318630911904767252544319}, - { 0.984210092386929073193874387}, { 0.177004220412148756196839844}, - {-0.177004220412148756196839844}, { 0.984210092386929073193874387}, - { 0.570780745886967280232652864}, { 0.821102514991104679060430820}, - {-0.821102514991104679060430820}, { 0.570780745886967280232652864}, - { 0.841554977436898409603499520}, { 0.540171472729892881297845480}, - {-0.540171472729892881297845480}, { 0.841554977436898409603499520}, - { 0.213110319916091373967757518}, { 0.977028142657754351485866211}, - {-0.977028142657754351485866211}, { 0.213110319916091373967757518}, - { 0.930766961078983731944872340}, { 0.365612997804773870011745909}, - {-0.365612997804773870011745909}, { 0.930766961078983731944872340}, - { 0.399624199845646828544117031}, { 0.916679059921042663116457013}, - {-0.916679059921042663116457013}, { 0.399624199845646828544117031}, - { 0.720002507961381629076682999}, { 
0.693971460889654009003734389}, - {-0.693971460889654009003734389}, { 0.720002507961381629076682999}, - { 0.018406729905804820927366313}, { 0.999830581795823422015722275}, - {-0.999830581795823422015722275}, { 0.018406729905804820927366313}, - { 0.999830581795823422015722275}, { 0.018406729905804820927366313}, - {-0.018406729905804820927366313}, { 0.999830581795823422015722275}, - { 0.693971460889654009003734389}, { 0.720002507961381629076682999}, - {-0.720002507961381629076682999}, { 0.693971460889654009003734389}, - { 0.916679059921042663116457013}, { 0.399624199845646828544117031}, - {-0.399624199845646828544117031}, { 0.916679059921042663116457013}, - { 0.365612997804773870011745909}, { 0.930766961078983731944872340}, - {-0.930766961078983731944872340}, { 0.365612997804773870011745909}, - { 0.977028142657754351485866211}, { 0.213110319916091373967757518}, - {-0.213110319916091373967757518}, { 0.977028142657754351485866211}, - { 0.540171472729892881297845480}, { 0.841554977436898409603499520}, - {-0.841554977436898409603499520}, { 0.540171472729892881297845480}, - { 0.821102514991104679060430820}, { 0.570780745886967280232652864}, - {-0.570780745886967280232652864}, { 0.821102514991104679060430820}, - { 0.177004220412148756196839844}, { 0.984210092386929073193874387}, - {-0.984210092386929073193874387}, { 0.177004220412148756196839844}, - { 0.993211949234794533104601012}, { 0.116318630911904767252544319}, - {-0.116318630911904767252544319}, { 0.993211949234794533104601012}, - { 0.620057211763289178646268191}, { 0.784556597155575233023892575}, - {-0.784556597155575233023892575}, { 0.620057211763289178646268191}, - { 0.873094978418290098636085973}, { 0.487550160148435954641485027}, - {-0.487550160148435954641485027}, { 0.873094978418290098636085973}, - { 0.272621355449948984493347477}, { 0.962121404269041595429604316}, - {-0.962121404269041595429604316}, { 0.272621355449948984493347477}, - { 0.951435020969008369549175569}, { 0.307849640041534893682063646}, - 
{-0.307849640041534893682063646}, { 0.951435020969008369549175569}, - { 0.455083587126343823535869268}, { 0.890448723244757889952150560}, - {-0.890448723244757889952150560}, { 0.455083587126343823535869268}, - { 0.761202385484261814029709836}, { 0.648514401022112445084560551}, - {-0.648514401022112445084560551}, { 0.761202385484261814029709836}, - { 0.079682437971430121147120656}, { 0.996820299291165714972629398}, - {-0.996820299291165714972629398}, { 0.079682437971430121147120656}, - { 0.997723066644191609848546728}, { 0.067443919563664057897972422}, - {-0.067443919563664057897972422}, { 0.997723066644191609848546728}, - { 0.657806693297078656931182264}, { 0.753186799043612482483430486}, - {-0.753186799043612482483430486}, { 0.657806693297078656931182264}, - { 0.895966249756185155914560282}, { 0.444122144570429231642069418}, - {-0.444122144570429231642069418}, { 0.895966249756185155914560282}, - { 0.319502030816015677901518272}, { 0.947585591017741134653387321}, - {-0.947585591017741134653387321}, { 0.319502030816015677901518272}, - { 0.965394441697689374550843858}, { 0.260794117915275518280186509}, - {-0.260794117915275518280186509}, { 0.965394441697689374550843858}, - { 0.498227666972781852410983869}, { 0.867046245515692651480195629}, - {-0.867046245515692651480195629}, { 0.498227666972781852410983869}, - { 0.792106577300212351782342879}, { 0.610382806276309452716352152}, - {-0.610382806276309452716352152}, { 0.792106577300212351782342879}, - { 0.128498110793793172624415589}, { 0.991709753669099522860049931}, - {-0.991709753669099522860049931}, { 0.128498110793793172624415589}, - { 0.986308097244598647863297524}, { 0.164913120489969921418189113}, - {-0.164913120489969921418189113}, { 0.986308097244598647863297524}, - { 0.580813958095764545075595272}, { 0.814036329705948361654516690}, - {-0.814036329705948361654516690}, { 0.580813958095764545075595272}, - { 0.848120344803297251279133563}, { 0.529803624686294668216054671}, - {-0.529803624686294668216054671}, { 
0.848120344803297251279133563}, - { 0.225083911359792835991642120}, { 0.974339382785575860518721668}, - {-0.974339382785575860518721668}, { 0.225083911359792835991642120}, - { 0.935183509938947577642207480}, { 0.354163525420490382357395796}, - {-0.354163525420490382357395796}, { 0.935183509938947577642207480}, - { 0.410843171057903942183466675}, { 0.911706032005429851404397325}, - {-0.911706032005429851404397325}, { 0.410843171057903942183466675}, - { 0.728464390448225196492035438}, { 0.685083667772700381362052545}, - {-0.685083667772700381362052545}, { 0.728464390448225196492035438}, - { 0.030674803176636625934021028}, { 0.999529417501093163079703322}, - {-0.999529417501093163079703322}, { 0.030674803176636625934021028}, - { 0.999077727752645382888781997}, { 0.042938256934940823077124540}, - {-0.042938256934940823077124540}, { 0.999077727752645382888781997}, - { 0.676092703575315960360419228}, { 0.736816568877369875090132520}, - {-0.736816568877369875090132520}, { 0.676092703575315960360419228}, - { 0.906595704514915365332960588}, { 0.422000270799799685941287941}, - {-0.422000270799799685941287941}, { 0.906595704514915365332960588}, - { 0.342660717311994397592781983}, { 0.939459223602189911962669246}, - {-0.939459223602189911962669246}, { 0.342660717311994397592781983}, - { 0.971503890986251775537099622}, { 0.237023605994367206867735915}, - {-0.237023605994367206867735915}, { 0.971503890986251775537099622}, - { 0.519355990165589587361829932}, { 0.854557988365400520767862276}, - {-0.854557988365400520767862276}, { 0.519355990165589587361829932}, - { 0.806847553543799272206514313}, { 0.590759701858874228423887908}, - {-0.590759701858874228423887908}, { 0.806847553543799272206514313}, - { 0.152797185258443427720336613}, { 0.988257567730749491404792538}, - {-0.988257567730749491404792538}, { 0.152797185258443427720336613}, - { 0.990058210262297105505906464}, { 0.140658239332849230714788846}, - {-0.140658239332849230714788846}, { 0.990058210262297105505906464}, - { 
0.600616479383868926653875896}, { 0.799537269107905033500246232}, - {-0.799537269107905033500246232}, { 0.600616479383868926653875896}, - { 0.860866938637767279344583877}, { 0.508830142543107036931749324}, - {-0.508830142543107036931749324}, { 0.860866938637767279344583877}, - { 0.248927605745720168110682816}, { 0.968522094274417316221088329}, - {-0.968522094274417316221088329}, { 0.248927605745720168110682816}, - { 0.943593458161960361495301445}, { 0.331106305759876401737190737}, - {-0.331106305759876401737190737}, { 0.943593458161960361495301445}, - { 0.433093818853151968484222638}, { 0.901348847046022014570746093}, - {-0.901348847046022014570746093}, { 0.433093818853151968484222638}, - { 0.745057785441465962407907310}, { 0.666999922303637506650154222}, - {-0.666999922303637506650154222}, { 0.745057785441465962407907310}, - { 0.055195244349689939809447526}, { 0.998475580573294752208559038}, - {-0.998475580573294752208559038}, { 0.055195244349689939809447526}, - { 0.995767414467659793982495643}, { 0.091908956497132728624990979}, - {-0.091908956497132728624990979}, { 0.995767414467659793982495643}, - { 0.639124444863775743801488193}, { 0.769103337645579639346626069}, - {-0.769103337645579639346626069}, { 0.639124444863775743801488193}, - { 0.884797098430937780104007041}, { 0.465976495767966177902756065}, - {-0.465976495767966177902756065}, { 0.884797098430937780104007041}, - { 0.296150888243623824121786128}, { 0.955141168305770721498157712}, - {-0.955141168305770721498157712}, { 0.296150888243623824121786128}, - { 0.958703474895871555374645792}, { 0.284407537211271843618310615}, - {-0.284407537211271843618310615}, { 0.958703474895871555374645792}, - { 0.476799230063322133342158117}, { 0.879012226428633477831323711}, - {-0.879012226428633477831323711}, { 0.476799230063322133342158117}, - { 0.776888465673232450040827983}, { 0.629638238914927025372981341}, - {-0.629638238914927025372981341}, { 0.776888465673232450040827983}, - { 0.104121633872054579120943880}, { 
0.994564570734255452119106243}, - {-0.994564570734255452119106243}, { 0.104121633872054579120943880}, - { 0.981963869109555264072848154}, { 0.189068664149806212754997837}, - {-0.189068664149806212754997837}, { 0.981963869109555264072848154}, - { 0.560661576197336023839710223}, { 0.828045045257755752067527592}, - {-0.828045045257755752067527592}, { 0.560661576197336023839710223}, - { 0.834862874986380056304401383}, { 0.550457972936604802977289893}, - {-0.550457972936604802977289893}, { 0.834862874986380056304401383}, - { 0.201104634842091911558443546}, { 0.979569765685440534439326110}, - {-0.979569765685440534439326110}, { 0.201104634842091911558443546}, - { 0.926210242138311341974793388}, { 0.377007410216418256726567823}, - {-0.377007410216418256726567823}, { 0.926210242138311341974793388}, - { 0.388345046698826291624993541}, { 0.921514039342041943465396332}, - {-0.921514039342041943465396332}, { 0.388345046698826291624993541}, - { 0.711432195745216441522130290}, { 0.702754744457225302452914421}, - {-0.702754744457225302452914421}, { 0.711432195745216441522130290}, - { 0.006135884649154475359640235}, { 0.999981175282601142656990438}, - {-0.999981175282601142656990438}, { 0.006135884649154475359640235}, - { 0.999995293809576171511580126}, { 0.003067956762965976270145365}, - {-0.003067956762965976270145365}, { 0.999995293809576171511580126}, - { 0.704934080375904908852523758}, { 0.709272826438865651316533772}, - {-0.709272826438865651316533772}, { 0.704934080375904908852523758}, - { 0.922701128333878570437264227}, { 0.385516053843918864075607949}, - {-0.385516053843918864075607949}, { 0.922701128333878570437264227}, - { 0.379847208924051170576281147}, { 0.925049240782677590302371869}, - {-0.925049240782677590302371869}, { 0.379847208924051170576281147}, - { 0.980182135968117392690210009}, { 0.198098410717953586179324918}, - {-0.198098410717953586179324918}, { 0.980182135968117392690210009}, - { 0.553016705580027531764226988}, { 0.833170164701913186439915922}, - 
{-0.833170164701913186439915922}, { 0.553016705580027531764226988}, - { 0.829761233794523042469023765}, { 0.558118531220556115693702964}, - {-0.558118531220556115693702964}, { 0.829761233794523042469023765}, - { 0.192080397049892441679288205}, { 0.981379193313754574318224190}, - {-0.981379193313754574318224190}, { 0.192080397049892441679288205}, - { 0.994879330794805620591166107}, { 0.101069862754827824987887585}, - {-0.101069862754827824987887585}, { 0.994879330794805620591166107}, - { 0.632018735939809021909403706}, { 0.774953106594873878359129282}, - {-0.774953106594873878359129282}, { 0.632018735939809021909403706}, - { 0.880470889052160770806542929}, { 0.474100214650550014398580015}, - {-0.474100214650550014398580015}, { 0.880470889052160770806542929}, - { 0.287347459544729526477331841}, { 0.957826413027532890321037029}, - {-0.957826413027532890321037029}, { 0.287347459544729526477331841}, - { 0.956045251349996443270479823}, { 0.293219162694258650606608599}, - {-0.293219162694258650606608599}, { 0.956045251349996443270479823}, - { 0.468688822035827933697617870}, { 0.883363338665731594736308015}, - {-0.883363338665731594736308015}, { 0.468688822035827933697617870}, - { 0.771060524261813773200605759}, { 0.636761861236284230413943435}, - {-0.636761861236284230413943435}, { 0.771060524261813773200605759}, - { 0.094963495329638998938034312}, { 0.995480755491926941769171600}, - {-0.995480755491926941769171600}, { 0.094963495329638998938034312}, - { 0.998640218180265222418199049}, { 0.052131704680283321236358216}, - {-0.052131704680283321236358216}, { 0.998640218180265222418199049}, - { 0.669282588346636065720696366}, { 0.743007952135121693517362293}, - {-0.743007952135121693517362293}, { 0.669282588346636065720696366}, - { 0.902673318237258806751502391}, { 0.430326481340082633908199031}, - {-0.430326481340082633908199031}, { 0.902673318237258806751502391}, - { 0.333999651442009404650865481}, { 0.942573197601446879280758735}, - {-0.942573197601446879280758735}, { 
0.333999651442009404650865481}, - { 0.969281235356548486048290738}, { 0.245955050335794611599924709}, - {-0.245955050335794611599924709}, { 0.969281235356548486048290738}, - { 0.511468850437970399504391001}, { 0.859301818357008404783582139}, - {-0.859301818357008404783582139}, { 0.511468850437970399504391001}, - { 0.801376171723140219430247777}, { 0.598160706996342311724958652}, - {-0.598160706996342311724958652}, { 0.801376171723140219430247777}, - { 0.143695033150294454819773349}, { 0.989622017463200834623694454}, - {-0.989622017463200834623694454}, { 0.143695033150294454819773349}, - { 0.988721691960323767604516485}, { 0.149764534677321517229695737}, - {-0.149764534677321517229695737}, { 0.988721691960323767604516485}, - { 0.593232295039799808047809426}, { 0.805031331142963597922659282}, - {-0.805031331142963597922659282}, { 0.593232295039799808047809426}, - { 0.856147328375194481019630732}, { 0.516731799017649881508753876}, - {-0.516731799017649881508753876}, { 0.856147328375194481019630732}, - { 0.240003022448741486568922365}, { 0.970772140728950302138169611}, - {-0.970772140728950302138169611}, { 0.240003022448741486568922365}, - { 0.940506070593268323787291309}, { 0.339776884406826857828825803}, - {-0.339776884406826857828825803}, { 0.940506070593268323787291309}, - { 0.424779681209108833357226189}, { 0.905296759318118774354048329}, - {-0.905296759318118774354048329}, { 0.424779681209108833357226189}, - { 0.738887324460615147933116508}, { 0.673829000378756060917568372}, - {-0.673829000378756060917568372}, { 0.738887324460615147933116508}, - { 0.046003182130914628814301788}, { 0.998941293186856850633930266}, - {-0.998941293186856850633930266}, { 0.046003182130914628814301788}, - { 0.999618822495178597116830637}, { 0.027608145778965741612354872}, - {-0.027608145778965741612354872}, { 0.999618822495178597116830637}, - { 0.687315340891759108199186948}, { 0.726359155084345976817494315}, - {-0.726359155084345976817494315}, { 0.687315340891759108199186948}, - { 
0.912962190428398164628018233}, { 0.408044162864978680820747499}, - {-0.408044162864978680820747499}, { 0.912962190428398164628018233}, - { 0.357030961233430032614954036}, { 0.934092550404258914729877883}, - {-0.934092550404258914729877883}, { 0.357030961233430032614954036}, - { 0.975025345066994146844913468}, { 0.222093620973203534094094721}, - {-0.222093620973203534094094721}, { 0.975025345066994146844913468}, - { 0.532403127877197971442805218}, { 0.846490938774052078300544488}, - {-0.846490938774052078300544488}, { 0.532403127877197971442805218}, - { 0.815814410806733789010772660}, { 0.578313796411655563342245019}, - {-0.578313796411655563342245019}, { 0.815814410806733789010772660}, - { 0.167938294974731178054745536}, { 0.985797509167567424700995000}, - {-0.985797509167567424700995000}, { 0.167938294974731178054745536}, - { 0.992099313142191757112085445}, { 0.125454983411546238542336453}, - {-0.125454983411546238542336453}, { 0.992099313142191757112085445}, - { 0.612810082429409703935211936}, { 0.790230221437310055030217152}, - {-0.790230221437310055030217152}, { 0.612810082429409703935211936}, - { 0.868570705971340895340449876}, { 0.495565261825772531150266670}, - {-0.495565261825772531150266670}, { 0.868570705971340895340449876}, - { 0.263754678974831383611349322}, { 0.964589793289812723836432159}, - {-0.964589793289812723836432159}, { 0.263754678974831383611349322}, - { 0.948561349915730288158494826}, { 0.316593375556165867243047035}, - {-0.316593375556165867243047035}, { 0.948561349915730288158494826}, - { 0.446868840162374195353044389}, { 0.894599485631382678433072126}, - {-0.894599485631382678433072126}, { 0.446868840162374195353044389}, - { 0.755201376896536527598710756}, { 0.655492852999615385312679701}, - {-0.655492852999615385312679701}, { 0.755201376896536527598710756}, - { 0.070504573389613863027351471}, { 0.997511456140303459699448390}, - {-0.997511456140303459699448390}, { 0.070504573389613863027351471}, - { 0.997060070339482978987989949}, { 
0.076623861392031492278332463}, - {-0.076623861392031492278332463}, { 0.997060070339482978987989949}, - { 0.650846684996380915068975573}, { 0.759209188978388033485525443}, - {-0.759209188978388033485525443}, { 0.650846684996380915068975573}, - { 0.891840709392342727796478697}, { 0.452349587233770874133026703}, - {-0.452349587233770874133026703}, { 0.891840709392342727796478697}, - { 0.310767152749611495835997250}, { 0.950486073949481721759926101}, - {-0.950486073949481721759926101}, { 0.310767152749611495835997250}, - { 0.962953266873683886347921481}, { 0.269668325572915106525464462}, - {-0.269668325572915106525464462}, { 0.962953266873683886347921481}, - { 0.490226483288291154229598449}, { 0.871595086655951034842481435}, - {-0.871595086655951034842481435}, { 0.490226483288291154229598449}, - { 0.786455213599085757522319464}, { 0.617647307937803932403979402}, - {-0.617647307937803932403979402}, { 0.786455213599085757522319464}, - { 0.119365214810991364593637790}, { 0.992850414459865090793563344}, - {-0.992850414459865090793563344}, { 0.119365214810991364593637790}, - { 0.984748501801904218556553176}, { 0.173983873387463827950700807}, - {-0.173983873387463827950700807}, { 0.984748501801904218556553176}, - { 0.573297166698042212820171239}, { 0.819347520076796960824689637}, - {-0.819347520076796960824689637}, { 0.573297166698042212820171239}, - { 0.843208239641845437161743865}, { 0.537587076295645482502214932}, - {-0.537587076295645482502214932}, { 0.843208239641845437161743865}, - { 0.216106797076219509948385131}, { 0.976369731330021149312732194}, - {-0.976369731330021149312732194}, { 0.216106797076219509948385131}, - { 0.931884265581668106718557199}, { 0.362755724367397216204854462}, - {-0.362755724367397216204854462}, { 0.931884265581668106718557199}, - { 0.402434650859418441082533934}, { 0.915448716088267819566431292}, - {-0.915448716088267819566431292}, { 0.402434650859418441082533934}, - { 0.722128193929215321243607198}, { 0.691759258364157774906734132}, - 
{-0.691759258364157774906734132}, { 0.722128193929215321243607198}, - { 0.021474080275469507418374898}, { 0.999769405351215321657617036}, - {-0.999769405351215321657617036}, { 0.021474080275469507418374898}, - { 0.999882347454212525633049627}, { 0.015339206284988101044151868}, - {-0.015339206284988101044151868}, { 0.999882347454212525633049627}, - { 0.696177131491462944788582591}, { 0.717870045055731736211325329}, - {-0.717870045055731736211325329}, { 0.696177131491462944788582591}, - { 0.917900775621390457642276297}, { 0.396809987416710328595290911}, - {-0.396809987416710328595290911}, { 0.917900775621390457642276297}, - { 0.368466829953372331712746222}, { 0.929640895843181265457918066}, - {-0.929640895843181265457918066}, { 0.368466829953372331712746222}, - { 0.977677357824509979943404762}, { 0.210111836880469621717489972}, - {-0.210111836880469621717489972}, { 0.977677357824509979943404762}, - { 0.542750784864515906586768661}, { 0.839893794195999504583383987}, - {-0.839893794195999504583383987}, { 0.542750784864515906586768661}, - { 0.822849781375826332046780034}, { 0.568258952670131549790548489}, - {-0.568258952670131549790548489}, { 0.822849781375826332046780034}, - { 0.180022901405699522679906590}, { 0.983662419211730274396237776}, - {-0.983662419211730274396237776}, { 0.180022901405699522679906590}, - { 0.993564135520595333782021697}, { 0.113270952177564349018228733}, - {-0.113270952177564349018228733}, { 0.993564135520595333782021697}, - { 0.622461279374149972519166721}, { 0.782650596166575738458949301}, - {-0.782650596166575738458949301}, { 0.622461279374149972519166721}, - { 0.874586652278176112634431897}, { 0.484869248000791101822951699}, - {-0.484869248000791101822951699}, { 0.874586652278176112634431897}, - { 0.275571819310958163076425168}, { 0.961280485811320641748659653}, - {-0.961280485811320641748659653}, { 0.275571819310958163076425168}, - { 0.952375012719765858529893608}, { 0.304929229735402406490728633}, - {-0.304929229735402406490728633}, { 
0.952375012719765858529893608}, - { 0.457813303598877221904961155}, { 0.889048355854664562540777729}, - {-0.889048355854664562540777729}, { 0.457813303598877221904961155}, - { 0.763188417263381271704838297}, { 0.646176012983316364832802220}, - {-0.646176012983316364832802220}, { 0.763188417263381271704838297}, - { 0.082740264549375693111987083}, { 0.996571145790554847093566910}, - {-0.996571145790554847093566910}, { 0.082740264549375693111987083}, - { 0.997925286198596012623025462}, { 0.064382630929857460819324537}, - {-0.064382630929857460819324537}, { 0.997925286198596012623025462}, - { 0.660114342067420478559490747}, { 0.751165131909686411205819422}, - {-0.751165131909686411205819422}, { 0.660114342067420478559490747}, - { 0.897324580705418281231391836}, { 0.441371268731716692879988968}, - {-0.441371268731716692879988968}, { 0.897324580705418281231391836}, - { 0.322407678801069848384807478}, { 0.946600913083283570044599823}, - {-0.946600913083283570044599823}, { 0.322407678801069848384807478}, - { 0.966190003445412555433832961}, { 0.257831102162159005614471295}, - {-0.257831102162159005614471295}, { 0.966190003445412555433832961}, - { 0.500885382611240786241285004}, { 0.865513624090569082825488358}, - {-0.865513624090569082825488358}, { 0.500885382611240786241285004}, - { 0.793975477554337164895083757}, { 0.607949784967773667243642671}, - {-0.607949784967773667243642671}, { 0.793975477554337164895083757}, - { 0.131540028702883111103387493}, { 0.991310859846115418957349799}, - {-0.991310859846115418957349799}, { 0.131540028702883111103387493}, - { 0.986809401814185476970235952}, { 0.161886393780111837641387995}, - {-0.161886393780111837641387995}, { 0.986809401814185476970235952}, - { 0.583308652937698294392830961}, { 0.812250586585203913049744181}, - {-0.812250586585203913049744181}, { 0.583308652937698294392830961}, - { 0.849741768000852489471268395}, { 0.527199134781901348464274575}, - {-0.527199134781901348464274575}, { 0.849741768000852489471268395}, - { 
0.228072083170885739254457379}, { 0.973644249650811925318383912}, - {-0.973644249650811925318383912}, { 0.228072083170885739254457379}, - { 0.936265667170278246576310996}, { 0.351292756085567125601307623}, - {-0.351292756085567125601307623}, { 0.936265667170278246576310996}, - { 0.413638312238434547471944324}, { 0.910441292258067196934095369}, - {-0.910441292258067196934095369}, { 0.413638312238434547471944324}, - { 0.730562769227827561177758850}, { 0.682845546385248068164596123}, - {-0.682845546385248068164596123}, { 0.730562769227827561177758850}, - { 0.033741171851377584833716112}, { 0.999430604555461772019008327}, - {-0.999430604555461772019008327}, { 0.033741171851377584833716112}, - { 0.999204758618363895492950001}, { 0.039872927587739811128578738}, - {-0.039872927587739811128578738}, { 0.999204758618363895492950001}, - { 0.678350043129861486873655042}, { 0.734738878095963464563223604}, - {-0.734738878095963464563223604}, { 0.678350043129861486873655042}, - { 0.907886116487666212038681480}, { 0.419216888363223956433010020}, - {-0.419216888363223956433010020}, { 0.907886116487666212038681480}, - { 0.345541324963989065539191723}, { 0.938403534063108112192420774}, - {-0.938403534063108112192420774}, { 0.345541324963989065539191723}, - { 0.972226497078936305708321144}, { 0.234041958583543423191242045}, - {-0.234041958583543423191242045}, { 0.972226497078936305708321144}, - { 0.521975292937154342694258318}, { 0.852960604930363657746588082}, - {-0.852960604930363657746588082}, { 0.521975292937154342694258318}, - { 0.808656181588174991946968128}, { 0.588281548222645304786439813}, - {-0.588281548222645304786439813}, { 0.808656181588174991946968128}, - { 0.155828397654265235743101486}, { 0.987784141644572154230969032}, - {-0.987784141644572154230969032}, { 0.155828397654265235743101486}, - { 0.990485084256457037998682243}, { 0.137620121586486044948441663}, - {-0.137620121586486044948441663}, { 0.990485084256457037998682243}, - { 0.603066598540348201693430617}, { 
0.797690840943391108362662755}, - {-0.797690840943391108362662755}, { 0.603066598540348201693430617}, - { 0.862423956111040538690933878}, { 0.506186645345155291048942344}, - {-0.506186645345155291048942344}, { 0.862423956111040538690933878}, - { 0.251897818154216950498106628}, { 0.967753837093475465243391912}, - {-0.967753837093475465243391912}, { 0.251897818154216950498106628}, - { 0.944604837261480265659265493}, { 0.328209843579092526107916817}, - {-0.328209843579092526107916817}, { 0.944604837261480265659265493}, - { 0.435857079922255491032544080}, { 0.900015892016160228714535267}, - {-0.900015892016160228714535267}, { 0.435857079922255491032544080}, - { 0.747100605980180144323078847}, { 0.664710978203344868130324985}, - {-0.664710978203344868130324985}, { 0.747100605980180144323078847}, - { 0.058258264500435759613979782}, { 0.998301544933892840738782163}, - {-0.998301544933892840738782163}, { 0.058258264500435759613979782}, - { 0.996044700901251989887944810}, { 0.088853552582524596561586535}, - {-0.088853552582524596561586535}, { 0.996044700901251989887944810}, - { 0.641481012808583151988739898}, { 0.767138911935820381181694573}, - {-0.767138911935820381181694573}, { 0.641481012808583151988739898}, - { 0.886222530148880631647990821}, { 0.463259783551860197390719637}, - {-0.463259783551860197390719637}, { 0.886222530148880631647990821}, - { 0.299079826308040476750336973}, { 0.954228095109105629780430732}, - {-0.954228095109105629780430732}, { 0.299079826308040476750336973}, - { 0.959571513081984528335528181}, { 0.281464937925757984095231007}, - {-0.281464937925757984095231007}, { 0.959571513081984528335528181}, - { 0.479493757660153026679839798}, { 0.877545290207261291668470750}, - {-0.877545290207261291668470750}, { 0.479493757660153026679839798}, - { 0.778816512381475953374724325}, { 0.627251815495144113509622565}, - {-0.627251815495144113509622565}, { 0.778816512381475953374724325}, - { 0.107172424956808849175529148}, { 0.994240449453187946358413442}, - 
{-0.994240449453187946358413442}, { 0.107172424956808849175529148}, - { 0.982539302287441255907040396}, { 0.186055151663446648105438304}, - {-0.186055151663446648105438304}, { 0.982539302287441255907040396}, - { 0.563199344013834115007363772}, { 0.826321062845663480311195452}, - {-0.826321062845663480311195452}, { 0.563199344013834115007363772}, - { 0.836547727223511984524285790}, { 0.547894059173100165608820571}, - {-0.547894059173100165608820571}, { 0.836547727223511984524285790}, - { 0.204108966092816874181696950}, { 0.978948175319062194715480124}, - {-0.978948175319062194715480124}, { 0.204108966092816874181696950}, - { 0.927362525650401087274536959}, { 0.374164062971457997104393020}, - {-0.374164062971457997104393020}, { 0.927362525650401087274536959}, - { 0.391170384302253888687512949}, { 0.920318276709110566440076541}, - {-0.920318276709110566440076541}, { 0.391170384302253888687512949}, - { 0.713584868780793592903125099}, { 0.700568793943248366792866380}, - {-0.700568793943248366792866380}, { 0.713584868780793592903125099}, - { 0.009203754782059819315102378}, { 0.999957644551963866333120920}, - {-0.999957644551963866333120920}, { 0.009203754782059819315102378}, - { 0.999957644551963866333120920}, { 0.009203754782059819315102378}, - {-0.009203754782059819315102378}, { 0.999957644551963866333120920}, - { 0.700568793943248366792866380}, { 0.713584868780793592903125099}, - {-0.713584868780793592903125099}, { 0.700568793943248366792866380}, - { 0.920318276709110566440076541}, { 0.391170384302253888687512949}, - {-0.391170384302253888687512949}, { 0.920318276709110566440076541}, - { 0.374164062971457997104393020}, { 0.927362525650401087274536959}, - {-0.927362525650401087274536959}, { 0.374164062971457997104393020}, - { 0.978948175319062194715480124}, { 0.204108966092816874181696950}, - {-0.204108966092816874181696950}, { 0.978948175319062194715480124}, - { 0.547894059173100165608820571}, { 0.836547727223511984524285790}, - {-0.836547727223511984524285790}, { 
0.547894059173100165608820571}, - { 0.826321062845663480311195452}, { 0.563199344013834115007363772}, - {-0.563199344013834115007363772}, { 0.826321062845663480311195452}, - { 0.186055151663446648105438304}, { 0.982539302287441255907040396}, - {-0.982539302287441255907040396}, { 0.186055151663446648105438304}, - { 0.994240449453187946358413442}, { 0.107172424956808849175529148}, - {-0.107172424956808849175529148}, { 0.994240449453187946358413442}, - { 0.627251815495144113509622565}, { 0.778816512381475953374724325}, - {-0.778816512381475953374724325}, { 0.627251815495144113509622565}, - { 0.877545290207261291668470750}, { 0.479493757660153026679839798}, - {-0.479493757660153026679839798}, { 0.877545290207261291668470750}, - { 0.281464937925757984095231007}, { 0.959571513081984528335528181}, - {-0.959571513081984528335528181}, { 0.281464937925757984095231007}, - { 0.954228095109105629780430732}, { 0.299079826308040476750336973}, - {-0.299079826308040476750336973}, { 0.954228095109105629780430732}, - { 0.463259783551860197390719637}, { 0.886222530148880631647990821}, - {-0.886222530148880631647990821}, { 0.463259783551860197390719637}, - { 0.767138911935820381181694573}, { 0.641481012808583151988739898}, - {-0.641481012808583151988739898}, { 0.767138911935820381181694573}, - { 0.088853552582524596561586535}, { 0.996044700901251989887944810}, - {-0.996044700901251989887944810}, { 0.088853552582524596561586535}, - { 0.998301544933892840738782163}, { 0.058258264500435759613979782}, - {-0.058258264500435759613979782}, { 0.998301544933892840738782163}, - { 0.664710978203344868130324985}, { 0.747100605980180144323078847}, - {-0.747100605980180144323078847}, { 0.664710978203344868130324985}, - { 0.900015892016160228714535267}, { 0.435857079922255491032544080}, - {-0.435857079922255491032544080}, { 0.900015892016160228714535267}, - { 0.328209843579092526107916817}, { 0.944604837261480265659265493}, - {-0.944604837261480265659265493}, { 0.328209843579092526107916817}, - { 
0.967753837093475465243391912}, { 0.251897818154216950498106628}, - {-0.251897818154216950498106628}, { 0.967753837093475465243391912}, - { 0.506186645345155291048942344}, { 0.862423956111040538690933878}, - {-0.862423956111040538690933878}, { 0.506186645345155291048942344}, - { 0.797690840943391108362662755}, { 0.603066598540348201693430617}, - {-0.603066598540348201693430617}, { 0.797690840943391108362662755}, - { 0.137620121586486044948441663}, { 0.990485084256457037998682243}, - {-0.990485084256457037998682243}, { 0.137620121586486044948441663}, - { 0.987784141644572154230969032}, { 0.155828397654265235743101486}, - {-0.155828397654265235743101486}, { 0.987784141644572154230969032}, - { 0.588281548222645304786439813}, { 0.808656181588174991946968128}, - {-0.808656181588174991946968128}, { 0.588281548222645304786439813}, - { 0.852960604930363657746588082}, { 0.521975292937154342694258318}, - {-0.521975292937154342694258318}, { 0.852960604930363657746588082}, - { 0.234041958583543423191242045}, { 0.972226497078936305708321144}, - {-0.972226497078936305708321144}, { 0.234041958583543423191242045}, - { 0.938403534063108112192420774}, { 0.345541324963989065539191723}, - {-0.345541324963989065539191723}, { 0.938403534063108112192420774}, - { 0.419216888363223956433010020}, { 0.907886116487666212038681480}, - {-0.907886116487666212038681480}, { 0.419216888363223956433010020}, - { 0.734738878095963464563223604}, { 0.678350043129861486873655042}, - {-0.678350043129861486873655042}, { 0.734738878095963464563223604}, - { 0.039872927587739811128578738}, { 0.999204758618363895492950001}, - {-0.999204758618363895492950001}, { 0.039872927587739811128578738}, - { 0.999430604555461772019008327}, { 0.033741171851377584833716112}, - {-0.033741171851377584833716112}, { 0.999430604555461772019008327}, - { 0.682845546385248068164596123}, { 0.730562769227827561177758850}, - {-0.730562769227827561177758850}, { 0.682845546385248068164596123}, - { 0.910441292258067196934095369}, { 
0.413638312238434547471944324}, - {-0.413638312238434547471944324}, { 0.910441292258067196934095369}, - { 0.351292756085567125601307623}, { 0.936265667170278246576310996}, - {-0.936265667170278246576310996}, { 0.351292756085567125601307623}, - { 0.973644249650811925318383912}, { 0.228072083170885739254457379}, - {-0.228072083170885739254457379}, { 0.973644249650811925318383912}, - { 0.527199134781901348464274575}, { 0.849741768000852489471268395}, - {-0.849741768000852489471268395}, { 0.527199134781901348464274575}, - { 0.812250586585203913049744181}, { 0.583308652937698294392830961}, - {-0.583308652937698294392830961}, { 0.812250586585203913049744181}, - { 0.161886393780111837641387995}, { 0.986809401814185476970235952}, - {-0.986809401814185476970235952}, { 0.161886393780111837641387995}, - { 0.991310859846115418957349799}, { 0.131540028702883111103387493}, - {-0.131540028702883111103387493}, { 0.991310859846115418957349799}, - { 0.607949784967773667243642671}, { 0.793975477554337164895083757}, - {-0.793975477554337164895083757}, { 0.607949784967773667243642671}, - { 0.865513624090569082825488358}, { 0.500885382611240786241285004}, - {-0.500885382611240786241285004}, { 0.865513624090569082825488358}, - { 0.257831102162159005614471295}, { 0.966190003445412555433832961}, - {-0.966190003445412555433832961}, { 0.257831102162159005614471295}, - { 0.946600913083283570044599823}, { 0.322407678801069848384807478}, - {-0.322407678801069848384807478}, { 0.946600913083283570044599823}, - { 0.441371268731716692879988968}, { 0.897324580705418281231391836}, - {-0.897324580705418281231391836}, { 0.441371268731716692879988968}, - { 0.751165131909686411205819422}, { 0.660114342067420478559490747}, - {-0.660114342067420478559490747}, { 0.751165131909686411205819422}, - { 0.064382630929857460819324537}, { 0.997925286198596012623025462}, - {-0.997925286198596012623025462}, { 0.064382630929857460819324537}, - { 0.996571145790554847093566910}, { 0.082740264549375693111987083}, - 
{-0.082740264549375693111987083}, { 0.996571145790554847093566910}, - { 0.646176012983316364832802220}, { 0.763188417263381271704838297}, - {-0.763188417263381271704838297}, { 0.646176012983316364832802220}, - { 0.889048355854664562540777729}, { 0.457813303598877221904961155}, - {-0.457813303598877221904961155}, { 0.889048355854664562540777729}, - { 0.304929229735402406490728633}, { 0.952375012719765858529893608}, - {-0.952375012719765858529893608}, { 0.304929229735402406490728633}, - { 0.961280485811320641748659653}, { 0.275571819310958163076425168}, - {-0.275571819310958163076425168}, { 0.961280485811320641748659653}, - { 0.484869248000791101822951699}, { 0.874586652278176112634431897}, - {-0.874586652278176112634431897}, { 0.484869248000791101822951699}, - { 0.782650596166575738458949301}, { 0.622461279374149972519166721}, - {-0.622461279374149972519166721}, { 0.782650596166575738458949301}, - { 0.113270952177564349018228733}, { 0.993564135520595333782021697}, - {-0.993564135520595333782021697}, { 0.113270952177564349018228733}, - { 0.983662419211730274396237776}, { 0.180022901405699522679906590}, - {-0.180022901405699522679906590}, { 0.983662419211730274396237776}, - { 0.568258952670131549790548489}, { 0.822849781375826332046780034}, - {-0.822849781375826332046780034}, { 0.568258952670131549790548489}, - { 0.839893794195999504583383987}, { 0.542750784864515906586768661}, - {-0.542750784864515906586768661}, { 0.839893794195999504583383987}, - { 0.210111836880469621717489972}, { 0.977677357824509979943404762}, - {-0.977677357824509979943404762}, { 0.210111836880469621717489972}, - { 0.929640895843181265457918066}, { 0.368466829953372331712746222}, - {-0.368466829953372331712746222}, { 0.929640895843181265457918066}, - { 0.396809987416710328595290911}, { 0.917900775621390457642276297}, - {-0.917900775621390457642276297}, { 0.396809987416710328595290911}, - { 0.717870045055731736211325329}, { 0.696177131491462944788582591}, - {-0.696177131491462944788582591}, { 
0.717870045055731736211325329}, - { 0.015339206284988101044151868}, { 0.999882347454212525633049627}, - {-0.999882347454212525633049627}, { 0.015339206284988101044151868}, - { 0.999769405351215321657617036}, { 0.021474080275469507418374898}, - {-0.021474080275469507418374898}, { 0.999769405351215321657617036}, - { 0.691759258364157774906734132}, { 0.722128193929215321243607198}, - {-0.722128193929215321243607198}, { 0.691759258364157774906734132}, - { 0.915448716088267819566431292}, { 0.402434650859418441082533934}, - {-0.402434650859418441082533934}, { 0.915448716088267819566431292}, - { 0.362755724367397216204854462}, { 0.931884265581668106718557199}, - {-0.931884265581668106718557199}, { 0.362755724367397216204854462}, - { 0.976369731330021149312732194}, { 0.216106797076219509948385131}, - {-0.216106797076219509948385131}, { 0.976369731330021149312732194}, - { 0.537587076295645482502214932}, { 0.843208239641845437161743865}, - {-0.843208239641845437161743865}, { 0.537587076295645482502214932}, - { 0.819347520076796960824689637}, { 0.573297166698042212820171239}, - {-0.573297166698042212820171239}, { 0.819347520076796960824689637}, - { 0.173983873387463827950700807}, { 0.984748501801904218556553176}, - {-0.984748501801904218556553176}, { 0.173983873387463827950700807}, - { 0.992850414459865090793563344}, { 0.119365214810991364593637790}, - {-0.119365214810991364593637790}, { 0.992850414459865090793563344}, - { 0.617647307937803932403979402}, { 0.786455213599085757522319464}, - {-0.786455213599085757522319464}, { 0.617647307937803932403979402}, - { 0.871595086655951034842481435}, { 0.490226483288291154229598449}, - {-0.490226483288291154229598449}, { 0.871595086655951034842481435}, - { 0.269668325572915106525464462}, { 0.962953266873683886347921481}, - {-0.962953266873683886347921481}, { 0.269668325572915106525464462}, - { 0.950486073949481721759926101}, { 0.310767152749611495835997250}, - {-0.310767152749611495835997250}, { 0.950486073949481721759926101}, - { 
0.452349587233770874133026703}, { 0.891840709392342727796478697}, - {-0.891840709392342727796478697}, { 0.452349587233770874133026703}, - { 0.759209188978388033485525443}, { 0.650846684996380915068975573}, - {-0.650846684996380915068975573}, { 0.759209188978388033485525443}, - { 0.076623861392031492278332463}, { 0.997060070339482978987989949}, - {-0.997060070339482978987989949}, { 0.076623861392031492278332463}, - { 0.997511456140303459699448390}, { 0.070504573389613863027351471}, - {-0.070504573389613863027351471}, { 0.997511456140303459699448390}, - { 0.655492852999615385312679701}, { 0.755201376896536527598710756}, - {-0.755201376896536527598710756}, { 0.655492852999615385312679701}, - { 0.894599485631382678433072126}, { 0.446868840162374195353044389}, - {-0.446868840162374195353044389}, { 0.894599485631382678433072126}, - { 0.316593375556165867243047035}, { 0.948561349915730288158494826}, - {-0.948561349915730288158494826}, { 0.316593375556165867243047035}, - { 0.964589793289812723836432159}, { 0.263754678974831383611349322}, - {-0.263754678974831383611349322}, { 0.964589793289812723836432159}, - { 0.495565261825772531150266670}, { 0.868570705971340895340449876}, - {-0.868570705971340895340449876}, { 0.495565261825772531150266670}, - { 0.790230221437310055030217152}, { 0.612810082429409703935211936}, - {-0.612810082429409703935211936}, { 0.790230221437310055030217152}, - { 0.125454983411546238542336453}, { 0.992099313142191757112085445}, - {-0.992099313142191757112085445}, { 0.125454983411546238542336453}, - { 0.985797509167567424700995000}, { 0.167938294974731178054745536}, - {-0.167938294974731178054745536}, { 0.985797509167567424700995000}, - { 0.578313796411655563342245019}, { 0.815814410806733789010772660}, - {-0.815814410806733789010772660}, { 0.578313796411655563342245019}, - { 0.846490938774052078300544488}, { 0.532403127877197971442805218}, - {-0.532403127877197971442805218}, { 0.846490938774052078300544488}, - { 0.222093620973203534094094721}, { 
0.975025345066994146844913468}, - {-0.975025345066994146844913468}, { 0.222093620973203534094094721}, - { 0.934092550404258914729877883}, { 0.357030961233430032614954036}, - {-0.357030961233430032614954036}, { 0.934092550404258914729877883}, - { 0.408044162864978680820747499}, { 0.912962190428398164628018233}, - {-0.912962190428398164628018233}, { 0.408044162864978680820747499}, - { 0.726359155084345976817494315}, { 0.687315340891759108199186948}, - {-0.687315340891759108199186948}, { 0.726359155084345976817494315}, - { 0.027608145778965741612354872}, { 0.999618822495178597116830637}, - {-0.999618822495178597116830637}, { 0.027608145778965741612354872}, - { 0.998941293186856850633930266}, { 0.046003182130914628814301788}, - {-0.046003182130914628814301788}, { 0.998941293186856850633930266}, - { 0.673829000378756060917568372}, { 0.738887324460615147933116508}, - {-0.738887324460615147933116508}, { 0.673829000378756060917568372}, - { 0.905296759318118774354048329}, { 0.424779681209108833357226189}, - {-0.424779681209108833357226189}, { 0.905296759318118774354048329}, - { 0.339776884406826857828825803}, { 0.940506070593268323787291309}, - {-0.940506070593268323787291309}, { 0.339776884406826857828825803}, - { 0.970772140728950302138169611}, { 0.240003022448741486568922365}, - {-0.240003022448741486568922365}, { 0.970772140728950302138169611}, - { 0.516731799017649881508753876}, { 0.856147328375194481019630732}, - {-0.856147328375194481019630732}, { 0.516731799017649881508753876}, - { 0.805031331142963597922659282}, { 0.593232295039799808047809426}, - {-0.593232295039799808047809426}, { 0.805031331142963597922659282}, - { 0.149764534677321517229695737}, { 0.988721691960323767604516485}, - {-0.988721691960323767604516485}, { 0.149764534677321517229695737}, - { 0.989622017463200834623694454}, { 0.143695033150294454819773349}, - {-0.143695033150294454819773349}, { 0.989622017463200834623694454}, - { 0.598160706996342311724958652}, { 0.801376171723140219430247777}, - 
{-0.801376171723140219430247777}, { 0.598160706996342311724958652}, - { 0.859301818357008404783582139}, { 0.511468850437970399504391001}, - {-0.511468850437970399504391001}, { 0.859301818357008404783582139}, - { 0.245955050335794611599924709}, { 0.969281235356548486048290738}, - {-0.969281235356548486048290738}, { 0.245955050335794611599924709}, - { 0.942573197601446879280758735}, { 0.333999651442009404650865481}, - {-0.333999651442009404650865481}, { 0.942573197601446879280758735}, - { 0.430326481340082633908199031}, { 0.902673318237258806751502391}, - {-0.902673318237258806751502391}, { 0.430326481340082633908199031}, - { 0.743007952135121693517362293}, { 0.669282588346636065720696366}, - {-0.669282588346636065720696366}, { 0.743007952135121693517362293}, - { 0.052131704680283321236358216}, { 0.998640218180265222418199049}, - {-0.998640218180265222418199049}, { 0.052131704680283321236358216}, - { 0.995480755491926941769171600}, { 0.094963495329638998938034312}, - {-0.094963495329638998938034312}, { 0.995480755491926941769171600}, - { 0.636761861236284230413943435}, { 0.771060524261813773200605759}, - {-0.771060524261813773200605759}, { 0.636761861236284230413943435}, - { 0.883363338665731594736308015}, { 0.468688822035827933697617870}, - {-0.468688822035827933697617870}, { 0.883363338665731594736308015}, - { 0.293219162694258650606608599}, { 0.956045251349996443270479823}, - {-0.956045251349996443270479823}, { 0.293219162694258650606608599}, - { 0.957826413027532890321037029}, { 0.287347459544729526477331841}, - {-0.287347459544729526477331841}, { 0.957826413027532890321037029}, - { 0.474100214650550014398580015}, { 0.880470889052160770806542929}, - {-0.880470889052160770806542929}, { 0.474100214650550014398580015}, - { 0.774953106594873878359129282}, { 0.632018735939809021909403706}, - {-0.632018735939809021909403706}, { 0.774953106594873878359129282}, - { 0.101069862754827824987887585}, { 0.994879330794805620591166107}, - {-0.994879330794805620591166107}, { 
0.101069862754827824987887585}, - { 0.981379193313754574318224190}, { 0.192080397049892441679288205}, - {-0.192080397049892441679288205}, { 0.981379193313754574318224190}, - { 0.558118531220556115693702964}, { 0.829761233794523042469023765}, - {-0.829761233794523042469023765}, { 0.558118531220556115693702964}, - { 0.833170164701913186439915922}, { 0.553016705580027531764226988}, - {-0.553016705580027531764226988}, { 0.833170164701913186439915922}, - { 0.198098410717953586179324918}, { 0.980182135968117392690210009}, - {-0.980182135968117392690210009}, { 0.198098410717953586179324918}, - { 0.925049240782677590302371869}, { 0.379847208924051170576281147}, - {-0.379847208924051170576281147}, { 0.925049240782677590302371869}, - { 0.385516053843918864075607949}, { 0.922701128333878570437264227}, - {-0.922701128333878570437264227}, { 0.385516053843918864075607949}, - { 0.709272826438865651316533772}, { 0.704934080375904908852523758}, - {-0.704934080375904908852523758}, { 0.709272826438865651316533772}, - { 0.003067956762965976270145365}, { 0.999995293809576171511580126}, - {-0.999995293809576171511580126}, { 0.003067956762965976270145365} -}; - -const fpr fpr_p2_tab[] = { - { 2.00000000000 }, - { 1.00000000000 }, - { 0.50000000000 }, - { 0.25000000000 }, - { 0.12500000000 }, - { 0.06250000000 }, - { 0.03125000000 }, - { 0.01562500000 }, - { 0.00781250000 }, - { 0.00390625000 }, - { 0.00195312500 } -}; - -#else // yyyFPNATIVE+0 yyyFPEMU+0 - -#error No FP implementation selected - -#endif // yyyFPNATIVE- yyyFPEMU- diff --git a/crypto_sign/falcon-1024/m4-ct/fpr.h b/crypto_sign/falcon-1024/m4-ct/fpr.h deleted file mode 100644 index 8176212d..00000000 --- a/crypto_sign/falcon-1024/m4-ct/fpr.h +++ /dev/null @@ -1,893 +0,0 @@ -/* - * Floating-point operations. 
- * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#if FALCON_FPEMU // yyyFPEMU+1 yyyFPNATIVE+0 - -/* ====================================================================== */ -/* - * Custom floating-point implementation with integer arithmetics. We - * use IEEE-754 "binary64" format, with some simplifications: - * - * - Top bit is s = 1 for negative, 0 for positive. - * - * - Exponent e uses the next 11 bits (bits 52 to 62, inclusive). - * - * - Mantissa m uses the 52 low bits. - * - * Encoded value is, in general: (-1)^s * 2^(e-1023) * (1 + m*2^(-52)) - * i.e. 
the mantissa really is a 53-bit number (less than 2.0, but not - * less than 1.0), but the top bit (equal to 1 by definition) is omitted - * in the encoding. - * - * In IEEE-754, there are some special values: - * - * - If e = 2047, then the value is either an infinite (m = 0) or - * a NaN (m != 0). - * - * - If e = 0, then the value is either a zero (m = 0) or a subnormal, - * aka "denormalized number" (m != 0). - * - * Of these, we only need the zeros. The caller is responsible for not - * providing operands that would lead to infinites, NaNs or subnormals. - * If inputs are such that values go out of range, then indeterminate - * values are returned (it would still be deterministic, but no specific - * value may be relied upon). - * - * At the C level, the three parts are stored in a 64-bit unsigned - * word. - * - * One may note that a property of the IEEE-754 format is that order - * is preserved for positive values: if two positive floating-point - * values x and y are such that x < y, then their respective encodings - * as _signed_ 64-bit integers i64(x) and i64(y) will be such that - * i64(x) < i64(y). For negative values, order is reversed: if x < 0, - * y < 0, and x < y, then ia64(x) > ia64(y). - * - * IMPORTANT ASSUMPTIONS: - * ====================== - * - * For proper computations, and constant-time behaviour, we assume the - * following: - * - * - 32x32->64 multiplication (unsigned) has an execution time that - * is independent of its operands. This is true of most modern - * x86 and ARM cores. Notable exceptions are the ARM Cortex M0, M0+ - * and M3 (in the M0 and M0+, this is done in software, so it depends - * on that routine), and the PowerPC cores from the G3/G4 lines. - * For more info, see: https://www.bearssl.org/ctmul.html - * - * - Left-shifts and right-shifts of 32-bit values have an execution - * time which does not depend on the shifted value nor on the - * shift count. 
An historical exception is the Pentium IV, but most - * modern CPU have barrel shifters. Some small microcontrollers - * might have varying-time shifts (not the ARM Cortex M*, though). - * - * - Right-shift of a signed negative value performs a sign extension. - * As per the C standard, this operation returns an - * implementation-defined result (this is NOT an "undefined - * behaviour"). On most/all systems, an arithmetic shift is - * performed, because this is what makes most sense. - */ - -/* - * Normally we should declare the 'fpr' type to be a struct or union - * around the internal 64-bit value; however, we want to use the - * direct 64-bit integer type to enable a lighter call convention on - * ARM platforms. This means that direct (invalid) use of operators - * such as '*' or '+' will not be caught by the compiler. We rely on - * the "normal" (non-emulated) code to detect such instances. - */ -typedef uint64_t fpr; - -/* - * For computations, we split values into an integral mantissa in the - * 2^54..2^55 range, and an (adjusted) exponent. The lowest bit is - * "sticky" (it is set to 1 if any of the bits below it is 1); when - * re-encoding, the low two bits are dropped, but may induce an - * increment in the value for proper rounding. - */ - -/* - * Right-shift a 64-bit unsigned value by a possibly secret shift count. - * We assumed that the underlying architecture had a barrel shifter for - * 32-bit shifts, but for 64-bit shifts on a 32-bit system, this will - * typically invoke a software routine that is not necessarily - * constant-time; hence the function below. - * - * Shift count n MUST be in the 0..63 range. - */ -static inline uint64_t -fpr_ursh(uint64_t x, int n) -{ - x ^= (x ^ (x >> 32)) & -(uint64_t)(n >> 5); - return x >> (n & 31); -} - -/* - * Right-shift a 64-bit signed value by a possibly secret shift count - * (see fpr_ursh() for the rationale). - * - * Shift count n MUST be in the 0..63 range. 
- */ -static inline int64_t -fpr_irsh(int64_t x, int n) -{ - x ^= (x ^ (x >> 32)) & -(int64_t)(n >> 5); - return x >> (n & 31); -} - -/* - * Left-shift a 64-bit unsigned value by a possibly secret shift count - * (see fpr_ursh() for the rationale). - * - * Shift count n MUST be in the 0..63 range. - */ -static inline uint64_t -fpr_ulsh(uint64_t x, int n) -{ - x ^= (x ^ (x << 32)) & -(uint64_t)(n >> 5); - return x << (n & 31); -} - -/* - * Expectations: - * s = 0 or 1 - * exponent e is "arbitrary" and unbiased - * 2^54 <= m < 2^55 - * Numerical value is (-1)^2 * m * 2^e - * - * Exponents which are too low lead to value zero. If the exponent is - * too large, the returned value is indeterminate. - * - * If m = 0, then a zero is returned (using the provided sign). - * If e < -1076, then a zero is returned (regardless of the value of m). - * If e >= -1076 and e != 0, m must be within the expected range - * (2^54 to 2^55-1). - */ -static inline fpr -FPR(int s, int e, uint64_t m) -{ - fpr x; - uint32_t t; - unsigned f; - - /* - * If e >= -1076, then the value is "normal"; otherwise, it - * should be a subnormal, which we clamp down to zero. - */ - e += 1076; - t = (uint32_t)e >> 31; - m &= (uint64_t)t - 1; - - /* - * If m = 0 then we want a zero; make e = 0 too, but conserve - * the sign. - */ - t = (uint32_t)(m >> 54); - e &= -(int)t; - - /* - * The 52 mantissa bits come from m. Value m has its top bit set - * (unless it is a zero); we leave it "as is": the top bit will - * increment the exponent by 1, except when m = 0, which is - * exactly what we want. - */ - x = (((uint64_t)s << 63) | (m >> 2)) + ((uint64_t)(uint32_t)e << 52); - - /* - * Rounding: if the low three bits of m are 011, 110 or 111, - * then the value should be incremented to get the next - * representable value. This implements the usual - * round-to-nearest rule (with preference to even values in case - * of a tie). 
Note that the increment may make a carry spill - * into the exponent field, which is again exactly what we want - * in that case. - */ - f = (unsigned)m & 7U; - x += (0xC8U >> f) & 1; - return x; -} - -#define fpr_scaled Zf(fpr_scaled) -fpr fpr_scaled(int64_t i, int sc); - -static inline fpr -fpr_of(int64_t i) -{ - return fpr_scaled(i, 0); -} - -static const fpr fpr_q = 4667981563525332992; -static const fpr fpr_inverse_of_q = 4545632735260551042; -static const fpr fpr_inv_2sqrsigma0 = 4594603506513722306; -static const fpr fpr_inv_sigma = 4573359825155195350; -static const fpr fpr_sigma_min_9 = 4608495221497168882; -static const fpr fpr_sigma_min_10 = 4608586345619182117; -static const fpr fpr_log2 = 4604418534313441775; -static const fpr fpr_inv_log2 = 4609176140021203710; -static const fpr fpr_bnorm_max = 4670353323383631276; -static const fpr fpr_zero = 0; -static const fpr fpr_one = 4607182418800017408; -static const fpr fpr_two = 4611686018427387904; -static const fpr fpr_onehalf = 4602678819172646912; -static const fpr fpr_invsqrt2 = 4604544271217802189; -static const fpr fpr_invsqrt8 = 4600040671590431693; -static const fpr fpr_ptwo31 = 4746794007248502784; -static const fpr fpr_ptwo31m1 = 4746794007244308480; -static const fpr fpr_mtwo31m1 = 13970166044099084288U; -static const fpr fpr_ptwo63m1 = 4890909195324358656; -static const fpr fpr_mtwo63m1 = 14114281232179134464U; -static const fpr fpr_ptwo63 = 4890909195324358656; - -static inline int64_t -fpr_rint(fpr x) -{ - uint64_t m, d; - int e; - uint32_t s, dd, f; - - /* - * We assume that the value fits in -(2^63-1)..+(2^63-1). We can - * thus extract the mantissa as a 63-bit integer, then right-shift - * it as needed. - */ - m = ((x << 10) | ((uint64_t)1 << 62)) & (((uint64_t)1 << 63) - 1); - e = 1085 - ((int)(x >> 52) & 0x7FF); - - /* - * If a shift of more than 63 bits is needed, then simply set m - * to zero. This also covers the case of an input operand equal - * to zero. 
- */ - m &= -(uint64_t)((uint32_t)(e - 64) >> 31); - e &= 63; - - /* - * Right-shift m as needed. Shift count is e. Proper rounding - * mandates that: - * - If the highest dropped bit is zero, then round low. - * - If the highest dropped bit is one, and at least one of the - * other dropped bits is one, then round up. - * - If the highest dropped bit is one, and all other dropped - * bits are zero, then round up if the lowest kept bit is 1, - * or low otherwise (i.e. ties are broken by "rounding to even"). - * - * We thus first extract a word consisting of all the dropped bit - * AND the lowest kept bit; then we shrink it down to three bits, - * the lowest being "sticky". - */ - d = fpr_ulsh(m, 63 - e); - dd = (uint32_t)d | ((uint32_t)(d >> 32) & 0x1FFFFFFF); - f = (uint32_t)(d >> 61) | ((dd | -dd) >> 31); - m = fpr_ursh(m, e) + (uint64_t)((0xC8U >> f) & 1U); - - /* - * Apply the sign bit. - */ - s = (uint32_t)(x >> 63); - return ((int64_t)m ^ -(int64_t)s) + (int64_t)s; -} - -static inline int64_t -fpr_floor(fpr x) -{ - uint64_t t; - int64_t xi; - int e, cc; - - /* - * We extract the integer as a _signed_ 64-bit integer with - * a scaling factor. Since we assume that the value fits - * in the -(2^63-1)..+(2^63-1) range, we can left-shift the - * absolute value to make it in the 2^62..2^63-1 range: we - * will only need a right-shift afterwards. - */ - e = (int)(x >> 52) & 0x7FF; - t = x >> 63; - xi = (int64_t)(((x << 10) | ((uint64_t)1 << 62)) - & (((uint64_t)1 << 63) - 1)); - xi = (xi ^ -(int64_t)t) + (int64_t)t; - cc = 1085 - e; - - /* - * We perform an arithmetic right-shift on the value. This - * applies floor() semantics on both positive and negative values - * (rounding toward minus infinity). - */ - xi = fpr_irsh(xi, cc & 63); - - /* - * If the true shift count was 64 or more, then we should instead - * replace xi with 0 (if nonnegative) or -1 (if negative). 
Edge - * case: -0 will be floored to -1, not 0 (whether this is correct - * is debatable; in any case, the other functions normalize zero - * to +0). - * - * For an input of zero, the non-shifted xi was incorrect (we used - * a top implicit bit of value 1, not 0), but this does not matter - * since this operation will clamp it down. - */ - xi ^= (xi ^ -(int64_t)t) & -(int64_t)((uint32_t)(63 - cc) >> 31); - return xi; -} - -static inline int64_t -fpr_trunc(fpr x) -{ - uint64_t t, xu; - int e, cc; - - /* - * Extract the absolute value. Since we assume that the value - * fits in the -(2^63-1)..+(2^63-1) range, we can left-shift - * the absolute value into the 2^62..2^63-1 range, and then - * do a right shift afterwards. - */ - e = (int)(x >> 52) & 0x7FF; - xu = ((x << 10) | ((uint64_t)1 << 62)) & (((uint64_t)1 << 63) - 1); - cc = 1085 - e; - xu = fpr_ursh(xu, cc & 63); - - /* - * If the exponent is too low (cc > 63), then the shift was wrong - * and we must clamp the value to 0. This also covers the case - * of an input equal to zero. - */ - xu &= -(uint64_t)((uint32_t)(cc - 64) >> 31); - - /* - * Apply back the sign, if the source value is negative. - */ - t = x >> 63; - xu = (xu ^ -t) + t; - return *(int64_t *)&xu; -} - -#define fpr_add Zf(fpr_add) -fpr fpr_add(fpr x, fpr y); - -static inline fpr -fpr_sub(fpr x, fpr y) -{ - y ^= (uint64_t)1 << 63; - return fpr_add(x, y); -} - -static inline fpr -fpr_neg(fpr x) -{ - x ^= (uint64_t)1 << 63; - return x; -} - -static inline fpr -fpr_half(fpr x) -{ - /* - * To divide a value by 2, we just have to subtract 1 from its - * exponent, but we have to take care of zero. - */ - uint32_t t; - - x -= (uint64_t)1 << 52; - t = (((uint32_t)(x >> 52) & 0x7FF) + 1) >> 11; - x &= (uint64_t)t - 1; - return x; -} - -static inline fpr -fpr_double(fpr x) -{ - /* - * To double a value, we just increment by one the exponent. We - * don't care about infinites or NaNs; however, 0 is a - * special case. 
- */ - x += (uint64_t)((((unsigned)(x >> 52) & 0x7FFU) + 0x7FFU) >> 11) << 52; - return x; -} - -#define fpr_mul Zf(fpr_mul) -fpr fpr_mul(fpr x, fpr y); - -static inline fpr -fpr_sqr(fpr x) -{ - return fpr_mul(x, x); -} - -#define fpr_div Zf(fpr_div) -fpr fpr_div(fpr x, fpr y); - -static inline fpr -fpr_inv(fpr x) -{ - return fpr_div(4607182418800017408u, x); -} - -#define fpr_sqrt Zf(fpr_sqrt) -fpr fpr_sqrt(fpr x); - -static inline int -fpr_lt(fpr x, fpr y) -{ - /* - * If x >= 0 or y >= 0, a signed comparison yields the proper - * result: - * - For positive values, the order is preserved. - * - The sign bit is at the same place as in integers, so - * sign is preserved. - * - * If both x and y are negative, then the order is reversed. - * We cannot simply invert the comparison result in that case - * because it would not handle the edge case x = y properly. - */ - int cc0, cc1; - - cc0 = *(int64_t *)&x < *(int64_t *)&y; - cc1 = *(int64_t *)&x > *(int64_t *)&y; - return cc0 ^ ((cc0 ^ cc1) & (int)((x & y) >> 63)); -} - -/* - * Compute exp(x) for x such that |x| <= ln 2. We want a precision of 50 - * bits or so. - */ -#define fpr_expm_p63 Zf(fpr_expm_p63) -uint64_t fpr_expm_p63(fpr x, fpr ccs); - -#define fpr_gm_tab Zf(fpr_gm_tab) -extern const fpr fpr_gm_tab[]; - -#define fpr_p2_tab Zf(fpr_p2_tab) -extern const fpr fpr_p2_tab[]; - -/* ====================================================================== */ - -#elif FALCON_FPNATIVE // yyyFPEMU+0 yyyFPNATIVE+1 - -/* ====================================================================== */ - -#include - -/* - * We wrap the native 'double' type into a structure so that the C compiler - * complains if we inadvertently use raw arithmetic operators on the 'fpr' - * type instead of using the inline functions below. This should have no - * extra runtime cost, since all the functions below are 'inline'. 
- */ -typedef struct { double v; } fpr; - -static inline fpr -FPR(double v) -{ - fpr x; - - x.v = v; - return x; -} - -static inline fpr -fpr_of(int64_t i) -{ - return FPR((double)i); -} - -static const fpr fpr_q = { 12289.0 }; -static const fpr fpr_inverse_of_q = { 1.0 / 12289.0 }; -static const fpr fpr_inv_2sqrsigma0 = { .150865048875372721532312163019 }; -static const fpr fpr_inv_sigma = { .005819826392951607426919370871 }; -static const fpr fpr_sigma_min_9 = { 1.291500756233514568549480827642 }; -static const fpr fpr_sigma_min_10 = { 1.311734375905083682667395805765 }; -static const fpr fpr_log2 = { 0.69314718055994530941723212146 }; -static const fpr fpr_inv_log2 = { 1.4426950408889634073599246810 }; -static const fpr fpr_bnorm_max = { 16822.4121 }; -static const fpr fpr_zero = { 0.0 }; -static const fpr fpr_one = { 1.0 }; -static const fpr fpr_two = { 2.0 }; -static const fpr fpr_onehalf = { 0.5 }; -static const fpr fpr_invsqrt2 = { 0.707106781186547524400844362105 }; -static const fpr fpr_invsqrt8 = { 0.353553390593273762200422181052 }; -static const fpr fpr_ptwo31 = { 2147483648.0 }; -static const fpr fpr_ptwo31m1 = { 2147483647.0 }; -static const fpr fpr_mtwo31m1 = { -2147483647.0 }; -static const fpr fpr_ptwo63m1 = { 9223372036854775807.0 }; -static const fpr fpr_mtwo63m1 = { -9223372036854775807.0 }; -static const fpr fpr_ptwo63 = { 9223372036854775808.0 }; - -static inline int64_t -fpr_rint(fpr x) -{ - /* - * We do not want to use llrint() since it might be not - * constant-time. - * - * Suppose that x >= 0. If x >= 2^52, then it is already an - * integer. Otherwise, if x < 2^52, then computing x+2^52 will - * yield a value that will be rounded to the nearest integer - * with exactly the right rules (round-to-nearest-even). - * - * In order to have constant-time processing, we must do the - * computation for both x >= 0 and x < 0 cases, and use a - * cast to an integer to access the sign and select the proper - * value. 
Such casts also allow us to find out if |x| < 2^52. - */ - int64_t sx, tx, rp, rn, m; - uint32_t ub; - - sx = (int64_t)(x.v - 1.0); - tx = (int64_t)x.v; - rp = (int64_t)(x.v + 4503599627370496.0) - 4503599627370496; - rn = (int64_t)(x.v - 4503599627370496.0) + 4503599627370496; - - /* - * If tx >= 2^52 or tx < -2^52, then result is tx. - * Otherwise, if sx >= 0, then result is rp. - * Otherwise, result is rn. We use the fact that when x is - * close to 0 (|x| <= 0.25) then both rp and rn are correct; - * and if x is not close to 0, then trunc(x-1.0) yields the - * appropriate sign. - */ - - /* - * Clamp rp to zero if tx < 0. - * Clamp rn to zero if tx >= 0. - */ - m = sx >> 63; - rn &= m; - rp &= ~m; - - /* - * Get the 12 upper bits of tx; if they are not all zeros or - * all ones, then tx >= 2^52 or tx < -2^52, and we clamp both - * rp and rn to zero. Otherwise, we clamp tx to zero. - */ - ub = (uint32_t)((uint64_t)tx >> 52); - m = -(int64_t)((((ub + 1) & 0xFFF) - 2) >> 31); - rp &= m; - rn &= m; - tx &= ~m; - - /* - * Only one of tx, rn or rp (at most) can be non-zero at this - * point. - */ - return tx | rn | rp; -} - -static inline int64_t -fpr_floor(fpr x) -{ - int64_t r; - - /* - * The cast performs a trunc() (rounding toward 0) and thus is - * wrong by 1 for most negative values. The correction below is - * constant-time as long as the compiler turns the - * floating-point conversion result into a 0/1 integer without a - * conditional branch or another non-constant-time construction. - * This should hold on all modern architectures with an FPU (and - * if it is false on a given arch, then chances are that the FPU - * itself is not constant-time, making the point moot). 
- */ - r = (int64_t)x.v; - return r - (x.v < (double)r); -} - -static inline int64_t -fpr_trunc(fpr x) -{ - return (int64_t)x.v; -} - -static inline fpr -fpr_add(fpr x, fpr y) -{ - return FPR(x.v + y.v); -} - -static inline fpr -fpr_sub(fpr x, fpr y) -{ - return FPR(x.v - y.v); -} - -static inline fpr -fpr_neg(fpr x) -{ - return FPR(-x.v); -} - -static inline fpr -fpr_half(fpr x) -{ - return FPR(x.v * 0.5); -} - -static inline fpr -fpr_double(fpr x) -{ - return FPR(x.v + x.v); -} - -static inline fpr -fpr_mul(fpr x, fpr y) -{ - return FPR(x.v * y.v); -} - -static inline fpr -fpr_sqr(fpr x) -{ - return FPR(x.v * x.v); -} - -static inline fpr -fpr_inv(fpr x) -{ - return FPR(1.0 / x.v); -} - -static inline fpr -fpr_div(fpr x, fpr y) -{ - return FPR(x.v / y.v); -} - -#if FALCON_AVX2 // yyyAVX2+1 -TARGET_AVX2 -static inline void -fpr_sqrt_avx2(double *t) -{ - __m128d x; - - x = _mm_load1_pd(t); - x = _mm_sqrt_pd(x); - _mm_storel_pd(t, x); -} -#endif // yyyAVX2- - -static inline fpr -fpr_sqrt(fpr x) -{ - /* - * We prefer not to have a dependency on libm when it can be - * avoided. On x86, calling the sqrt() libm function inlines - * the relevant opcode (fsqrt or sqrtsd, depending on whether - * the 387 FPU or SSE2 is used for floating-point operations) - * but then makes an optional call to the library function - * for proper error handling, in case the operand is negative. - * - * To avoid this dependency, we use intrinsics or inline assembly - * on recognized platforms: - * - * - If AVX2 is explicitly enabled, then we use SSE2 intrinsics. - * - * - On GCC/Clang with SSE maths, we use SSE2 intrinsics. - * - * - On GCC/Clang on i386, or MSVC on i386, we use inline assembly - * to call the 387 FPU fsqrt opcode. - * - * - On GCC/Clang/XLC on PowerPC, we use inline assembly to call - * the fsqrt opcode (Clang needs a special hack). - * - * - On GCC/Clang on ARM with hardware floating-point, we use - * inline assembly to call the vqsrt.f64 opcode. 
Due to a - * complex ecosystem of compilers and assembly syntaxes, we - * have to call it "fsqrt" or "fsqrtd", depending on case. - * - * If the platform is not recognized, a call to the system - * library function sqrt() is performed. On some compilers, this - * may actually inline the relevant opcode, and call the library - * function only when the input is invalid (e.g. negative); - * Falcon never actually calls sqrt() on a negative value, but - * the dependency to libm will still be there. - */ - -#if FALCON_AVX2 // yyyAVX2+1 - fpr_sqrt_avx2(&x.v); - return x; -#else // yyyAVX2+0 -#if defined __GNUC__ && defined __SSE2_MATH__ - return FPR(_mm_cvtsd_f64(_mm_sqrt_pd(_mm_set1_pd(x.v)))); -#elif defined __GNUC__ && defined __i386__ - __asm__ __volatile__ ( - "fldl %0\n\t" - "fsqrt\n\t" - "fstpl %0\n\t" - : "+m" (x.v) : : ); - return x; -#elif defined _M_IX86 - __asm { - fld x.v - fsqrt - fstp x.v - } - return x; -#elif defined __PPC__ && defined __GNUC__ - fpr y; - -#if defined __clang__ - /* - * Normally we should use a 'd' constraint (register that contains - * a 'double' value) but Clang 3.8.1 chokes on it. Instead we use - * an 'f' constraint, counting on the fact that 'float' values - * are managed in double-precision registers anyway, and the - * compiler will not add extra rounding steps. - */ - __asm__ ( "fsqrt %0, %1" : "=f" (y.v) : "f" (x.v) : ); -#else - __asm__ ( "fsqrt %0, %1" : "=d" (y.v) : "d" (x.v) : ); -#endif - return y; -#elif (defined __ARM_FP && ((__ARM_FP & 0x08) == 0x08)) \ - || (!defined __ARM_FP && defined __ARM_VFPV2__) - /* - * On ARM, assembly syntaxes are a bit of a mess, depending on - * whether GCC or Clang is used, and the binutils version, and - * whether this is 32-bit or 64-bit mode. 
The code below appears - * to work on: - * 32-bit GCC-4.9.2 Clang-3.5 Binutils-2.25 - * 64-bit GCC-6.3.0 Clang-3.9 Binutils-2.28 - */ -#if defined __aarch64__ && __aarch64__ - __asm__ ( "fsqrt %d0, %d0" : "+w" (x.v) : : ); -#else - __asm__ ( "fsqrtd %P0, %P0" : "+w" (x.v) : : ); -#endif - return x; -#else - return FPR(sqrt(x.v)); -#endif -#endif // yyyAVX2- -} - -static inline int -fpr_lt(fpr x, fpr y) -{ - return x.v < y.v; -} - -TARGET_AVX2 -static inline uint64_t -fpr_expm_p63(fpr x, fpr ccs) -{ - /* - * Polynomial approximation of exp(-x) is taken from FACCT: - * https://eprint.iacr.org/2018/1234 - * Specifically, values are extracted from the implementation - * referenced from the FACCT article, and available at: - * https://github.com/raykzhao/gaussian - * Tests over more than 24 billions of random inputs in the - * 0..log(2) range have never shown a deviation larger than - * 2^(-50) from the true mathematical value. - */ - -#if FALCON_AVX2 // yyyAVX2+1 - - /* - * AVX2 implementation uses more operations than Horner's method, - * but with a lower expression tree depth. This helps because - * additions and multiplications have a latency of 4 cycles on - * a Skylake, but the CPU can issue two of them per cycle. 
- */ - - static const union { - double d[12]; - __m256d v[3]; - } c = { - { - 0.999999999999994892974086724280, - 0.500000000000019206858326015208, - 0.166666666666984014666397229121, - 0.041666666666110491190622155955, - 0.008333333327800835146903501993, - 0.001388888894063186997887560103, - 0.000198412739277311890541063977, - 0.000024801566833585381209939524, - 0.000002755586350219122514855659, - 0.000000275607356160477811864927, - 0.000000025299506379442070029551, - 0.000000002073772366009083061987 - } - }; - - double d1, d2, d4, d8, y; - __m256d d14, d58, d9c; - - d1 = -x.v; - d2 = d1 * d1; - d4 = d2 * d2; - d8 = d4 * d4; - d14 = _mm256_set_pd(d4, d2 * d1, d2, d1); - d58 = _mm256_mul_pd(d14, _mm256_set1_pd(d4)); - d9c = _mm256_mul_pd(d14, _mm256_set1_pd(d8)); - d14 = _mm256_mul_pd(d14, _mm256_loadu_pd(&c.d[0])); - d58 = FMADD(d58, _mm256_loadu_pd(&c.d[4]), d14); - d9c = FMADD(d9c, _mm256_loadu_pd(&c.d[8]), d58); - d9c = _mm256_hadd_pd(d9c, d9c); - y = 1.0 + _mm_cvtsd_f64(_mm256_castpd256_pd128(d9c)) // _mm256_cvtsd_f64(d9c) - + _mm_cvtsd_f64(_mm256_extractf128_pd(d9c, 1)); - y *= ccs.v; - - /* - * Final conversion goes through int64_t first, because that's what - * the underlying opcode (vcvttsd2si) will do, and we know that the - * result will fit, since x >= 0 and ccs < 1. If we did the - * conversion directly to uint64_t, then the compiler would add some - * extra code to cover the case of a source value of 2^63 or more, - * and though the alternate path would never be exercised, the - * extra comparison would cost us some cycles. - */ - return (uint64_t)(int64_t)(y * fpr_ptwo63.v); - -#else // yyyAVX2+0 - - /* - * Normal implementation uses Horner's method, which minimizes - * the number of operations. 
- */ - - double d, y; - - d = x.v; - y = 0.000000002073772366009083061987; - y = 0.000000025299506379442070029551 - y * d; - y = 0.000000275607356160477811864927 - y * d; - y = 0.000002755586350219122514855659 - y * d; - y = 0.000024801566833585381209939524 - y * d; - y = 0.000198412739277311890541063977 - y * d; - y = 0.001388888894063186997887560103 - y * d; - y = 0.008333333327800835146903501993 - y * d; - y = 0.041666666666110491190622155955 - y * d; - y = 0.166666666666984014666397229121 - y * d; - y = 0.500000000000019206858326015208 - y * d; - y = 0.999999999999994892974086724280 - y * d; - y = 1.000000000000000000000000000000 - y * d; - y *= ccs.v; - return (uint64_t)(y * fpr_ptwo63.v); - -#endif // yyyAVX2- -} - -#define fpr_gm_tab Zf(fpr_gm_tab) -extern const fpr fpr_gm_tab[]; - -#define fpr_p2_tab Zf(fpr_p2_tab) -extern const fpr fpr_p2_tab[]; - -/* ====================================================================== */ - -#else // yyyFPEMU+0 yyyFPNATIVE+0 - -#error No FP implementation selected - -#endif // yyyFPEMU- yyyFPNATIVE- diff --git a/crypto_sign/falcon-1024/m4-ct/inner.h b/crypto_sign/falcon-1024/m4-ct/inner.h deleted file mode 100644 index 1f7d0819..00000000 --- a/crypto_sign/falcon-1024/m4-ct/inner.h +++ /dev/null @@ -1,1168 +0,0 @@ -#ifndef FALCON_INNER_H__ -#define FALCON_INNER_H__ - -/* - * Internal functions for Falcon. This is not the API intended to be - * used by applications; instead, this internal API provides all the - * primitives on which wrappers build to provide external APIs. 
- * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -/* - * IMPORTANT API RULES - * ------------------- - * - * This API has some non-trivial usage rules: - * - * - * - All public functions (i.e. the non-static ones) must be referenced - * with the Zf() macro (e.g. Zf(verify_raw) for the verify_raw() - * function). That macro adds a prefix to the name, which is - * configurable with the FALCON_PREFIX macro. This allows compiling - * the code into a specific "namespace" and potentially including - * several versions of this code into a single application (e.g. to - * have an AVX2 and a non-AVX2 variants and select the one to use at - * runtime based on availability of AVX2 opcodes). 
- * - * - Functions that need temporary buffers expects them as a final - * tmp[] array of type uint8_t*, with a size which is documented for - * each function. However, most have some alignment requirements, - * because they will use the array to store 16-bit, 32-bit or 64-bit - * values (e.g. uint64_t or double). The caller must ensure proper - * alignment. What happens on unaligned access depends on the - * underlying architecture, ranging from a slight time penalty - * to immediate termination of the process. - * - * - Some functions rely on specific rounding rules and precision for - * floating-point numbers. On some systems (in particular 32-bit x86 - * with the 387 FPU), this requires setting an hardware control - * word. The caller MUST use set_fpu_cw() to ensure proper precision: - * - * oldcw = set_fpu_cw(2); - * Zf(sign_dyn)(...); - * set_fpu_cw(oldcw); - * - * On systems where the native floating-point precision is already - * proper, or integer-based emulation is used, the set_fpu_cw() - * function does nothing, so it can be called systematically. - */ - -// yyyPQCLEAN+0 yyyNIST+0 yyySUPERCOP+0 -#include "config.h" -// yyyPQCLEAN- yyyNIST- yyySUPERCOP- -// yyySUPERCOP+1 -// yyyCONF* -// yyySUPERCOP- - -#include -#include -#include - -#if defined FALCON_AVX2 && FALCON_AVX2 // yyyAVX2+1 -/* - * This implementation uses AVX2 and optionally FMA intrinsics. 
- */ -#include -#ifndef FALCON_LE -#define FALCON_LE 1 -#endif -#ifndef FALCON_UNALIGNED -#define FALCON_UNALIGNED 1 -#endif -#if defined __GNUC__ -#if defined FALCON_FMA && FALCON_FMA -#define TARGET_AVX2 __attribute__((target("avx2,fma"))) -#else -#define TARGET_AVX2 __attribute__((target("avx2"))) -#endif -#elif defined _MSC_VER && _MSC_VER -#pragma warning( disable : 4752 ) -#endif -#if defined FALCON_FMA && FALCON_FMA -#define FMADD(a, b, c) _mm256_fmadd_pd(a, b, c) -#define FMSUB(a, b, c) _mm256_fmsub_pd(a, b, c) -#else -#define FMADD(a, b, c) _mm256_add_pd(_mm256_mul_pd(a, b), c) -#define FMSUB(a, b, c) _mm256_sub_pd(_mm256_mul_pd(a, b), c) -#endif -#endif // yyyAVX2- - -// yyyNIST+0 yyyPQCLEAN+0 -/* - * On MSVC, disable warning about applying unary minus on an unsigned - * type: this is perfectly defined standard behaviour and we do it - * quite often. - */ -#if defined _MSC_VER && _MSC_VER -#pragma warning( disable : 4146 ) -#endif - -// yyySUPERCOP+0 -/* - * Enable ARM assembly on any ARMv7m platform (if it was not done before). 
- */ -#ifndef FALCON_ASM_CORTEXM4 -#if (defined __ARM_ARCH_7EM__ && __ARM_ARCH_7EM__) \ - && (defined __ARM_FEATURE_DSP && __ARM_FEATURE_DSP) -#define FALCON_ASM_CORTEXM4 1 -#else -#define FALCON_ASM_CORTEXM4 0 -#endif -#endif -// yyySUPERCOP- - -#if defined __i386__ || defined _M_IX86 \ - || defined __x86_64__ || defined _M_X64 || \ - (defined _ARCH_PWR8 && \ - (defined __LITTLE_ENDIAN || defined __LITTLE_ENDIAN__)) - -#ifndef FALCON_LE -#define FALCON_LE 1 -#endif -#ifndef FALCON_UNALIGNED -#define FALCON_UNALIGNED 1 -#endif - -#elif defined FALCON_ASM_CORTEXM4 && FALCON_ASM_CORTEXM4 - -#ifndef FALCON_LE -#define FALCON_LE 1 -#endif -#ifndef FALCON_UNALIGNED -#define FALCON_UNALIGNED 0 -#endif - -#elif (defined __LITTLE_ENDIAN__ && __LITTLE_ENDIAN__) \ - || (defined __BYTE_ORDER__ && defined __ORDER_LITTLE_ENDIAN__ \ - && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) - -#ifndef FALCON_LE -#define FALCON_LE 1 -#endif -#ifndef FALCON_UNALIGNED -#define FALCON_UNALIGNED 0 -#endif - -#else - -#ifndef FALCON_LE -#define FALCON_LE 0 -#endif -#ifndef FALCON_UNALIGNED -#define FALCON_UNALIGNED 0 -#endif - -#endif - -/* - * We ensure that both FALCON_FPEMU and FALCON_FPNATIVE are defined, - * with compatible values (exactly one of them must be non-zero). - * If none is defined, then default FP implementation is 'native' - * except on ARM Cortex M4. 
- */ -#if !defined FALCON_FPEMU && !defined FALCON_FPNATIVE - -#if (defined __ARM_FP && ((__ARM_FP & 0x08) == 0x08)) \ - || (!defined __ARM_FP && defined __ARM_VFPV2__) -#define FALCON_FPEMU 0 -#define FALCON_FPNATIVE 1 -#elif defined FALCON_ASM_CORTEXM4 && FALCON_ASM_CORTEXM4 -#define FALCON_FPEMU 1 -#define FALCON_FPNATIVE 0 -#else -#define FALCON_FPEMU 0 -#define FALCON_FPNATIVE 1 -#endif - -#elif defined FALCON_FPEMU && !defined FALCON_FPNATIVE - -#if FALCON_FPEMU -#define FALCON_FPNATIVE 0 -#else -#define FALCON_FPNATIVE 1 -#endif - -#elif defined FALCON_FPNATIVE && !defined FALCON_FPEMU - -#if FALCON_FPNATIVE -#define FALCON_FPEMU 0 -#else -#define FALCON_FPEMU 1 -#endif - -#endif - -#if (FALCON_FPEMU && FALCON_FPNATIVE) || (!FALCON_FPEMU && !FALCON_FPNATIVE) -#error Exactly one of FALCON_FPEMU and FALCON_FPNATIVE must be selected -#endif - -// yyySUPERCOP+0 -/* - * For seed generation from the operating system: - * - On Linux and glibc-2.25+, FreeBSD 12+ and OpenBSD, use getentropy(). - * - On Unix-like systems, use /dev/urandom (including as a fallback - * for failed getentropy() calls). - * - On Windows, use CryptGenRandom(). 
- */ - -#ifndef FALCON_RAND_GETENTROPY -#if (defined __linux__ && defined __GLIBC__ \ - && (__GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 25))) \ - || (defined __FreeBSD__ && __FreeBSD__ >= 12) \ - || defined __OpenBSD__ -#define FALCON_RAND_GETENTROPY 1 -#else -#define FALCON_RAND_GETENTROPY 0 -#endif -#endif - -#ifndef FALCON_RAND_URANDOM -#if defined _AIX \ - || defined __ANDROID__ \ - || defined __FreeBSD__ \ - || defined __NetBSD__ \ - || defined __OpenBSD__ \ - || defined __DragonFly__ \ - || defined __linux__ \ - || (defined __sun && (defined __SVR4 || defined __svr4__)) \ - || (defined __APPLE__ && defined __MACH__) -#define FALCON_RAND_URANDOM 1 -#else -#define FALCON_RAND_URANDOM 0 -#endif -#endif - -#ifndef FALCON_RAND_WIN32 -#if defined _WIN32 || defined _WIN64 -#define FALCON_RAND_WIN32 1 -#else -#define FALCON_RAND_WIN32 0 -#endif -#endif -// yyySUPERCOP- - -/* - * For still undefined compile-time macros, define them to 0 to avoid - * warnings with -Wundef. - */ -#ifndef FALCON_AVX2 -#define FALCON_AVX2 0 -#endif -#ifndef FALCON_FMA -#define FALCON_FMA 0 -#endif -#ifndef FALCON_KG_CHACHA20 -#define FALCON_KG_CHACHA20 0 -#endif -// yyyNIST- yyyPQCLEAN- - -// yyyPQCLEAN+0 yyySUPERCOP+0 -/* - * "Naming" macro used to apply a consistent prefix over all global - * symbols. - */ -#ifndef FALCON_PREFIX -#define FALCON_PREFIX falcon_inner -#endif -#define Zf(name) Zf_(FALCON_PREFIX, name) -#define Zf_(prefix, name) Zf__(prefix, name) -#define Zf__(prefix, name) prefix ## _ ## name -// yyyPQCLEAN- yyySUPERCOP- - -// yyyAVX2+1 -/* - * We use the TARGET_AVX2 macro to tag some functions which, in some - * configurations, may use AVX2 and FMA intrinsics; this depends on - * the compiler. In all other cases, we just define it to emptiness - * (i.e. it will have no effect). 
- */ -#ifndef TARGET_AVX2 -#define TARGET_AVX2 -#endif -// yyyAVX2- - -/* - * Some computations with floating-point elements, in particular - * rounding to the nearest integer, rely on operations using _exactly_ - * the precision of IEEE-754 binary64 type (i.e. 52 bits). On 32-bit - * x86, the 387 FPU may be used (depending on the target OS) and, in - * that case, may use more precision bits (i.e. 64 bits, for an 80-bit - * total type length); to prevent miscomputations, we define an explicit - * function that modifies the precision in the FPU control word. - * - * set_fpu_cw() sets the precision to the provided value, and returns - * the previously set precision; callers are supposed to restore the - * previous precision on exit. The correct (52-bit) precision is - * configured with the value "2". On unsupported compilers, or on - * targets other than 32-bit x86, or when the native 'double' type is - * not used, the set_fpu_cw() function does nothing at all. - */ -#if FALCON_FPNATIVE // yyyFPNATIVE+1 -#if defined __GNUC__ && defined __i386__ -static inline unsigned -set_fpu_cw(unsigned x) -{ - unsigned short t; - unsigned old; - - __asm__ __volatile__ ("fstcw %0" : "=m" (t) : : ); - old = (t & 0x0300u) >> 8; - t = (unsigned short)((t & ~0x0300u) | (x << 8)); - __asm__ __volatile__ ("fldcw %0" : : "m" (t) : ); - return old; -} -#elif defined _M_IX86 -static inline unsigned -set_fpu_cw(unsigned x) -{ - unsigned short t; - unsigned old; - - __asm { fstcw t } - old = (t & 0x0300u) >> 8; - t = (unsigned short)((t & ~0x0300u) | (x << 8)); - __asm { fldcw t } - return old; -} -#else -static inline unsigned -set_fpu_cw(unsigned x) -{ - return x; -} -#endif -#else // yyyFPNATIVE+0 -static inline unsigned -set_fpu_cw(unsigned x) -{ - return x; -} -#endif // yyyFPNATIVE- - -#if FALCON_FPNATIVE && !FALCON_AVX2 // yyyFPNATIVE+1 yyyAVX2+0 -/* - * If using the native 'double' type but not AVX2 code, on an x86 - * machine with SSE2 activated for maths, then we will use the - * 
SSE2 intrinsics. - */ -#if defined __GNUC__ && defined __SSE2_MATH__ -#include -#endif -#endif // yyyFPNATIVE- yyyAVX2- - -#if FALCON_FPNATIVE // yyyFPNATIVE+1 -/* - * For optimal reproducibility of values, we need to disable contraction - * of floating-point expressions; otherwise, on some architectures (e.g. - * PowerPC), the compiler may generate fused-multiply-add opcodes that - * may round differently than two successive separate opcodes. C99 defines - * a standard pragma for that, but GCC-6.2.2 appears to ignore it, - * hence the GCC-specific pragma (that Clang does not support). - */ -#if defined __clang__ -#pragma STDC FP_CONTRACT OFF -#elif defined __GNUC__ -#pragma GCC optimize ("fp-contract=off") -#endif -#endif // yyyFPNATIVE- - -// yyyPQCLEAN+0 -/* - * MSVC 2015 does not know the C99 keyword 'restrict'. - */ -#if defined _MSC_VER && _MSC_VER -#ifndef restrict -#define restrict __restrict -#endif -#endif -// yyyPQCLEAN- - -/* ==================================================================== */ -/* - * SHAKE256 implementation (shake.c). - * - * API is defined to be easily replaced with the fips202.h API defined - * as part of PQClean. 
- */ - -// yyyPQCLEAN+0 -/* -typedef struct { - union { - uint64_t A[25]; - uint8_t dbuf[200]; - } st; - uint64_t dptr; -} inner_shake256_context; - -#define inner_shake256_init Zf(i_shake256_init) -#define inner_shake256_inject Zf(i_shake256_inject) -#define inner_shake256_flip Zf(i_shake256_flip) -#define inner_shake256_extract Zf(i_shake256_extract) - -void Zf(i_shake256_init)( - inner_shake256_context *sc); -void Zf(i_shake256_inject)( - inner_shake256_context *sc, const uint8_t *in, size_t len); -void Zf(i_shake256_flip)( - inner_shake256_context *sc); -void Zf(i_shake256_extract)( - inner_shake256_context *sc, uint8_t *out, size_t len); -*/ - -// yyyPQCLEAN+1 - -#include "fips202.h" - -#define inner_shake256_context shake256incctx -#define inner_shake256_init(sc) shake256_inc_init(sc) -#define inner_shake256_inject(sc, in, len) shake256_inc_absorb(sc, in, len) -#define inner_shake256_flip(sc) shake256_inc_finalize(sc) -#define inner_shake256_extract(sc, out, len) shake256_inc_squeeze(out, len, sc) - -// yyyPQCLEAN+0 - -// yyyPQCLEAN- - -/* ==================================================================== */ -/* - * Encoding/decoding functions (codec.c). - * - * Encoding functions take as parameters an output buffer (out) with - * a given maximum length (max_out_len); returned value is the actual - * number of bytes which have been written. If the output buffer is - * not large enough, then 0 is returned (some bytes may have been - * written to the buffer). If 'out' is NULL, then 'max_out_len' is - * ignored; instead, the function computes and returns the actual - * required output length (in bytes). - * - * Decoding functions take as parameters an input buffer (in) with - * its maximum length (max_in_len); returned value is the actual number - * of bytes that have been read from the buffer. If the provided length - * is too short, then 0 is returned. - * - * Values to encode or decode are vectors of integers, with N = 2^logn - * elements. 
- * - * Three encoding formats are defined: - * - * - modq: sequence of values modulo 12289, each encoded over exactly - * 14 bits. The encoder and decoder verify that integers are within - * the valid range (0..12288). Values are arrays of uint16. - * - * - trim: sequence of signed integers, a specified number of bits - * each. The number of bits is provided as parameter and includes - * the sign bit. Each integer x must be such that |x| < 2^(bits-1) - * (which means that the -2^(bits-1) value is forbidden); encode and - * decode functions check that property. Values are arrays of - * int16_t or int8_t, corresponding to names 'trim_i16' and - * 'trim_i8', respectively. - * - * - comp: variable-length encoding for signed integers; each integer - * uses a minimum of 9 bits, possibly more. This is normally used - * only for signatures. - * - */ - -size_t Zf(modq_encode)(void *out, size_t max_out_len, - const uint16_t *x, unsigned logn); -size_t Zf(trim_i16_encode)(void *out, size_t max_out_len, - const int16_t *x, unsigned logn, unsigned bits); -size_t Zf(trim_i8_encode)(void *out, size_t max_out_len, - const int8_t *x, unsigned logn, unsigned bits); -size_t Zf(comp_encode)(void *out, size_t max_out_len, - const int16_t *x, unsigned logn); - -size_t Zf(modq_decode)(uint16_t *x, unsigned logn, - const void *in, size_t max_in_len); -size_t Zf(trim_i16_decode)(int16_t *x, unsigned logn, unsigned bits, - const void *in, size_t max_in_len); -size_t Zf(trim_i8_decode)(int8_t *x, unsigned logn, unsigned bits, - const void *in, size_t max_in_len); -size_t Zf(comp_decode)(int16_t *x, unsigned logn, - const void *in, size_t max_in_len); - -/* - * Number of bits for key elements, indexed by logn (1 to 10). This - * is at most 8 bits for all degrees, but some degrees may have shorter - * elements. - */ -extern const uint8_t Zf(max_fg_bits)[]; -extern const uint8_t Zf(max_FG_bits)[]; - -/* - * Maximum size, in bits, of elements in a signature, indexed by logn - * (1 to 10). 
The size includes the sign bit. - */ -extern const uint8_t Zf(max_sig_bits)[]; - -/* ==================================================================== */ -/* - * Support functions used for both signature generation and signature - * verification (common.c). - */ - -/* - * From a SHAKE256 context (must be already flipped), produce a new - * point. This is the non-constant-time version, which may leak enough - * information to serve as a stop condition on a brute force attack on - * the hashed message (provided that the nonce value is known). - */ -void Zf(hash_to_point_vartime)(inner_shake256_context *sc, - uint16_t *x, unsigned logn); - -/* - * From a SHAKE256 context (must be already flipped), produce a new - * point. The temporary buffer (tmp) must have room for 2*2^logn bytes. - * This function is constant-time but is typically more expensive than - * Zf(hash_to_point_vartime)(). - * - * tmp[] must have 16-bit alignment. - */ -void Zf(hash_to_point_ct)(inner_shake256_context *sc, - uint16_t *x, unsigned logn, uint8_t *tmp); - -/* - * Tell whether a given vector (2N coordinates, in two halves) is - * acceptable as a signature. This compares the appropriate norm of the - * vector with the acceptance bound. Returned value is 1 on success - * (vector is short enough to be acceptable), 0 otherwise. - */ -int Zf(is_short)(const int16_t *s1, const int16_t *s2, unsigned logn); - -/* - * Tell whether a given vector (2N coordinates, in two halves) is - * acceptable as a signature. Instead of the first half s1, this - * function receives the "saturated squared norm" of s1, i.e. the - * sum of the squares of the coordinates of s1 (saturated at 2^32-1 - * if the sum exceeds 2^31-1). - * - * Returned value is 1 on success (vector is short enough to be - * acceptable), 0 otherwise. 
- */ -int Zf(is_short_half)(uint32_t sqn, const int16_t *s2, unsigned logn); - -/* ==================================================================== */ -/* - * Signature verification functions (vrfy.c). - */ - -/* - * Convert a public key to NTT + Montgomery format. Conversion is done - * in place. - */ -void Zf(to_ntt_monty)(uint16_t *h, unsigned logn); - -/* - * Internal signature verification code: - * c0[] contains the hashed nonce+message - * s2[] is the decoded signature - * h[] contains the public key, in NTT + Montgomery format - * logn is the degree log - * tmp[] temporary, must have at least 2*2^logn bytes - * Returned value is 1 on success, 0 on error. - * - * tmp[] must have 16-bit alignment. - */ -int Zf(verify_raw)(const uint16_t *c0, const int16_t *s2, - const uint16_t *h, unsigned logn, uint8_t *tmp); - -/* - * Compute the public key h[], given the private key elements f[] and - * g[]. This computes h = g/f mod phi mod q, where phi is the polynomial - * modulus. This function returns 1 on success, 0 on error (an error is - * reported if f is not invertible mod phi mod q). - * - * The tmp[] array must have room for at least 2*2^logn elements. - * tmp[] must have 16-bit alignment. - */ -int Zf(compute_public)(uint16_t *h, - const int8_t *f, const int8_t *g, unsigned logn, uint8_t *tmp); - -/* - * Recompute the fourth private key element. Private key consists in - * four polynomials with small coefficients f, g, F and G, which are - * such that fG - gF = q mod phi; furthermore, f is invertible modulo - * phi and modulo q. This function recomputes G from f, g and F. - * - * The tmp[] array must have room for at least 4*2^logn bytes. - * - * Returned value is 1 in success, 0 on error (f not invertible). - * tmp[] must have 16-bit alignment. - */ -int Zf(complete_private)(int8_t *G, - const int8_t *f, const int8_t *g, const int8_t *F, - unsigned logn, uint8_t *tmp); - -/* - * Test whether a given polynomial is invertible modulo phi and q. 
- * Polynomial coefficients are small integers. - * - * tmp[] must have 16-bit alignment. - */ -int Zf(is_invertible)( - const int16_t *s2, unsigned logn, uint8_t *tmp); - -/* - * Count the number of elements of value zero in the NTT representation - * of the given polynomial: this is the number of primitive 2n-th roots - * of unity (modulo q = 12289) that are roots of the provided polynomial - * (taken modulo q). - * - * tmp[] must have 16-bit alignment. - */ -int Zf(count_nttzero)(const int16_t *sig, unsigned logn, uint8_t *tmp); - -/* - * Internal signature verification with public key recovery: - * h[] receives the public key (NOT in NTT/Montgomery format) - * c0[] contains the hashed nonce+message - * s1[] is the first signature half - * s2[] is the second signature half - * logn is the degree log - * tmp[] temporary, must have at least 2*2^logn bytes - * Returned value is 1 on success, 0 on error. Success is returned if - * the signature is a short enough vector; in that case, the public - * key has been written to h[]. However, the caller must still - * verify that h[] is the correct value (e.g. with regards to a known - * hash of the public key). - * - * h[] may not overlap with any of the other arrays. - * - * tmp[] must have 16-bit alignment. - */ -int Zf(verify_recover)(uint16_t *h, - const uint16_t *c0, const int16_t *s1, const int16_t *s2, - unsigned logn, uint8_t *tmp); - -/* ==================================================================== */ -/* - * Implementation of floating-point real numbers (fpr.h, fpr.c). - */ - -/* - * Real numbers are implemented by an extra header file, included below. - * This is meant to support pluggable implementations. The default - * implementation relies on the C type 'double'. 
- * - * The included file must define the following types, functions and - * constants: - * - * fpr - * type for a real number - * - * fpr fpr_of(int64_t i) - * cast an integer into a real number; source must be in the - * -(2^63-1)..+(2^63-1) range - * - * fpr fpr_scaled(int64_t i, int sc) - * compute i*2^sc as a real number; source 'i' must be in the - * -(2^63-1)..+(2^63-1) range - * - * fpr fpr_ldexp(fpr x, int e) - * compute x*2^e - * - * int64_t fpr_rint(fpr x) - * round x to the nearest integer; x must be in the -(2^63-1) - * to +(2^63-1) range - * - * int64_t fpr_trunc(fpr x) - * round to an integer; this rounds towards zero; value must - * be in the -(2^63-1) to +(2^63-1) range - * - * fpr fpr_add(fpr x, fpr y) - * compute x + y - * - * fpr fpr_sub(fpr x, fpr y) - * compute x - y - * - * fpr fpr_neg(fpr x) - * compute -x - * - * fpr fpr_half(fpr x) - * compute x/2 - * - * fpr fpr_double(fpr x) - * compute x*2 - * - * fpr fpr_mul(fpr x, fpr y) - * compute x * y - * - * fpr fpr_sqr(fpr x) - * compute x * x - * - * fpr fpr_inv(fpr x) - * compute 1/x - * - * fpr fpr_div(fpr x, fpr y) - * compute x/y - * - * fpr fpr_sqrt(fpr x) - * compute the square root of x - * - * int fpr_lt(fpr x, fpr y) - * return 1 if x < y, 0 otherwise - * - * uint64_t fpr_expm_p63(fpr x) - * return exp(x), assuming that 0 <= x < log(2). Returned value - * is scaled to 63 bits (i.e. it really returns 2^63*exp(-x), - * rounded to the nearest integer). Computation should have a - * precision of at least 45 bits. 
- * - * const fpr fpr_gm_tab[] - * array of constants for FFT / iFFT - * - * const fpr fpr_p2_tab[] - * precomputed powers of 2 (by index, 0 to 10) - * - * Constants of type 'fpr': - * - * fpr fpr_q 12289 - * fpr fpr_inverse_of_q 1/12289 - * fpr fpr_inv_2sqrsigma0 1/(2*(1.8205^2)) - * fpr fpr_inv_sigma 1/(1.55*sqrt(12289)) - * fpr fpr_sigma_min_9 1.291500756233514568549480827642 - * fpr fpr_sigma_min_10 1.311734375905083682667395805765 - * fpr fpr_log2 log(2) - * fpr fpr_inv_log2 1/log(2) - * fpr fpr_bnorm_max 16822.4121 - * fpr fpr_zero 0 - * fpr fpr_one 1 - * fpr fpr_two 2 - * fpr fpr_onehalf 0.5 - * fpr fpr_ptwo31 2^31 - * fpr fpr_ptwo31m1 2^31-1 - * fpr fpr_mtwo31m1 -(2^31-1) - * fpr fpr_ptwo63m1 2^63-1 - * fpr fpr_mtwo63m1 -(2^63-1) - * fpr fpr_ptwo63 2^63 - */ -#include "fpr.h" - -/* ==================================================================== */ -/* - * RNG (rng.c). - * - * A PRNG based on ChaCha20 is implemented; it is seeded from a SHAKE256 - * context (flipped) and is used for bulk pseudorandom generation. - * A system-dependent seed generator is also provided. - */ - -/* - * Obtain a random seed from the system RNG. - * - * Returned value is 1 on success, 0 on error. - */ -int Zf(get_seed)(void *seed, size_t seed_len); - -/* - * Structure for a PRNG. This includes a large buffer so that values - * get generated in advance. The 'state' is used to keep the current - * PRNG algorithm state (contents depend on the selected algorithm). - * - * The unions with 'dummy_u64' are there to ensure proper alignment for - * 64-bit direct access. - */ -typedef struct { - union { - uint8_t d[512]; /* MUST be 512, exactly */ - uint64_t dummy_u64; - } buf; - size_t ptr; - union { - uint8_t d[256]; - uint64_t dummy_u64; - } state; - int type; -} prng; - -/* - * Instantiate a PRNG. That PRNG will feed over the provided SHAKE256 - * context (in "flipped" state) to obtain its initial state. 
- */ -void Zf(prng_init)(prng *p, inner_shake256_context *src); - -/* - * Refill the PRNG buffer. This is normally invoked automatically, and - * is declared here only so that prng_get_u64() may be inlined. - */ -void Zf(prng_refill)(prng *p); - -/* - * Get some bytes from a PRNG. - */ -void Zf(prng_get_bytes)(prng *p, void *dst, size_t len); - -/* - * Get a 64-bit random value from a PRNG. - */ -static inline uint64_t -prng_get_u64(prng *p) -{ - size_t u; - - /* - * If there are less than 9 bytes in the buffer, we refill it. - * This means that we may drop the last few bytes, but this allows - * for faster extraction code. Also, it means that we never leave - * an empty buffer. - */ - u = p->ptr; - if (u >= (sizeof p->buf.d) - 9) { - Zf(prng_refill)(p); - u = 0; - } - p->ptr = u + 8; - - /* - * On systems that use little-endian encoding and allow - * unaligned accesses, we can simply read the data where it is. - */ -#if FALCON_LE && FALCON_UNALIGNED // yyyLEU+1 - return *(uint64_t *)(p->buf.d + u); -#else // yyyLEU+0 - return (uint64_t)p->buf.d[u + 0] - | ((uint64_t)p->buf.d[u + 1] << 8) - | ((uint64_t)p->buf.d[u + 2] << 16) - | ((uint64_t)p->buf.d[u + 3] << 24) - | ((uint64_t)p->buf.d[u + 4] << 32) - | ((uint64_t)p->buf.d[u + 5] << 40) - | ((uint64_t)p->buf.d[u + 6] << 48) - | ((uint64_t)p->buf.d[u + 7] << 56); -#endif // yyyLEU- -} - -/* - * Get an 8-bit random value from a PRNG. - */ -static inline unsigned -prng_get_u8(prng *p) -{ - unsigned v; - - v = p->buf.d[p->ptr ++]; - if (p->ptr == sizeof p->buf.d) { - Zf(prng_refill)(p); - } - return v; -} - -/* ==================================================================== */ -/* - * FFT (falcon-fft.c). - * - * A real polynomial is represented as an array of N 'fpr' elements. - * The FFT representation of a real polynomial contains N/2 complex - * elements; each is stored as two real numbers, for the real and - * imaginary parts, respectively. See falcon-fft.c for details on the - * internal representation. 
- */ - -/* - * Compute FFT in-place: the source array should contain a real - * polynomial (N coefficients); its storage area is reused to store - * the FFT representation of that polynomial (N/2 complex numbers). - * - * 'logn' MUST lie between 1 and 10 (inclusive). - */ -void Zf(FFT)(fpr *f, unsigned logn); - -/* - * Compute the inverse FFT in-place: the source array should contain the - * FFT representation of a real polynomial (N/2 elements); the resulting - * real polynomial (N coefficients of type 'fpr') is written over the - * array. - * - * 'logn' MUST lie between 1 and 10 (inclusive). - */ -void Zf(iFFT)(fpr *f, unsigned logn); - -/* - * Add polynomial b to polynomial a. a and b MUST NOT overlap. This - * function works in both normal and FFT representations. - */ -void Zf(poly_add)(fpr *restrict a, const fpr *restrict b, unsigned logn); - -/* - * Subtract polynomial b from polynomial a. a and b MUST NOT overlap. This - * function works in both normal and FFT representations. - */ -void Zf(poly_sub)(fpr *restrict a, const fpr *restrict b, unsigned logn); - -/* - * Negate polynomial a. This function works in both normal and FFT - * representations. - */ -void Zf(poly_neg)(fpr *a, unsigned logn); - -/* - * Compute adjoint of polynomial a. This function works only in FFT - * representation. - */ -void Zf(poly_adj_fft)(fpr *a, unsigned logn); - -/* - * Multiply polynomial a with polynomial b. a and b MUST NOT overlap. - * This function works only in FFT representation. - */ -void Zf(poly_mul_fft)(fpr *restrict a, const fpr *restrict b, unsigned logn); - -/* - * Multiply polynomial a with the adjoint of polynomial b. a and b MUST NOT - * overlap. This function works only in FFT representation. - */ -void Zf(poly_muladj_fft)(fpr *restrict a, const fpr *restrict b, unsigned logn); - -/* - * Multiply polynomial with its own adjoint. This function works only in FFT - * representation. 
- */ -void Zf(poly_mulselfadj_fft)(fpr *a, unsigned logn); - -/* - * Multiply polynomial with a real constant. This function works in both - * normal and FFT representations. - */ -void Zf(poly_mulconst)(fpr *a, fpr x, unsigned logn); - -/* - * Divide polynomial a by polynomial b, modulo X^N+1 (FFT representation). - * a and b MUST NOT overlap. - */ -void Zf(poly_div_fft)(fpr *restrict a, const fpr *restrict b, unsigned logn); - -/* - * Given f and g (in FFT representation), compute 1/(f*adj(f)+g*adj(g)) - * (also in FFT representation). Since the result is auto-adjoint, all its - * coordinates in FFT representation are real; as such, only the first N/2 - * values of d[] are filled (the imaginary parts are skipped). - * - * Array d MUST NOT overlap with either a or b. - */ -void Zf(poly_invnorm2_fft)(fpr *restrict d, - const fpr *restrict a, const fpr *restrict b, unsigned logn); - -/* - * Given F, G, f and g (in FFT representation), compute F*adj(f)+G*adj(g) - * (also in FFT representation). Destination d MUST NOT overlap with - * any of the source arrays. - */ -void Zf(poly_add_muladj_fft)(fpr *restrict d, - const fpr *restrict F, const fpr *restrict G, - const fpr *restrict f, const fpr *restrict g, unsigned logn); - -/* - * Multiply polynomial a by polynomial b, where b is autoadjoint. Both - * a and b are in FFT representation. Since b is autoadjoint, all its - * FFT coefficients are real, and the array b contains only N/2 elements. - * a and b MUST NOT overlap. - */ -void Zf(poly_mul_autoadj_fft)(fpr *restrict a, - const fpr *restrict b, unsigned logn); - -/* - * Divide polynomial a by polynomial b, where b is autoadjoint. Both - * a and b are in FFT representation. Since b is autoadjoint, all its - * FFT coefficients are real, and the array b contains only N/2 elements. - * a and b MUST NOT overlap. 
- */ -void Zf(poly_div_autoadj_fft)(fpr *restrict a, - const fpr *restrict b, unsigned logn); - -/* - * Perform an LDL decomposition of an auto-adjoint matrix G, in FFT - * representation. On input, g00, g01 and g11 are provided (where the - * matrix G = [[g00, g01], [adj(g01), g11]]). On output, the d00, l10 - * and d11 values are written in g00, g01 and g11, respectively - * (with D = [[d00, 0], [0, d11]] and L = [[1, 0], [l10, 1]]). - * (In fact, d00 = g00, so the g00 operand is left unmodified.) - */ -void Zf(poly_LDL_fft)(const fpr *restrict g00, - fpr *restrict g01, fpr *restrict g11, unsigned logn); - -/* - * Perform an LDL decomposition of an auto-adjoint matrix G, in FFT - * representation. This is identical to poly_LDL_fft() except that - * g00, g01 and g11 are unmodified; the outputs d11 and l10 are written - * in two other separate buffers provided as extra parameters. - */ -void Zf(poly_LDLmv_fft)(fpr *restrict d11, fpr *restrict l10, - const fpr *restrict g00, const fpr *restrict g01, - const fpr *restrict g11, unsigned logn); - -/* - * Apply "split" operation on a polynomial in FFT representation: - * f = f0(x^2) + x*f1(x^2), for half-size polynomials f0 and f1 - * (polynomials modulo X^(N/2)+1). f0, f1 and f MUST NOT overlap. - */ -void Zf(poly_split_fft)(fpr *restrict f0, fpr *restrict f1, - const fpr *restrict f, unsigned logn); - -/* - * Apply "merge" operation on two polynomials in FFT representation: - * given f0 and f1, polynomials moduo X^(N/2)+1, this function computes - * f = f0(x^2) + x*f1(x^2), in FFT representation modulo X^N+1. - * f MUST NOT overlap with either f0 or f1. - */ -void Zf(poly_merge_fft)(fpr *restrict f, - const fpr *restrict f0, const fpr *restrict f1, unsigned logn); - -/* ==================================================================== */ -/* - * Key pair generation. - */ - -/* - * Required sizes of the temporary buffer (in bytes). 
- * - * This size is 28*2^logn bytes, except for degrees 2 and 4 (logn = 1 - * or 2) where it is slightly greater. - */ -#define FALCON_KEYGEN_TEMP_1 136 -#define FALCON_KEYGEN_TEMP_2 272 -#define FALCON_KEYGEN_TEMP_3 224 -#define FALCON_KEYGEN_TEMP_4 448 -#define FALCON_KEYGEN_TEMP_5 896 -#define FALCON_KEYGEN_TEMP_6 1792 -#define FALCON_KEYGEN_TEMP_7 3584 -#define FALCON_KEYGEN_TEMP_8 7168 -#define FALCON_KEYGEN_TEMP_9 14336 -#define FALCON_KEYGEN_TEMP_10 28672 - -/* - * Generate a new key pair. Randomness is extracted from the provided - * SHAKE256 context, which must have already been seeded and flipped. - * The tmp[] array must have suitable size (see FALCON_KEYGEN_TEMP_* - * macros) and be aligned for the uint32_t, uint64_t and fpr types. - * - * The private key elements are written in f, g, F and G, and the - * public key is written in h. Either or both of G and h may be NULL, - * in which case the corresponding element is not returned (they can - * be recomputed from f, g and F). - * - * tmp[] must have 64-bit alignment. - * This function uses floating-point rounding (see set_fpu_cw()). - */ -void Zf(keygen)(inner_shake256_context *rng, - int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h, - unsigned logn, uint8_t *tmp); - -/* ==================================================================== */ -/* - * Signature generation. - */ - -/* - * Expand a private key into the B0 matrix in FFT representation and - * the LDL tree. All the values are written in 'expanded_key', for - * a total of (8*logn+40)*2^logn bytes. - * - * The tmp[] array must have room for at least 48*2^logn bytes. - * - * tmp[] must have 64-bit alignment. - * This function uses floating-point rounding (see set_fpu_cw()). 
- */ -void Zf(expand_privkey)(fpr *restrict expanded_key, - const int8_t *f, const int8_t *g, const int8_t *F, const int8_t *G, - unsigned logn, uint8_t *restrict tmp); - -/* - * Compute a signature over the provided hashed message (hm); the - * signature value is one short vector. This function uses an - * expanded key (as generated by Zf(expand_privkey)()). - * - * The sig[] and hm[] buffers may overlap. - * - * On successful output, the start of the tmp[] buffer contains the s1 - * vector (as int16_t elements). - * - * The minimal size (in bytes) of tmp[] is 48*2^logn bytes. - * - * tmp[] must have 64-bit alignment. - * This function uses floating-point rounding (see set_fpu_cw()). - */ -void Zf(sign_tree)(int16_t *sig, inner_shake256_context *rng, - const fpr *restrict expanded_key, - const uint16_t *hm, unsigned logn, uint8_t *tmp); - -/* - * Compute a signature over the provided hashed message (hm); the - * signature value is one short vector. This function uses a raw - * key and dynamically recompute the B0 matrix and LDL tree; this - * saves RAM since there is no needed for an expanded key, but - * increases the signature cost. - * - * The sig[] and hm[] buffers may overlap. - * - * On successful output, the start of the tmp[] buffer contains the s1 - * vector (as int16_t elements). - * - * The minimal size (in bytes) of tmp[] is 72*2^logn bytes. - * - * tmp[] must have 64-bit alignment. - * This function uses floating-point rounding (see set_fpu_cw()). - */ -void Zf(sign_dyn)(int16_t *sig, inner_shake256_context *rng, - const int8_t *restrict f, const int8_t *restrict g, - const int8_t *restrict F, const int8_t *restrict G, - const uint16_t *hm, unsigned logn, uint8_t *tmp); - -/* - * Internal sampler engine. Exported for tests. - * - * sampler_context wraps around a source of random numbers (PRNG) and - * the sigma_min value (nominally dependent on the degree). 
- * - * sampler() takes as parameters: - * ctx pointer to the sampler_context structure - * mu center for the distribution - * isigma inverse of the distribution standard deviation - * It returns an integer sampled along the Gaussian distribution centered - * on mu and of standard deviation sigma = 1/isigma. - * - * gaussian0_sampler() takes as parameter a pointer to a PRNG, and - * returns an integer sampled along a half-Gaussian with standard - * deviation sigma0 = 1.8205 (center is 0, returned value is - * nonnegative). - */ - -typedef struct { - prng p; - fpr sigma_min; -} sampler_context; - -TARGET_AVX2 -int Zf(sampler)(void *ctx, fpr mu, fpr isigma); - -TARGET_AVX2 -int Zf(gaussian0_sampler)(prng *p); - -/* ==================================================================== */ - -#endif diff --git a/crypto_sign/falcon-1024/m4-ct/keygen.c b/crypto_sign/falcon-1024/m4-ct/keygen.c deleted file mode 100644 index cf7de008..00000000 --- a/crypto_sign/falcon-1024/m4-ct/keygen.c +++ /dev/null @@ -1,4301 +0,0 @@ -/* - * Falcon key pair generation. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include "inner.h" - -#define MKN(logn) ((size_t)1 << (logn)) - -/* ==================================================================== */ -/* - * Modular arithmetics. - * - * We implement a few functions for computing modulo a small integer p. - * - * All functions require that 2^30 < p < 2^31. Moreover, operands must - * be in the 0..p-1 range. - * - * Modular addition and subtraction work for all such p. - * - * Montgomery multiplication requires that p is odd, and must be provided - * with an additional value p0i = -1/p mod 2^31. See below for some basics - * on Montgomery multiplication. - * - * Division computes an inverse modulo p by an exponentiation (with - * exponent p-2): this works only if p is prime. Multiplication - * requirements also apply, i.e. p must be odd and p0i must be provided. - * - * The NTT and inverse NTT need all of the above, and also that - * p = 1 mod 2048. - * - * ----------------------------------------------------------------------- - * - * We use Montgomery representation with 31-bit values: - * - * Let R = 2^31 mod p. When 2^30 < p < 2^31, R = 2^31 - p. - * Montgomery representation of an integer x modulo p is x*R mod p. - * - * Montgomery multiplication computes (x*y)/R mod p for - * operands x and y. 
Therefore: - * - * - if operands are x*R and y*R (Montgomery representations of x and - * y), then Montgomery multiplication computes (x*R*y*R)/R = (x*y)*R - * mod p, which is the Montgomery representation of the product x*y; - * - * - if operands are x*R and y (or x and y*R), then Montgomery - * multiplication returns x*y mod p: mixed-representation - * multiplications yield results in normal representation. - * - * To convert to Montgomery representation, we multiply by R, which is done - * by Montgomery-multiplying by R^2. Stand-alone conversion back from - * Montgomery representation is Montgomery-multiplication by 1. - */ - -/* - * Precomputed small primes. Each element contains the following: - * - * p The prime itself. - * - * g A primitive root of phi = X^N+1 (in field Z_p). - * - * s The inverse of the product of all previous primes in the array, - * computed modulo p and in Montgomery representation. - * - * All primes are such that p = 1 mod 2048, and are lower than 2^31. They - * are listed in decreasing order. 
- */ - -typedef struct { - uint32_t p; - uint32_t g; - uint32_t s; -} small_prime; - -static const small_prime PRIMES[] = { - { 2147473409, 383167813, 10239 }, - { 2147389441, 211808905, 471403745 }, - { 2147387393, 37672282, 1329335065 }, - { 2147377153, 1977035326, 968223422 }, - { 2147358721, 1067163706, 132460015 }, - { 2147352577, 1606082042, 598693809 }, - { 2147346433, 2033915641, 1056257184 }, - { 2147338241, 1653770625, 421286710 }, - { 2147309569, 631200819, 1111201074 }, - { 2147297281, 2038364663, 1042003613 }, - { 2147295233, 1962540515, 19440033 }, - { 2147239937, 2100082663, 353296760 }, - { 2147235841, 1991153006, 1703918027 }, - { 2147217409, 516405114, 1258919613 }, - { 2147205121, 409347988, 1089726929 }, - { 2147196929, 927788991, 1946238668 }, - { 2147178497, 1136922411, 1347028164 }, - { 2147100673, 868626236, 701164723 }, - { 2147082241, 1897279176, 617820870 }, - { 2147074049, 1888819123, 158382189 }, - { 2147051521, 25006327, 522758543 }, - { 2147043329, 327546255, 37227845 }, - { 2147039233, 766324424, 1133356428 }, - { 2146988033, 1862817362, 73861329 }, - { 2146963457, 404622040, 653019435 }, - { 2146959361, 1936581214, 995143093 }, - { 2146938881, 1559770096, 634921513 }, - { 2146908161, 422623708, 1985060172 }, - { 2146885633, 1751189170, 298238186 }, - { 2146871297, 578919515, 291810829 }, - { 2146846721, 1114060353, 915902322 }, - { 2146834433, 2069565474, 47859524 }, - { 2146818049, 1552824584, 646281055 }, - { 2146775041, 1906267847, 1597832891 }, - { 2146756609, 1847414714, 1228090888 }, - { 2146744321, 1818792070, 1176377637 }, - { 2146738177, 1118066398, 1054971214 }, - { 2146736129, 52057278, 933422153 }, - { 2146713601, 592259376, 1406621510 }, - { 2146695169, 263161877, 1514178701 }, - { 2146656257, 685363115, 384505091 }, - { 2146650113, 927727032, 537575289 }, - { 2146646017, 52575506, 1799464037 }, - { 2146643969, 1276803876, 1348954416 }, - { 2146603009, 814028633, 1521547704 }, - { 2146572289, 1846678872, 1310832121 }, - 
{ 2146547713, 919368090, 1019041349 }, - { 2146508801, 671847612, 38582496 }, - { 2146492417, 283911680, 532424562 }, - { 2146490369, 1780044827, 896447978 }, - { 2146459649, 327980850, 1327906900 }, - { 2146447361, 1310561493, 958645253 }, - { 2146441217, 412148926, 287271128 }, - { 2146437121, 293186449, 2009822534 }, - { 2146430977, 179034356, 1359155584 }, - { 2146418689, 1517345488, 1790248672 }, - { 2146406401, 1615820390, 1584833571 }, - { 2146404353, 826651445, 607120498 }, - { 2146379777, 3816988, 1897049071 }, - { 2146363393, 1221409784, 1986921567 }, - { 2146355201, 1388081168, 849968120 }, - { 2146336769, 1803473237, 1655544036 }, - { 2146312193, 1023484977, 273671831 }, - { 2146293761, 1074591448, 467406983 }, - { 2146283521, 831604668, 1523950494 }, - { 2146203649, 712865423, 1170834574 }, - { 2146154497, 1764991362, 1064856763 }, - { 2146142209, 627386213, 1406840151 }, - { 2146127873, 1638674429, 2088393537 }, - { 2146099201, 1516001018, 690673370 }, - { 2146093057, 1294931393, 315136610 }, - { 2146091009, 1942399533, 973539425 }, - { 2146078721, 1843461814, 2132275436 }, - { 2146060289, 1098740778, 360423481 }, - { 2146048001, 1617213232, 1951981294 }, - { 2146041857, 1805783169, 2075683489 }, - { 2146019329, 272027909, 1753219918 }, - { 2145986561, 1206530344, 2034028118 }, - { 2145976321, 1243769360, 1173377644 }, - { 2145964033, 887200839, 1281344586 }, - { 2145906689, 1651026455, 906178216 }, - { 2145875969, 1673238256, 1043521212 }, - { 2145871873, 1226591210, 1399796492 }, - { 2145841153, 1465353397, 1324527802 }, - { 2145832961, 1150638905, 554084759 }, - { 2145816577, 221601706, 427340863 }, - { 2145785857, 608896761, 316590738 }, - { 2145755137, 1712054942, 1684294304 }, - { 2145742849, 1302302867, 724873116 }, - { 2145728513, 516717693, 431671476 }, - { 2145699841, 524575579, 1619722537 }, - { 2145691649, 1925625239, 982974435 }, - { 2145687553, 463795662, 1293154300 }, - { 2145673217, 771716636, 881778029 }, - { 2145630209, 1509556977, 
837364988 }, - { 2145595393, 229091856, 851648427 }, - { 2145587201, 1796903241, 635342424 }, - { 2145525761, 715310882, 1677228081 }, - { 2145495041, 1040930522, 200685896 }, - { 2145466369, 949804237, 1809146322 }, - { 2145445889, 1673903706, 95316881 }, - { 2145390593, 806941852, 1428671135 }, - { 2145372161, 1402525292, 159350694 }, - { 2145361921, 2124760298, 1589134749 }, - { 2145359873, 1217503067, 1561543010 }, - { 2145355777, 338341402, 83865711 }, - { 2145343489, 1381532164, 641430002 }, - { 2145325057, 1883895478, 1528469895 }, - { 2145318913, 1335370424, 65809740 }, - { 2145312769, 2000008042, 1919775760 }, - { 2145300481, 961450962, 1229540578 }, - { 2145282049, 910466767, 1964062701 }, - { 2145232897, 816527501, 450152063 }, - { 2145218561, 1435128058, 1794509700 }, - { 2145187841, 33505311, 1272467582 }, - { 2145181697, 269767433, 1380363849 }, - { 2145175553, 56386299, 1316870546 }, - { 2145079297, 2106880293, 1391797340 }, - { 2145021953, 1347906152, 720510798 }, - { 2145015809, 206769262, 1651459955 }, - { 2145003521, 1885513236, 1393381284 }, - { 2144960513, 1810381315, 31937275 }, - { 2144944129, 1306487838, 2019419520 }, - { 2144935937, 37304730, 1841489054 }, - { 2144894977, 1601434616, 157985831 }, - { 2144888833, 98749330, 2128592228 }, - { 2144880641, 1772327002, 2076128344 }, - { 2144864257, 1404514762, 2029969964 }, - { 2144827393, 801236594, 406627220 }, - { 2144806913, 349217443, 1501080290 }, - { 2144796673, 1542656776, 2084736519 }, - { 2144778241, 1210734884, 1746416203 }, - { 2144759809, 1146598851, 716464489 }, - { 2144757761, 286328400, 1823728177 }, - { 2144729089, 1347555695, 1836644881 }, - { 2144727041, 1795703790, 520296412 }, - { 2144696321, 1302475157, 852964281 }, - { 2144667649, 1075877614, 504992927 }, - { 2144573441, 198765808, 1617144982 }, - { 2144555009, 321528767, 155821259 }, - { 2144550913, 814139516, 1819937644 }, - { 2144536577, 571143206, 962942255 }, - { 2144524289, 1746733766, 2471321 }, - { 2144512001, 
1821415077, 124190939 }, - { 2144468993, 917871546, 1260072806 }, - { 2144458753, 378417981, 1569240563 }, - { 2144421889, 175229668, 1825620763 }, - { 2144409601, 1699216963, 351648117 }, - { 2144370689, 1071885991, 958186029 }, - { 2144348161, 1763151227, 540353574 }, - { 2144335873, 1060214804, 919598847 }, - { 2144329729, 663515846, 1448552668 }, - { 2144327681, 1057776305, 590222840 }, - { 2144309249, 1705149168, 1459294624 }, - { 2144296961, 325823721, 1649016934 }, - { 2144290817, 738775789, 447427206 }, - { 2144243713, 962347618, 893050215 }, - { 2144237569, 1655257077, 900860862 }, - { 2144161793, 242206694, 1567868672 }, - { 2144155649, 769415308, 1247993134 }, - { 2144137217, 320492023, 515841070 }, - { 2144120833, 1639388522, 770877302 }, - { 2144071681, 1761785233, 964296120 }, - { 2144065537, 419817825, 204564472 }, - { 2144028673, 666050597, 2091019760 }, - { 2144010241, 1413657615, 1518702610 }, - { 2143952897, 1238327946, 475672271 }, - { 2143940609, 307063413, 1176750846 }, - { 2143918081, 2062905559, 786785803 }, - { 2143899649, 1338112849, 1562292083 }, - { 2143891457, 68149545, 87166451 }, - { 2143885313, 921750778, 394460854 }, - { 2143854593, 719766593, 133877196 }, - { 2143836161, 1149399850, 1861591875 }, - { 2143762433, 1848739366, 1335934145 }, - { 2143756289, 1326674710, 102999236 }, - { 2143713281, 808061791, 1156900308 }, - { 2143690753, 388399459, 1926468019 }, - { 2143670273, 1427891374, 1756689401 }, - { 2143666177, 1912173949, 986629565 }, - { 2143645697, 2041160111, 371842865 }, - { 2143641601, 1279906897, 2023974350 }, - { 2143635457, 720473174, 1389027526 }, - { 2143621121, 1298309455, 1732632006 }, - { 2143598593, 1548762216, 1825417506 }, - { 2143567873, 620475784, 1073787233 }, - { 2143561729, 1932954575, 949167309 }, - { 2143553537, 354315656, 1652037534 }, - { 2143541249, 577424288, 1097027618 }, - { 2143531009, 357862822, 478640055 }, - { 2143522817, 2017706025, 1550531668 }, - { 2143506433, 2078127419, 1824320165 }, - { 
2143488001, 613475285, 1604011510 }, - { 2143469569, 1466594987, 502095196 }, - { 2143426561, 1115430331, 1044637111 }, - { 2143383553, 9778045, 1902463734 }, - { 2143377409, 1557401276, 2056861771 }, - { 2143363073, 652036455, 1965915971 }, - { 2143260673, 1464581171, 1523257541 }, - { 2143246337, 1876119649, 764541916 }, - { 2143209473, 1614992673, 1920672844 }, - { 2143203329, 981052047, 2049774209 }, - { 2143160321, 1847355533, 728535665 }, - { 2143129601, 965558457, 603052992 }, - { 2143123457, 2140817191, 8348679 }, - { 2143100929, 1547263683, 694209023 }, - { 2143092737, 643459066, 1979934533 }, - { 2143082497, 188603778, 2026175670 }, - { 2143062017, 1657329695, 377451099 }, - { 2143051777, 114967950, 979255473 }, - { 2143025153, 1698431342, 1449196896 }, - { 2143006721, 1862741675, 1739650365 }, - { 2142996481, 756660457, 996160050 }, - { 2142976001, 927864010, 1166847574 }, - { 2142965761, 905070557, 661974566 }, - { 2142916609, 40932754, 1787161127 }, - { 2142892033, 1987985648, 675335382 }, - { 2142885889, 797497211, 1323096997 }, - { 2142871553, 2068025830, 1411877159 }, - { 2142861313, 1217177090, 1438410687 }, - { 2142830593, 409906375, 1767860634 }, - { 2142803969, 1197788993, 359782919 }, - { 2142785537, 643817365, 513932862 }, - { 2142779393, 1717046338, 218943121 }, - { 2142724097, 89336830, 416687049 }, - { 2142707713, 5944581, 1356813523 }, - { 2142658561, 887942135, 2074011722 }, - { 2142638081, 151851972, 1647339939 }, - { 2142564353, 1691505537, 1483107336 }, - { 2142533633, 1989920200, 1135938817 }, - { 2142529537, 959263126, 1531961857 }, - { 2142527489, 453251129, 1725566162 }, - { 2142502913, 1536028102, 182053257 }, - { 2142498817, 570138730, 701443447 }, - { 2142416897, 326965800, 411931819 }, - { 2142363649, 1675665410, 1517191733 }, - { 2142351361, 968529566, 1575712703 }, - { 2142330881, 1384953238, 1769087884 }, - { 2142314497, 1977173242, 1833745524 }, - { 2142289921, 95082313, 1714775493 }, - { 2142283777, 109377615, 1070584533 
}, - { 2142277633, 16960510, 702157145 }, - { 2142263297, 553850819, 431364395 }, - { 2142208001, 241466367, 2053967982 }, - { 2142164993, 1795661326, 1031836848 }, - { 2142097409, 1212530046, 712772031 }, - { 2142087169, 1763869720, 822276067 }, - { 2142078977, 644065713, 1765268066 }, - { 2142074881, 112671944, 643204925 }, - { 2142044161, 1387785471, 1297890174 }, - { 2142025729, 783885537, 1000425730 }, - { 2142011393, 905662232, 1679401033 }, - { 2141974529, 799788433, 468119557 }, - { 2141943809, 1932544124, 449305555 }, - { 2141933569, 1527403256, 841867925 }, - { 2141931521, 1247076451, 743823916 }, - { 2141902849, 1199660531, 401687910 }, - { 2141890561, 150132350, 1720336972 }, - { 2141857793, 1287438162, 663880489 }, - { 2141833217, 618017731, 1819208266 }, - { 2141820929, 999578638, 1403090096 }, - { 2141786113, 81834325, 1523542501 }, - { 2141771777, 120001928, 463556492 }, - { 2141759489, 122455485, 2124928282 }, - { 2141749249, 141986041, 940339153 }, - { 2141685761, 889088734, 477141499 }, - { 2141673473, 324212681, 1122558298 }, - { 2141669377, 1175806187, 1373818177 }, - { 2141655041, 1113654822, 296887082 }, - { 2141587457, 991103258, 1585913875 }, - { 2141583361, 1401451409, 1802457360 }, - { 2141575169, 1571977166, 712760980 }, - { 2141546497, 1107849376, 1250270109 }, - { 2141515777, 196544219, 356001130 }, - { 2141495297, 1733571506, 1060744866 }, - { 2141483009, 321552363, 1168297026 }, - { 2141458433, 505818251, 733225819 }, - { 2141360129, 1026840098, 948342276 }, - { 2141325313, 945133744, 2129965998 }, - { 2141317121, 1871100260, 1843844634 }, - { 2141286401, 1790639498, 1750465696 }, - { 2141267969, 1376858592, 186160720 }, - { 2141255681, 2129698296, 1876677959 }, - { 2141243393, 2138900688, 1340009628 }, - { 2141214721, 1933049835, 1087819477 }, - { 2141212673, 1898664939, 1786328049 }, - { 2141202433, 990234828, 940682169 }, - { 2141175809, 1406392421, 993089586 }, - { 2141165569, 1263518371, 289019479 }, - { 2141073409, 1485624211, 
507864514 }, - { 2141052929, 1885134788, 311252465 }, - { 2141040641, 1285021247, 280941862 }, - { 2141028353, 1527610374, 375035110 }, - { 2141011969, 1400626168, 164696620 }, - { 2140999681, 632959608, 966175067 }, - { 2140997633, 2045628978, 1290889438 }, - { 2140993537, 1412755491, 375366253 }, - { 2140942337, 719477232, 785367828 }, - { 2140925953, 45224252, 836552317 }, - { 2140917761, 1157376588, 1001839569 }, - { 2140887041, 278480752, 2098732796 }, - { 2140837889, 1663139953, 924094810 }, - { 2140788737, 802501511, 2045368990 }, - { 2140766209, 1820083885, 1800295504 }, - { 2140764161, 1169561905, 2106792035 }, - { 2140696577, 127781498, 1885987531 }, - { 2140684289, 16014477, 1098116827 }, - { 2140653569, 665960598, 1796728247 }, - { 2140594177, 1043085491, 377310938 }, - { 2140579841, 1732838211, 1504505945 }, - { 2140569601, 302071939, 358291016 }, - { 2140567553, 192393733, 1909137143 }, - { 2140557313, 406595731, 1175330270 }, - { 2140549121, 1748850918, 525007007 }, - { 2140477441, 499436566, 1031159814 }, - { 2140469249, 1886004401, 1029951320 }, - { 2140426241, 1483168100, 1676273461 }, - { 2140420097, 1779917297, 846024476 }, - { 2140413953, 522948893, 1816354149 }, - { 2140383233, 1931364473, 1296921241 }, - { 2140366849, 1917356555, 147196204 }, - { 2140354561, 16466177, 1349052107 }, - { 2140348417, 1875366972, 1860485634 }, - { 2140323841, 456498717, 1790256483 }, - { 2140321793, 1629493973, 150031888 }, - { 2140315649, 1904063898, 395510935 }, - { 2140280833, 1784104328, 831417909 }, - { 2140250113, 256087139, 697349101 }, - { 2140229633, 388553070, 243875754 }, - { 2140223489, 747459608, 1396270850 }, - { 2140200961, 507423743, 1895572209 }, - { 2140162049, 580106016, 2045297469 }, - { 2140149761, 712426444, 785217995 }, - { 2140137473, 1441607584, 536866543 }, - { 2140119041, 346538902, 1740434653 }, - { 2140090369, 282642885, 21051094 }, - { 2140076033, 1407456228, 319910029 }, - { 2140047361, 1619330500, 1488632070 }, - { 2140041217, 
2089408064, 2012026134 }, - { 2140008449, 1705524800, 1613440760 }, - { 2139924481, 1846208233, 1280649481 }, - { 2139906049, 989438755, 1185646076 }, - { 2139867137, 1522314850, 372783595 }, - { 2139842561, 1681587377, 216848235 }, - { 2139826177, 2066284988, 1784999464 }, - { 2139824129, 480888214, 1513323027 }, - { 2139789313, 847937200, 858192859 }, - { 2139783169, 1642000434, 1583261448 }, - { 2139770881, 940699589, 179702100 }, - { 2139768833, 315623242, 964612676 }, - { 2139666433, 331649203, 764666914 }, - { 2139641857, 2118730799, 1313764644 }, - { 2139635713, 519149027, 519212449 }, - { 2139598849, 1526413634, 1769667104 }, - { 2139574273, 551148610, 820739925 }, - { 2139568129, 1386800242, 472447405 }, - { 2139549697, 813760130, 1412328531 }, - { 2139537409, 1615286260, 1609362979 }, - { 2139475969, 1352559299, 1696720421 }, - { 2139455489, 1048691649, 1584935400 }, - { 2139432961, 836025845, 950121150 }, - { 2139424769, 1558281165, 1635486858 }, - { 2139406337, 1728402143, 1674423301 }, - { 2139396097, 1727715782, 1483470544 }, - { 2139383809, 1092853491, 1741699084 }, - { 2139369473, 690776899, 1242798709 }, - { 2139351041, 1768782380, 2120712049 }, - { 2139334657, 1739968247, 1427249225 }, - { 2139332609, 1547189119, 623011170 }, - { 2139310081, 1346827917, 1605466350 }, - { 2139303937, 369317948, 828392831 }, - { 2139301889, 1560417239, 1788073219 }, - { 2139283457, 1303121623, 595079358 }, - { 2139248641, 1354555286, 573424177 }, - { 2139240449, 60974056, 885781403 }, - { 2139222017, 355573421, 1221054839 }, - { 2139215873, 566477826, 1724006500 }, - { 2139150337, 871437673, 1609133294 }, - { 2139144193, 1478130914, 1137491905 }, - { 2139117569, 1854880922, 964728507 }, - { 2139076609, 202405335, 756508944 }, - { 2139062273, 1399715741, 884826059 }, - { 2139045889, 1051045798, 1202295476 }, - { 2139033601, 1707715206, 632234634 }, - { 2139006977, 2035853139, 231626690 }, - { 2138951681, 183867876, 838350879 }, - { 2138945537, 1403254661, 404460202 
}, - { 2138920961, 310865011, 1282911681 }, - { 2138910721, 1328496553, 103472415 }, - { 2138904577, 78831681, 993513549 }, - { 2138902529, 1319697451, 1055904361 }, - { 2138816513, 384338872, 1706202469 }, - { 2138810369, 1084868275, 405677177 }, - { 2138787841, 401181788, 1964773901 }, - { 2138775553, 1850532988, 1247087473 }, - { 2138767361, 874261901, 1576073565 }, - { 2138757121, 1187474742, 993541415 }, - { 2138748929, 1782458888, 1043206483 }, - { 2138744833, 1221500487, 800141243 }, - { 2138738689, 413465368, 1450660558 }, - { 2138695681, 739045140, 342611472 }, - { 2138658817, 1355845756, 672674190 }, - { 2138644481, 608379162, 1538874380 }, - { 2138632193, 1444914034, 686911254 }, - { 2138607617, 484707818, 1435142134 }, - { 2138591233, 539460669, 1290458549 }, - { 2138572801, 2093538990, 2011138646 }, - { 2138552321, 1149786988, 1076414907 }, - { 2138546177, 840688206, 2108985273 }, - { 2138533889, 209669619, 198172413 }, - { 2138523649, 1975879426, 1277003968 }, - { 2138490881, 1351891144, 1976858109 }, - { 2138460161, 1817321013, 1979278293 }, - { 2138429441, 1950077177, 203441928 }, - { 2138400769, 908970113, 628395069 }, - { 2138398721, 219890864, 758486760 }, - { 2138376193, 1306654379, 977554090 }, - { 2138351617, 298822498, 2004708503 }, - { 2138337281, 441457816, 1049002108 }, - { 2138320897, 1517731724, 1442269609 }, - { 2138290177, 1355911197, 1647139103 }, - { 2138234881, 531313247, 1746591962 }, - { 2138214401, 1899410930, 781416444 }, - { 2138202113, 1813477173, 1622508515 }, - { 2138191873, 1086458299, 1025408615 }, - { 2138183681, 1998800427, 827063290 }, - { 2138173441, 1921308898, 749670117 }, - { 2138103809, 1620902804, 2126787647 }, - { 2138099713, 828647069, 1892961817 }, - { 2138085377, 179405355, 1525506535 }, - { 2138060801, 615683235, 1259580138 }, - { 2138044417, 2030277840, 1731266562 }, - { 2138042369, 2087222316, 1627902259 }, - { 2138032129, 126388712, 1108640984 }, - { 2138011649, 715026550, 1017980050 }, - { 2137993217, 
1693714349, 1351778704 }, - { 2137888769, 1289762259, 1053090405 }, - { 2137853953, 199991890, 1254192789 }, - { 2137833473, 941421685, 896995556 }, - { 2137817089, 750416446, 1251031181 }, - { 2137792513, 798075119, 368077456 }, - { 2137786369, 878543495, 1035375025 }, - { 2137767937, 9351178, 1156563902 }, - { 2137755649, 1382297614, 1686559583 }, - { 2137724929, 1345472850, 1681096331 }, - { 2137704449, 834666929, 630551727 }, - { 2137673729, 1646165729, 1892091571 }, - { 2137620481, 778943821, 48456461 }, - { 2137618433, 1730837875, 1713336725 }, - { 2137581569, 805610339, 1378891359 }, - { 2137538561, 204342388, 1950165220 }, - { 2137526273, 1947629754, 1500789441 }, - { 2137516033, 719902645, 1499525372 }, - { 2137491457, 230451261, 556382829 }, - { 2137440257, 979573541, 412760291 }, - { 2137374721, 927841248, 1954137185 }, - { 2137362433, 1243778559, 861024672 }, - { 2137313281, 1341338501, 980638386 }, - { 2137311233, 937415182, 1793212117 }, - { 2137255937, 795331324, 1410253405 }, - { 2137243649, 150756339, 1966999887 }, - { 2137182209, 163346914, 1939301431 }, - { 2137171969, 1952552395, 758913141 }, - { 2137159681, 570788721, 218668666 }, - { 2137147393, 1896656810, 2045670345 }, - { 2137141249, 358493842, 518199643 }, - { 2137139201, 1505023029, 674695848 }, - { 2137133057, 27911103, 830956306 }, - { 2137122817, 439771337, 1555268614 }, - { 2137116673, 790988579, 1871449599 }, - { 2137110529, 432109234, 811805080 }, - { 2137102337, 1357900653, 1184997641 }, - { 2137098241, 515119035, 1715693095 }, - { 2137090049, 408575203, 2085660657 }, - { 2137085953, 2097793407, 1349626963 }, - { 2137055233, 1556739954, 1449960883 }, - { 2137030657, 1545758650, 1369303716 }, - { 2136987649, 332602570, 103875114 }, - { 2136969217, 1499989506, 1662964115 }, - { 2136924161, 857040753, 4738842 }, - { 2136895489, 1948872712, 570436091 }, - { 2136893441, 58969960, 1568349634 }, - { 2136887297, 2127193379, 273612548 }, - { 2136850433, 111208983, 1181257116 }, - { 
2136809473, 1627275942, 1680317971 }, - { 2136764417, 1574888217, 14011331 }, - { 2136741889, 14011055, 1129154251 }, - { 2136727553, 35862563, 1838555253 }, - { 2136721409, 310235666, 1363928244 }, - { 2136698881, 1612429202, 1560383828 }, - { 2136649729, 1138540131, 800014364 }, - { 2136606721, 602323503, 1433096652 }, - { 2136563713, 182209265, 1919611038 }, - { 2136555521, 324156477, 165591039 }, - { 2136549377, 195513113, 217165345 }, - { 2136526849, 1050768046, 939647887 }, - { 2136508417, 1886286237, 1619926572 }, - { 2136477697, 609647664, 35065157 }, - { 2136471553, 679352216, 1452259468 }, - { 2136457217, 128630031, 824816521 }, - { 2136422401, 19787464, 1526049830 }, - { 2136420353, 698316836, 1530623527 }, - { 2136371201, 1651862373, 1804812805 }, - { 2136334337, 326596005, 336977082 }, - { 2136322049, 63253370, 1904972151 }, - { 2136297473, 312176076, 172182411 }, - { 2136248321, 381261841, 369032670 }, - { 2136242177, 358688773, 1640007994 }, - { 2136229889, 512677188, 75585225 }, - { 2136219649, 2095003250, 1970086149 }, - { 2136207361, 1909650722, 537760675 }, - { 2136176641, 1334616195, 1533487619 }, - { 2136158209, 2096285632, 1793285210 }, - { 2136143873, 1897347517, 293843959 }, - { 2136133633, 923586222, 1022655978 }, - { 2136096769, 1464868191, 1515074410 }, - { 2136094721, 2020679520, 2061636104 }, - { 2136076289, 290798503, 1814726809 }, - { 2136041473, 156415894, 1250757633 }, - { 2135996417, 297459940, 1132158924 }, - { 2135955457, 538755304, 1688831340 }, - { 0, 0, 0 } -}; - -/* - * Reduce a small signed integer modulo a small prime. The source - * value x MUST be such that -p < x < p. - */ -static inline uint32_t -modp_set(int32_t x, uint32_t p) -{ - uint32_t w; - - w = (uint32_t)x; - w += p & -(w >> 31); - return w; -} - -/* - * Normalize a modular integer around 0. - */ -static inline int32_t -modp_norm(uint32_t x, uint32_t p) -{ - return (int32_t)(x - (p & (((x - ((p + 1) >> 1)) >> 31) - 1))); -} - -/* - * Compute -1/p mod 2^31. 
/*
 * Compute -1/p mod 2^31. This works for all odd integers p that fit
 * on 31 bits.
 */
static uint32_t
modp_ninv31(uint32_t p)
{
	uint32_t v;

	/*
	 * Hensel lifting of the 2-adic inverse: each multiplication
	 * doubles the number of correct low bits. For odd p, 2 - p is
	 * an inverse to 2 bits; four steps reach more than 31 bits.
	 */
	v = 2 - p;
	v *= 2 - p * v;
	v *= 2 - p * v;
	v *= 2 - p * v;
	v *= 2 - p * v;
	return (uint32_t)0x7FFFFFFF & -v;
}

/*
 * Compute R = 2^31 mod p.
 */
static inline uint32_t
modp_R(uint32_t p)
{
	/*
	 * Since 2^30 < p < 2^31, the value 2^31 mod p is simply
	 * 2^31 - p.
	 */
	return ((uint32_t)1 << 31) - p;
}

/*
 * Addition modulo p. Operands must already be reduced (0..p-1).
 */
static inline uint32_t
modp_add(uint32_t a, uint32_t b, uint32_t p)
{
	uint32_t r;

	r = a + b - p;
	/* If the subtraction went below zero, bit 31 is set; add p back. */
	r += p & -(r >> 31);
	return r;
}

/*
 * Subtraction modulo p. Operands must already be reduced (0..p-1).
 */
static inline uint32_t
modp_sub(uint32_t a, uint32_t b, uint32_t p)
{
	uint32_t r;

	r = a - b;
	r += p & -(r >> 31);
	return r;
}

/*
 * Halving modulo p.
 */
/* unused
static inline uint32_t
modp_half(uint32_t a, uint32_t p)
{
	a += p & -(a & 1);
	return a >> 1;
}
*/

/*
 * Montgomery multiplication modulo p: returns a*b/(2^31) mod p.
 * The 'p0i' value is -1/p mod 2^31. It is required that p is an
 * odd integer.
 */
static inline uint32_t
modp_montymul(uint32_t a, uint32_t b, uint32_t p, uint32_t p0i)
{
	uint64_t t, u;
	uint32_t r;

	t = (uint64_t)a * (uint64_t)b;
	/* Add the multiple of p that clears the low 31 bits of t. */
	u = ((t * p0i) & (uint64_t)0x7FFFFFFF) * p;
	r = (uint32_t)((t + u) >> 31) - p;
	r += p & -(r >> 31);
	return r;
}

/*
 * Compute R2 = 2^62 mod p.
 */
static uint32_t
modp_R2(uint32_t p, uint32_t p0i)
{
	uint32_t z;

	/*
	 * Start from 2^31 mod p (the Montgomery representation of 1)
	 * and double it, yielding 2^32 mod p.
	 */
	z = modp_R(p);
	z = modp_add(z, z, p);

	/*
	 * Each Montgomery squaring maps 2^e to 2^(2e-31); five of them
	 * take the exponent 32 to 33, 35, 39, 47 and finally 63.
	 */
	z = modp_montymul(z, z, p, p0i);
	z = modp_montymul(z, z, p, p0i);
	z = modp_montymul(z, z, p, p0i);
	z = modp_montymul(z, z, p, p0i);
	z = modp_montymul(z, z, p, p0i);

	/*
	 * Halve the value mod p to get 2^62.
	 */
	z = (z + (p & -(z & 1))) >> 1;
	return z;
}

/*
 * Compute 2^(31*x) modulo p. This works for integers x up to 2^11.
 * p must be prime such that 2^30 < p < 2^31; p0i must be equal to
 * -1/p mod 2^31; R2 must be equal to 2^62 mod p.
 */
static inline uint32_t
modp_Rx(unsigned x, uint32_t p, uint32_t p0i, uint32_t R2)
{
	int i;
	uint32_t g, z;

	/*
	 * 2^(31*x) = (2^31)*(2^(31*(x-1))); i.e. we want the Montgomery
	 * representation of (2^31)^e mod p, where e = x-1.
	 * R2 is 2^31 in Montgomery representation; raise it to the
	 * power e with a standard square-and-multiply.
	 */
	x --;
	g = R2;
	z = modp_R(p);
	for (i = 0; (1U << i) <= x; i ++) {
		if ((x & (1U << i)) != 0) {
			z = modp_montymul(z, g, p, p0i);
		}
		g = modp_montymul(g, g, p, p0i);
	}
	return z;
}

/*
 * Division modulo p. If the divisor (b) is 0, then 0 is returned.
 * This function computes proper results only when p is prime.
 * Parameters:
 *   a     dividend
 *   b     divisor
 *   p     odd prime modulus
 *   p0i   -1/p mod 2^31
 *   R     2^31 mod p
 */
static uint32_t
modp_div(uint32_t a, uint32_t b, uint32_t p, uint32_t p0i, uint32_t R)
{
	uint32_t z, e;
	int i;

	/*
	 * Compute b^(p-2) mod p (Fermat's little theorem), with a
	 * constant-time ladder: the conditional multiply is applied
	 * through a mask so that the operation sequence does not
	 * depend on the exponent bits.
	 */
	e = p - 2;
	z = R;
	for (i = 30; i >= 0; i --) {
		uint32_t z2;

		z = modp_montymul(z, z, p, p0i);
		z2 = modp_montymul(z, b, p, p0i);
		z ^= (z ^ z2) & -(uint32_t)((e >> i) & 1);
	}

	/*
	 * The loop above just assumed that b was in Montgomery
	 * representation, i.e. really contained b*R; under that
	 * assumption, it returns 1/b in Montgomery representation,
	 * which is R/b. But we gave it b in normal representation,
	 * so the loop really returned R/(b/R) = R^2/b.
	 *
	 * We want a/b, so we need one Montgomery multiplication with a,
	 * which also removes one of the R factors, and another such
	 * multiplication to remove the second R factor.
	 */
	z = modp_montymul(z, 1, p, p0i);
	return modp_montymul(a, z, p, p0i);
}
/*
 * Bit-reversal index table: REV10[i] is the 10-bit value obtained by
 * reversing the order of the bits of i (0 <= i < 1024).
 */
static const uint16_t REV10[] = {
	   0, 512, 256, 768, 128, 640, 384, 896,  64, 576, 320, 832,
	 192, 704, 448, 960,  32, 544, 288, 800, 160, 672, 416, 928,
	  96, 608, 352, 864, 224, 736, 480, 992,  16, 528, 272, 784,
	 144, 656, 400, 912,  80, 592, 336, 848, 208, 720, 464, 976,
	  48, 560, 304, 816, 176, 688, 432, 944, 112, 624, 368, 880,
	 240, 752, 496, 1008,  8, 520, 264, 776, 136, 648, 392, 904,
	  72, 584, 328, 840, 200, 712, 456, 968,  40, 552, 296, 808,
	 168, 680, 424, 936, 104, 616, 360, 872, 232, 744, 488, 1000,
	  24, 536, 280, 792, 152, 664, 408, 920,  88, 600, 344, 856,
	 216, 728, 472, 984,  56, 568, 312, 824, 184, 696, 440, 952,
	 120, 632, 376, 888, 248, 760, 504, 1016,  4, 516, 260, 772,
	 132, 644, 388, 900,  68, 580, 324, 836, 196, 708, 452, 964,
	  36, 548, 292, 804, 164, 676, 420, 932, 100, 612, 356, 868,
	 228, 740, 484, 996,  20, 532, 276, 788, 148, 660, 404, 916,
	  84, 596, 340, 852, 212, 724, 468, 980,  52, 564, 308, 820,
	 180, 692, 436, 948, 116, 628, 372, 884, 244, 756, 500, 1012,
	  12, 524, 268, 780, 140, 652, 396, 908,  76, 588, 332, 844,
	 204, 716, 460, 972,  44, 556, 300, 812, 172, 684, 428, 940,
	 108, 620, 364, 876, 236, 748, 492, 1004,  28, 540, 284, 796,
	 156, 668, 412, 924,  92, 604, 348, 860, 220, 732, 476, 988,
	  60, 572, 316, 828, 188, 700, 444, 956, 124, 636, 380, 892,
	 252, 764, 508, 1020,  2, 514, 258, 770, 130, 642, 386, 898,
	  66, 578, 322, 834, 194, 706, 450, 962,  34, 546, 290, 802,
	 162, 674, 418, 930,  98, 610, 354, 866, 226, 738, 482, 994,
	  18, 530, 274, 786, 146, 658, 402, 914,  82, 594, 338, 850,
	 210, 722, 466, 978,  50, 562, 306, 818, 178, 690, 434, 946,
	 114, 626, 370, 882, 242, 754, 498, 1010,  10, 522, 266, 778,
	 138, 650, 394, 906,  74, 586, 330, 842, 202, 714, 458, 970,
	  42, 554, 298, 810, 170, 682, 426, 938, 106, 618, 362, 874,
	 234, 746, 490, 1002,  26, 538, 282, 794, 154, 666, 410, 922,
	  90, 602, 346, 858, 218, 730, 474, 986,  58, 570, 314, 826,
	 186, 698, 442, 954, 122, 634, 378, 890, 250, 762, 506, 1018,
	   6, 518, 262, 774, 134, 646, 390, 902,  70, 582, 326, 838,
	 198, 710, 454, 966,  38, 550, 294, 806, 166, 678, 422, 934,
	 102, 614, 358, 870, 230, 742, 486, 998,  22, 534, 278, 790,
	 150, 662, 406, 918,  86, 598, 342, 854, 214, 726, 470, 982,
	  54, 566, 310, 822, 182, 694, 438, 950, 118, 630, 374, 886,
	 246, 758, 502, 1014, 14, 526, 270, 782, 142, 654, 398, 910,
	  78, 590, 334, 846, 206, 718, 462, 974,  46, 558, 302, 814,
	 174, 686, 430, 942, 110, 622, 366, 878, 238, 750, 494, 1006,
	  30, 542, 286, 798, 158, 670, 414, 926,  94, 606, 350, 862,
	 222, 734, 478, 990,  62, 574, 318, 830, 190, 702, 446, 958,
	 126, 638, 382, 894, 254, 766, 510, 1022,  1, 513, 257, 769,
	 129, 641, 385, 897,  65, 577, 321, 833, 193, 705, 449, 961,
	  33, 545, 289, 801, 161, 673, 417, 929,  97, 609, 353, 865,
	 225, 737, 481, 993,  17, 529, 273, 785, 145, 657, 401, 913,
	  81, 593, 337, 849, 209, 721, 465, 977,  49, 561, 305, 817,
	 177, 689, 433, 945, 113, 625, 369, 881, 241, 753, 497, 1009,
	   9, 521, 265, 777, 137, 649, 393, 905,  73, 585, 329, 841,
	 201, 713, 457, 969,  41, 553, 297, 809, 169, 681, 425, 937,
	 105, 617, 361, 873, 233, 745, 489, 1001,  25, 537, 281, 793,
	 153, 665, 409, 921,  89, 601, 345, 857, 217, 729, 473, 985,
	  57, 569, 313, 825, 185, 697, 441, 953, 121, 633, 377, 889,
	 249, 761, 505, 1017,  5, 517, 261, 773, 133, 645, 389, 901,
	  69, 581, 325, 837, 197, 709, 453, 965,  37, 549, 293, 805,
	 165, 677, 421, 933, 101, 613, 357, 869, 229, 741, 485, 997,
	  21, 533, 277, 789, 149, 661, 405, 917,  85, 597, 341, 853,
	 213, 725, 469, 981,  53, 565, 309, 821, 181, 693, 437, 949,
	 117, 629, 373, 885, 245, 757, 501, 1013,  13, 525, 269, 781,
	 141, 653, 397, 909,  77, 589, 333, 845, 205, 717, 461, 973,
	  45, 557, 301, 813, 173, 685, 429, 941, 109, 621, 365, 877,
	 237, 749, 493, 1005,  29, 541, 285, 797, 157, 669, 413, 925,
	  93, 605, 349, 861, 221, 733, 477, 989,  61, 573, 317, 829,
	 189, 701, 445, 957, 125, 637, 381, 893, 253, 765, 509, 1021,
	   3, 515, 259, 771, 131, 643, 387, 899,  67, 579, 323, 835,
	 195, 707, 451, 963,  35, 547, 291, 803, 163, 675, 419, 931,
	  99, 611, 355, 867, 227, 739, 483, 995,  19, 531, 275, 787,
	 147, 659, 403, 915,  83, 595, 339, 851, 211, 723, 467, 979,
	  51, 563, 307, 819, 179, 691, 435, 947, 115, 627, 371, 883,
	 243, 755, 499, 1011, 11, 523, 267, 779, 139, 651, 395, 907,
	  75, 587, 331, 843, 203, 715, 459, 971,  43, 555, 299, 811,
	 171, 683, 427, 939, 107, 619, 363, 875, 235, 747, 491, 1003,
	  27, 539, 283, 795, 155, 667, 411, 923,  91, 603, 347, 859,
	 219, 731, 475, 987,  59, 571, 315, 827, 187, 699, 443, 955,
	 123, 635, 379, 891, 251, 763, 507, 1019,  7, 519, 263, 775,
	 135, 647, 391, 903,  71, 583, 327, 839, 199, 711, 455, 967,
	  39, 551, 295, 807, 167, 679, 423, 935, 103, 615, 359, 871,
	 231, 743, 487, 999,  23, 535, 279, 791, 151, 663, 407, 919,
	  87, 599, 343, 855, 215, 727, 471, 983,  55, 567, 311, 823,
	 183, 695, 439, 951, 119, 631, 375, 887, 247, 759, 503, 1015,
	  15, 527, 271, 783, 143, 655, 399, 911,  79, 591, 335, 847,
	 207, 719, 463, 975,  47, 559, 303, 815, 175, 687, 431, 943,
	 111, 623, 367, 879, 239, 751, 495, 1007,  31, 543, 287, 799,
	 159, 671, 415, 927,  95, 607, 351, 863, 223, 735, 479, 991,
	  63, 575, 319, 831, 191, 703, 447, 959, 127, 639, 383, 895,
	 255, 767, 511, 1023
};
- */ -static void -modp_mkgm2(uint32_t *restrict gm, uint32_t *restrict igm, unsigned logn, - uint32_t g, uint32_t p, uint32_t p0i) -{ - size_t u, n; - unsigned k; - uint32_t ig, x1, x2, R2; - - n = (size_t)1 << logn; - - /* - * We want g such that g^(2N) = 1 mod p, but the provided - * generator has order 2048. We must square it a few times. - */ - R2 = modp_R2(p, p0i); - g = modp_montymul(g, R2, p, p0i); - for (k = logn; k < 10; k ++) { - g = modp_montymul(g, g, p, p0i); - } - - ig = modp_div(R2, g, p, p0i, modp_R(p)); - k = 10 - logn; - x1 = x2 = modp_R(p); - for (u = 0; u < n; u ++) { - size_t v; - - v = REV10[u << k]; - gm[v] = x1; - igm[v] = x2; - x1 = modp_montymul(x1, g, p, p0i); - x2 = modp_montymul(x2, ig, p, p0i); - } -} - -/* - * Compute the NTT over a polynomial (binary case). Polynomial elements - * are a[0], a[stride], a[2 * stride]... - */ -static void -modp_NTT2_ext(uint32_t *a, size_t stride, const uint32_t *gm, unsigned logn, - uint32_t p, uint32_t p0i) -{ - size_t t, m, n; - - if (logn == 0) { - return; - } - n = (size_t)1 << logn; - t = n; - for (m = 1; m < n; m <<= 1) { - size_t ht, u, v1; - - ht = t >> 1; - for (u = 0, v1 = 0; u < m; u ++, v1 += t) { - uint32_t s; - size_t v; - uint32_t *r1, *r2; - - s = gm[m + u]; - r1 = a + v1 * stride; - r2 = r1 + ht * stride; - for (v = 0; v < ht; v ++, r1 += stride, r2 += stride) { - uint32_t x, y; - - x = *r1; - y = modp_montymul(*r2, s, p, p0i); - *r1 = modp_add(x, y, p); - *r2 = modp_sub(x, y, p); - } - } - t = ht; - } -} - -/* - * Compute the inverse NTT over a polynomial (binary case). 
/*
 * Compute the inverse NTT over a polynomial (binary case).
 */
static void
modp_iNTT2_ext(uint32_t *a, size_t stride, const uint32_t *igm, unsigned logn,
	uint32_t p, uint32_t p0i)
{
	size_t len, m, n, k;
	uint32_t scal;
	uint32_t *ptr;

	if (logn == 0) {
		return;
	}
	n = (size_t)1 << logn;
	len = 1;
	for (m = n; m > 1; m >>= 1) {
		size_t halfm, dlen, u, base;

		halfm = m >> 1;
		dlen = len << 1;
		for (u = 0, base = 0; u < halfm; u ++, base += dlen) {
			uint32_t w;
			size_t j;
			uint32_t *pa, *pb;

			w = igm[halfm + u];
			pa = a + base * stride;
			pb = pa + len * stride;
			for (j = 0; j < len;
				j ++, pa += stride, pb += stride)
			{
				uint32_t x, y;

				/* Gentleman-Sande butterfly. */
				x = *pa;
				y = *pb;
				*pa = modp_add(x, y, p);
				*pb = modp_montymul(
					modp_sub(x, y, p), w, p, p0i);
			}
		}
		len = dlen;
	}

	/*
	 * We need 1/n in Montgomery representation, i.e. R/n. Since
	 * 1 <= logn <= 10, R/n is an integer; moreover, R/n <= 2^30 < p,
	 * thus a simple shift will do.
	 */
	scal = (uint32_t)1 << (31 - logn);
	for (k = 0, ptr = a; k < n; k ++, ptr += stride) {
		*ptr = modp_montymul(*ptr, scal, p, p0i);
	}
}

/*
 * Simplified macros for NTT and iNTT (binary case) when the elements
 * are consecutive in RAM.
 */
#define modp_NTT2(a, gm, logn, p, p0i)   modp_NTT2_ext(a, 1, gm, logn, p, p0i)
#define modp_iNTT2(a, igm, logn, p, p0i) modp_iNTT2_ext(a, 1, igm, logn, p, p0i)
/*
 * Given polynomial f in NTT representation modulo p, compute f' of degree
 * less than N/2 such that f' = f0^2 - X*f1^2, where f0 and f1 are
 * polynomials of degree less than N/2 such that f = f0(X^2) + X*f1(X^2).
 *
 * The new polynomial is written "in place" over the first N/2 elements
 * of f.
 *
 * If applied logn times successively on a given polynomial, the resulting
 * degree-0 polynomial is the resultant of f and X^N+1 modulo p.
 *
 * This function applies only to the binary case; it is invoked from
 * solve_NTRU_binary_depth1().
 */
static void
modp_poly_rec_res(uint32_t *f, unsigned logn,
	uint32_t p, uint32_t p0i, uint32_t R2)
{
	size_t hn, u;

	hn = (size_t)1 << (logn - 1);
	for (u = 0; u < hn; u ++) {
		uint32_t a, b;

		a = f[(u << 1) + 0];
		b = f[(u << 1) + 1];
		/*
		 * montymul(a, b) yields a*b/R; the extra Montgomery
		 * multiplication by R2 = R^2 cancels that 1/R factor,
		 * so f[u] receives a*b mod p.
		 */
		f[u] = modp_montymul(modp_montymul(a, b, p, p0i), R2, p, p0i);
	}
}

/* ==================================================================== */
/*
 * Custom bignum implementation.
 *
 * This is a very reduced set of functionalities. We need to do the
 * following operations:
 *
 *  - Rebuild the resultant and the polynomial coefficients from their
 *    values modulo small primes (of length 31 bits each).
 *
 *  - Compute an extended GCD between the two computed resultants.
 *
 *  - Extract top bits and add scaled values during the successive steps
 *    of Babai rounding.
 *
 * When rebuilding values using CRT, we must also recompute the product
 * of the small prime factors. We always do it one small factor at a
 * time, so the "complicated" operations can be done modulo the small
 * prime with the modp_* functions. CRT coefficients (inverses) are
 * precomputed.
 *
 * All values are positive until the last step: when the polynomial
 * coefficients have been rebuilt, we normalize them around 0. But then,
 * only additions and subtractions on the upper few bits are needed
 * afterwards.
 *
 * We keep big integers as arrays of 31-bit words (in uint32_t values);
 * the top bit of each uint32_t is kept equal to 0. Using 31-bit words
 * makes it easier to keep track of carries. When negative values are
 * used, two's complement is used.
 */
/*
 * Subtract integer b from integer a. Both integers are supposed to have
 * the same size. The carry (0 or 1) is returned. Source arrays a and b
 * MUST be distinct.
 *
 * The operation is performed as described above if ctl = 1. If
 * ctl = 0, the value a[] is unmodified, but all memory accesses are
 * still performed, and the carry is computed and returned.
 */
static uint32_t
zint_sub(uint32_t *restrict a, const uint32_t *restrict b, size_t len,
	uint32_t ctl)
{
	size_t u;
	uint32_t carry, mask;

	carry = 0;
	mask = -ctl;	/* all-ones when ctl = 1, zero when ctl = 0 */
	for (u = 0; u < len; u ++) {
		uint32_t wa, d;

		wa = a[u];
		d = wa - b[u] - carry;
		carry = d >> 31;
		/* Keep the old word when mask = 0, the difference otherwise. */
		wa ^= ((d & 0x7FFFFFFF) ^ wa) & mask;
		a[u] = wa;
	}
	return carry;
}

/*
 * Multiply the provided big integer m with a small value x.
 * This function assumes that x < 2^31. The carry word is returned.
 */
static uint32_t
zint_mul_small(uint32_t *m, size_t mlen, uint32_t x)
{
	size_t u;
	uint32_t carry;

	carry = 0;
	for (u = 0; u < mlen; u ++) {
		uint64_t t;

		t = (uint64_t)m[u] * (uint64_t)x + carry;
		m[u] = (uint32_t)t & 0x7FFFFFFF;
		carry = (uint32_t)(t >> 31);
	}
	return carry;
}

/*
 * Reduce a big integer d modulo a small integer p.
 * Rules:
 *  d is unsigned
 *  p is prime
 *  2^30 < p < 2^31
 *  p0i = -(1/p) mod 2^31
 *  R2 = 2^62 mod p
 */
static uint32_t
zint_mod_small_unsigned(const uint32_t *d, size_t dlen,
	uint32_t p, uint32_t p0i, uint32_t R2)
{
	uint32_t acc;
	size_t u;

	/*
	 * Algorithm: we inject words one by one, starting with the high
	 * word. Each step is:
	 *  - multiply acc by 2^31 (Montgomery multiplication by R2)
	 *  - add the new word
	 */
	acc = 0;
	u = dlen;
	while (u -- > 0) {
		uint32_t w;

		acc = modp_montymul(acc, R2, p, p0i);
		w = d[u] - p;
		w += p & -(w >> 31);	/* reduce the word into 0..p-1 */
		acc = modp_add(acc, w, p);
	}
	return acc;
}

/*
 * Similar to zint_mod_small_unsigned(), except that d may be signed.
 * Extra parameter is Rx = 2^(31*dlen) mod p.
 */
static uint32_t
zint_mod_small_signed(const uint32_t *d, size_t dlen,
	uint32_t p, uint32_t p0i, uint32_t R2, uint32_t Rx)
{
	uint32_t v;

	if (dlen == 0) {
		return 0;
	}
	v = zint_mod_small_unsigned(d, dlen, p, p0i, R2);
	/*
	 * If d is negative (top bit of its sign word set), then the
	 * unsigned interpretation was d + 2^(31*dlen); subtract
	 * Rx = 2^(31*dlen) mod p to compensate.
	 */
	v = modp_sub(v, Rx & -(d[dlen - 1] >> 30), p);
	return v;
}
/*
 * Add y*s to x. x and y initially have length 'len' words; the new x
 * has length 'len+1' words. 's' must fit on 31 bits. x[] and y[] must
 * not overlap.
 */
static void
zint_add_mul_small(uint32_t *restrict x,
	const uint32_t *restrict y, size_t len, uint32_t s)
{
	size_t u;
	uint32_t carry;

	carry = 0;
	for (u = 0; u < len; u ++) {
		uint32_t wx, wy;
		uint64_t t;

		wx = x[u];
		wy = y[u];
		t = (uint64_t)wy * (uint64_t)s + (uint64_t)wx
			+ (uint64_t)carry;
		x[u] = (uint32_t)t & 0x7FFFFFFF;
		carry = (uint32_t)(t >> 31);
	}
	x[len] = carry;
}

/*
 * Normalize a modular integer around 0: if x > p/2, then x is replaced
 * with x - p (signed encoding with two's complement); otherwise, x is
 * untouched. The two integers x and p are encoded over the same length.
 */
static void
zint_norm_zero(uint32_t *restrict x, const uint32_t *restrict p, size_t len)
{
	size_t u;
	uint32_t cmp, pbit;

	/*
	 * Compare x with p/2. We use the shifted version of p, and p
	 * is odd, so we really compare with (p-1)/2; we want to perform
	 * the subtraction if and only if x > (p-1)/2.
	 */
	cmp = 0;
	pbit = 0;
	u = len;
	while (u -- > 0) {
		uint32_t wx, wp, c;

		/*
		 * Get the two words to compare in wx and wp (both over
		 * 31 bits exactly); wp is a word of (p-1)/2, built from
		 * p shifted right by one bit across words.
		 */
		wx = x[u];
		wp = (p[u] >> 1) | (pbit << 30);
		pbit = p[u] & 1;

		/*
		 * Set c to -1, 0 or 1, depending on whether wp is
		 * lower than, equal to, or greater than wx.
		 */
		c = wp - wx;
		c = ((-c) >> 31) | -(c >> 31);

		/*
		 * The first (most significant) non-equal pair of words
		 * decides the comparison: keep cmp once it is non-zero.
		 */
		cmp |= c & ((cmp & 1) - 1);
	}

	/*
	 * At this point, cmp = -1, 0 or 1, depending on whether (p-1)/2
	 * is lower than, equal to, or greater than x. We thus want to
	 * do the subtraction only if cmp = -1.
	 */
	zint_sub(x, p, len, cmp >> 31);
}
'xx' points at that - * first word of the first integer; subsequent integers are accessed - * by adding 'xstride' repeatedly. - * - * The words of an integer are the RNS representation of that integer, - * using the provided 'primes' are moduli. This function replaces - * each integer with its multi-word value (little-endian order). - * - * If "normalize_signed" is non-zero, then the returned value is - * normalized to the -m/2..m/2 interval (where m is the product of all - * small prime moduli); two's complement is used for negative values. - */ -static void -zint_rebuild_CRT(uint32_t *restrict xx, size_t xlen, size_t xstride, - size_t num, const small_prime *primes, int normalize_signed, - uint32_t *restrict tmp) -{ - size_t u; - uint32_t *x; - - tmp[0] = primes[0].p; - for (u = 1; u < xlen; u ++) { - /* - * At the entry of each loop iteration: - * - the first u words of each array have been - * reassembled; - * - the first u words of tmp[] contains the - * product of the prime moduli processed so far. - * - * We call 'q' the product of all previous primes. - */ - uint32_t p, p0i, s, R2; - size_t v; - - p = primes[u].p; - s = primes[u].s; - p0i = modp_ninv31(p); - R2 = modp_R2(p, p0i); - - for (v = 0, x = xx; v < num; v ++, x += xstride) { - uint32_t xp, xq, xr; - /* - * xp = the integer x modulo the prime p for this - * iteration - * xq = (x mod q) mod p - */ - xp = x[u]; - xq = zint_mod_small_unsigned(x, u, p, p0i, R2); - - /* - * New value is (x mod q) + q * (s * (xp - xq) mod p) - */ - xr = modp_montymul(s, modp_sub(xp, xq, p), p, p0i); - zint_add_mul_small(x, tmp, u, xr); - } - - /* - * Update product of primes in tmp[]. - */ - tmp[u] = zint_mul_small(tmp, u, p); - } - - /* - * Normalize the reconstructed values around 0. - */ - if (normalize_signed) { - for (u = 0, x = xx; u < num; u ++, x += xstride) { - zint_norm_zero(x, tmp, xlen); - } - } -} - -/* - * Negate a big integer conditionally: value a is replaced with -a if - * and only if ctl = 1. 
/*
 * Negate a big integer conditionally: value a is replaced with -a if
 * and only if ctl = 1. Control value ctl must be 0 or 1.
 */
static void
zint_negate(uint32_t *a, size_t len, uint32_t ctl)
{
	size_t u;
	uint32_t carry, mask;

	/*
	 * Two's complement: when ctl = 1, flip the 31 payload bits of
	 * every word (XOR with 0x7FFFFFFF) and add 1 to the whole value.
	 * When ctl = 0, XOR with 0 and add 0, leaving a unchanged.
	 */
	carry = ctl;
	mask = -ctl >> 1;	/* 0x7FFFFFFF or 0 */
	for (u = 0; u < len; u ++) {
		uint32_t w;

		w = a[u];
		w = (w ^ mask) + carry;
		a[u] = w & 0x7FFFFFFF;
		carry = w >> 31;
	}
}

/*
 * Replace a with (a*xa+b*xb)/(2^31) and b with (a*ya+b*yb)/(2^31).
 * The low bits are dropped (the caller should compute the coefficients
 * such that these dropped bits are all zeros). If either or both
 * yields a negative value, then the value is negated.
 *
 * Returned value is:
 *  0  both values were positive
 *  1  new a had to be negated
 *  2  new b had to be negated
 *  3  both new a and new b had to be negated
 *
 * Coefficients xa, xb, ya and yb may use the full signed 32-bit range.
 */
static uint32_t
zint_co_reduce(uint32_t *a, uint32_t *b, size_t len,
	int64_t xa, int64_t xb, int64_t ya, int64_t yb)
{
	size_t i;
	int64_t cca, ccb;
	uint32_t nega, negb;

	cca = 0;
	ccb = 0;
	for (i = 0; i < len; i ++) {
		uint32_t wa, wb;
		uint64_t za, zb;

		/*
		 * Accumulate the linear combinations one word at a
		 * time; the shift by one word position implements the
		 * division by 2^31.
		 */
		wa = a[i];
		wb = b[i];
		za = wa * (uint64_t)xa + wb * (uint64_t)xb + (uint64_t)cca;
		zb = wa * (uint64_t)ya + wb * (uint64_t)yb + (uint64_t)ccb;
		if (i > 0) {
			a[i - 1] = (uint32_t)za & 0x7FFFFFFF;
			b[i - 1] = (uint32_t)zb & 0x7FFFFFFF;
		}
		/* Signed (arithmetic) shift to propagate the carry. */
		cca = *(int64_t *)&za >> 31;
		ccb = *(int64_t *)&zb >> 31;
	}
	a[len - 1] = (uint32_t)cca;
	b[len - 1] = (uint32_t)ccb;

	/* Negate any result whose final carry came out negative. */
	nega = (uint32_t)((uint64_t)cca >> 63);
	negb = (uint32_t)((uint64_t)ccb >> 63);
	zint_negate(a, len, nega);
	zint_negate(b, len, negb);
	return nega | (negb << 1);
}
Rules on input parameters: - * - * if neg = 1, then -m <= a < 0 - * if neg = 0, then 0 <= a < 2*m - * - * If neg = 0, then the top word of a[] is allowed to use 32 bits. - * - * Modulus m must be odd. - */ -static void -zint_finish_mod(uint32_t *a, size_t len, const uint32_t *m, uint32_t neg) -{ - size_t u; - uint32_t cc, xm, ym; - - /* - * First pass: compare a (assumed nonnegative) with m. Note that - * if the top word uses 32 bits, subtracting m must yield a - * value less than 2^31 since a < 2*m. - */ - cc = 0; - for (u = 0; u < len; u ++) { - cc = (a[u] - m[u] - cc) >> 31; - } - - /* - * If neg = 1 then we must add m (regardless of cc) - * If neg = 0 and cc = 0 then we must subtract m - * If neg = 0 and cc = 1 then we must do nothing - * - * In the loop below, we conditionally subtract either m or -m - * from a. Word xm is a word of m (if neg = 0) or -m (if neg = 1); - * but if neg = 0 and cc = 1, then ym = 0 and it forces mw to 0. - */ - xm = -neg >> 1; - ym = -(neg | (1 - cc)); - cc = neg; - for (u = 0; u < len; u ++) { - uint32_t aw, mw; - - aw = a[u]; - mw = (m[u] ^ xm) & ym; - aw = aw - mw - cc; - a[u] = aw & 0x7FFFFFFF; - cc = aw >> 31; - } -} - -/* - * Replace a with (a*xa+b*xb)/(2^31) mod m, and b with - * (a*ya+b*yb)/(2^31) mod m. Modulus m must be odd; m0i = -1/m[0] mod 2^31. - */ -static void -zint_co_reduce_mod(uint32_t *a, uint32_t *b, const uint32_t *m, size_t len, - uint32_t m0i, int64_t xa, int64_t xb, int64_t ya, int64_t yb) -{ - size_t u; - int64_t cca, ccb; - uint32_t fa, fb; - - /* - * These are actually four combined Montgomery multiplications. 
- */ - cca = 0; - ccb = 0; - fa = ((a[0] * (uint32_t)xa + b[0] * (uint32_t)xb) * m0i) & 0x7FFFFFFF; - fb = ((a[0] * (uint32_t)ya + b[0] * (uint32_t)yb) * m0i) & 0x7FFFFFFF; - for (u = 0; u < len; u ++) { - uint32_t wa, wb; - uint64_t za, zb; - - wa = a[u]; - wb = b[u]; - za = wa * (uint64_t)xa + wb * (uint64_t)xb - + m[u] * (uint64_t)fa + (uint64_t)cca; - zb = wa * (uint64_t)ya + wb * (uint64_t)yb - + m[u] * (uint64_t)fb + (uint64_t)ccb; - if (u > 0) { - a[u - 1] = (uint32_t)za & 0x7FFFFFFF; - b[u - 1] = (uint32_t)zb & 0x7FFFFFFF; - } - cca = *(int64_t *)&za >> 31; - ccb = *(int64_t *)&zb >> 31; - } - a[len - 1] = (uint32_t)cca; - b[len - 1] = (uint32_t)ccb; - - /* - * At this point: - * -m <= a < 2*m - * -m <= b < 2*m - * (this is a case of Montgomery reduction) - * The top words of 'a' and 'b' may have a 32-th bit set. - * We want to add or subtract the modulus, as required. - */ - zint_finish_mod(a, len, m, (uint32_t)((uint64_t)cca >> 63)); - zint_finish_mod(b, len, m, (uint32_t)((uint64_t)ccb >> 63)); -} - -/* - * Compute a GCD between two positive big integers x and y. The two - * integers must be odd. Returned value is 1 if the GCD is 1, 0 - * otherwise. When 1 is returned, arrays u and v are filled with values - * such that: - * 0 <= u <= y - * 0 <= v <= x - * x*u - y*v = 1 - * x[] and y[] are unmodified. Both input values must have the same - * encoded length. Temporary array must be large enough to accommodate 4 - * extra values of that length. Arrays u, v and tmp may not overlap with - * each other, or with either x or y. - */ -static int -zint_bezout(uint32_t *restrict u, uint32_t *restrict v, - const uint32_t *restrict x, const uint32_t *restrict y, - size_t len, uint32_t *restrict tmp) -{ - /* - * Algorithm is an extended binary GCD. 
We maintain 6 values - * a, b, u0, u1, v0 and v1 with the following invariants: - * - * a = x*u0 - y*v0 - * b = x*u1 - y*v1 - * 0 <= a <= x - * 0 <= b <= y - * 0 <= u0 < y - * 0 <= v0 < x - * 0 <= u1 <= y - * 0 <= v1 < x - * - * Initial values are: - * - * a = x u0 = 1 v0 = 0 - * b = y u1 = y v1 = x-1 - * - * Each iteration reduces either a or b, and maintains the - * invariants. Algorithm stops when a = b, at which point their - * common value is GCD(a,b) and (u0,v0) (or (u1,v1)) contains - * the values (u,v) we want to return. - * - * The formal definition of the algorithm is a sequence of steps: - * - * - If a is even, then: - * a <- a/2 - * u0 <- u0/2 mod y - * v0 <- v0/2 mod x - * - * - Otherwise, if b is even, then: - * b <- b/2 - * u1 <- u1/2 mod y - * v1 <- v1/2 mod x - * - * - Otherwise, if a > b, then: - * a <- (a-b)/2 - * u0 <- (u0-u1)/2 mod y - * v0 <- (v0-v1)/2 mod x - * - * - Otherwise: - * b <- (b-a)/2 - * u1 <- (u1-u0)/2 mod y - * v1 <- (v1-v0)/2 mod y - * - * We can show that the operations above preserve the invariants: - * - * - If a is even, then u0 and v0 are either both even or both - * odd (since a = x*u0 - y*v0, and x and y are both odd). - * If u0 and v0 are both even, then (u0,v0) <- (u0/2,v0/2). - * Otherwise, (u0,v0) <- ((u0+y)/2,(v0+x)/2). Either way, - * the a = x*u0 - y*v0 invariant is preserved. - * - * - The same holds for the case where b is even. - * - * - If a and b are odd, and a > b, then: - * - * a-b = x*(u0-u1) - y*(v0-v1) - * - * In that situation, if u0 < u1, then x*(u0-u1) < 0, but - * a-b > 0; therefore, it must be that v0 < v1, and the - * first part of the update is: (u0,v0) <- (u0-u1+y,v0-v1+x), - * which preserves the invariants. Otherwise, if u0 > u1, - * then u0-u1 >= 1, thus x*(u0-u1) >= x. But a <= x and - * b >= 0, hence a-b <= x. It follows that, in that case, - * v0-v1 >= 0. The first part of the update is then: - * (u0,v0) <- (u0-u1,v0-v1), which again preserves the - * invariants. 
- * - * Either way, once the subtraction is done, the new value of - * a, which is the difference of two odd values, is even, - * and the remaining of this step is a subcase of the - * first algorithm case (i.e. when a is even). - * - * - If a and b are odd, and b > a, then the a similar - * argument holds. - * - * The values a and b start at x and y, respectively. Since x - * and y are odd, their GCD is odd, and it is easily seen that - * all steps conserve the GCD (GCD(a-b,b) = GCD(a, b); - * GCD(a/2,b) = GCD(a,b) if GCD(a,b) is odd). Moreover, either a - * or b is reduced by at least one bit at each iteration, so - * the algorithm necessarily converges on the case a = b, at - * which point the common value is the GCD. - * - * In the algorithm expressed above, when a = b, the fourth case - * applies, and sets b = 0. Since a contains the GCD of x and y, - * which are both odd, a must be odd, and subsequent iterations - * (if any) will simply divide b by 2 repeatedly, which has no - * consequence. Thus, the algorithm can run for more iterations - * than necessary; the final GCD will be in a, and the (u,v) - * coefficients will be (u0,v0). - * - * - * The presentation above is bit-by-bit. It can be sped up by - * noticing that all decisions are taken based on the low bits - * and high bits of a and b. We can extract the two top words - * and low word of each of a and b, and compute reduction - * parameters pa, pb, qa and qb such that the new values for - * a and b are: - * a' = (a*pa + b*pb) / (2^31) - * b' = (a*qa + b*qb) / (2^31) - * the two divisions being exact. The coefficients are obtained - * just from the extracted words, and may be slightly off, requiring - * an optional correction: if a' < 0, then we replace pa with -pa - * and pb with -pb. Each such step will reduce the total length - * (sum of lengths of a and b) by at least 30 bits at each - * iteration. 
- */ - uint32_t *u0, *u1, *v0, *v1, *a, *b; - uint32_t x0i, y0i; - uint32_t num, rc; - size_t j; - - if (len == 0) { - return 0; - } - - /* - * u0 and v0 are the u and v result buffers; the four other - * values (u1, v1, a and b) are taken from tmp[]. - */ - u0 = u; - v0 = v; - u1 = tmp; - v1 = u1 + len; - a = v1 + len; - b = a + len; - - /* - * We'll need the Montgomery reduction coefficients. - */ - x0i = modp_ninv31(x[0]); - y0i = modp_ninv31(y[0]); - - /* - * Initialize a, b, u0, u1, v0 and v1. - * a = x u0 = 1 v0 = 0 - * b = y u1 = y v1 = x-1 - * Note that x is odd, so computing x-1 is easy. - */ - memcpy(a, x, len * sizeof *x); - memcpy(b, y, len * sizeof *y); - u0[0] = 1; - memset(u0 + 1, 0, (len - 1) * sizeof *u0); - memset(v0, 0, len * sizeof *v0); - memcpy(u1, y, len * sizeof *u1); - memcpy(v1, x, len * sizeof *v1); - v1[0] --; - - /* - * Each input operand may be as large as 31*len bits, and we - * reduce the total length by at least 30 bits at each iteration. - */ - for (num = 62 * (uint32_t)len + 30; num >= 30; num -= 30) { - uint32_t c0, c1; - uint32_t a0, a1, b0, b1; - uint64_t a_hi, b_hi; - uint32_t a_lo, b_lo; - int64_t pa, pb, qa, qb; - int i; - uint32_t r; - - /* - * Extract the top words of a and b. If j is the highest - * index >= 1 such that a[j] != 0 or b[j] != 0, then we - * want (a[j] << 31) + a[j-1] and (b[j] << 31) + b[j-1]. - * If a and b are down to one word each, then we use - * a[0] and b[0]. - */ - c0 = (uint32_t)-1; - c1 = (uint32_t)-1; - a0 = 0; - a1 = 0; - b0 = 0; - b1 = 0; - j = len; - while (j -- > 0) { - uint32_t aw, bw; - - aw = a[j]; - bw = b[j]; - a0 ^= (a0 ^ aw) & c0; - a1 ^= (a1 ^ aw) & c1; - b0 ^= (b0 ^ bw) & c0; - b1 ^= (b1 ^ bw) & c1; - c1 = c0; - c0 &= (((aw | bw) + 0x7FFFFFFF) >> 31) - (uint32_t)1; - } - - /* - * If c1 = 0, then we grabbed two words for a and b. - * If c1 != 0 but c0 = 0, then we grabbed one word. It - * is not possible that c1 != 0 and c0 != 0, because that - * would mean that both integers are zero. 
- */ - a1 |= a0 & c1; - a0 &= ~c1; - b1 |= b0 & c1; - b0 &= ~c1; - a_hi = ((uint64_t)a0 << 31) + a1; - b_hi = ((uint64_t)b0 << 31) + b1; - a_lo = a[0]; - b_lo = b[0]; - - /* - * Compute reduction factors: - * - * a' = a*pa + b*pb - * b' = a*qa + b*qb - * - * such that a' and b' are both multiple of 2^31, but are - * only marginally larger than a and b. - */ - pa = 1; - pb = 0; - qa = 0; - qb = 1; - for (i = 0; i < 31; i ++) { - /* - * At each iteration: - * - * a <- (a-b)/2 if: a is odd, b is odd, a_hi > b_hi - * b <- (b-a)/2 if: a is odd, b is odd, a_hi <= b_hi - * a <- a/2 if: a is even - * b <- b/2 if: a is odd, b is even - * - * We multiply a_lo and b_lo by 2 at each - * iteration, thus a division by 2 really is a - * non-multiplication by 2. - */ - uint32_t rt, oa, ob, cAB, cBA, cA; - uint64_t rz; - - /* - * rt = 1 if a_hi > b_hi, 0 otherwise. - */ - rz = b_hi - a_hi; - rt = (uint32_t)((rz ^ ((a_hi ^ b_hi) - & (a_hi ^ rz))) >> 63); - - /* - * cAB = 1 if b must be subtracted from a - * cBA = 1 if a must be subtracted from b - * cA = 1 if a must be divided by 2 - * - * Rules: - * - * cAB and cBA cannot both be 1. - * If a is not divided by 2, b is. - */ - oa = (a_lo >> i) & 1; - ob = (b_lo >> i) & 1; - cAB = oa & ob & rt; - cBA = oa & ob & ~rt; - cA = cAB | (oa ^ 1); - - /* - * Conditional subtractions. - */ - a_lo -= b_lo & -cAB; - a_hi -= b_hi & -(uint64_t)cAB; - pa -= qa & -(int64_t)cAB; - pb -= qb & -(int64_t)cAB; - b_lo -= a_lo & -cBA; - b_hi -= a_hi & -(uint64_t)cBA; - qa -= pa & -(int64_t)cBA; - qb -= pb & -(int64_t)cBA; - - /* - * Shifting. - */ - a_lo += a_lo & (cA - 1); - pa += pa & ((int64_t)cA - 1); - pb += pb & ((int64_t)cA - 1); - a_hi ^= (a_hi ^ (a_hi >> 1)) & -(uint64_t)cA; - b_lo += b_lo & -cA; - qa += qa & -(int64_t)cA; - qb += qb & -(int64_t)cA; - b_hi ^= (b_hi ^ (b_hi >> 1)) & ((uint64_t)cA - 1); - } - - /* - * Apply the computed parameters to our values. 
We - * may have to correct pa and pb depending on the - * returned value of zint_co_reduce() (when a and/or b - * had to be negated). - */ - r = zint_co_reduce(a, b, len, pa, pb, qa, qb); - pa -= (pa + pa) & -(int64_t)(r & 1); - pb -= (pb + pb) & -(int64_t)(r & 1); - qa -= (qa + qa) & -(int64_t)(r >> 1); - qb -= (qb + qb) & -(int64_t)(r >> 1); - zint_co_reduce_mod(u0, u1, y, len, y0i, pa, pb, qa, qb); - zint_co_reduce_mod(v0, v1, x, len, x0i, pa, pb, qa, qb); - } - - /* - * At that point, array a[] should contain the GCD, and the - * results (u,v) should already be set. We check that the GCD - * is indeed 1. We also check that the two operands x and y - * are odd. - */ - rc = a[0] ^ 1; - for (j = 1; j < len; j ++) { - rc |= a[j]; - } - return (int)((1 - ((rc | -rc) >> 31)) & x[0] & y[0]); -} - -/* - * Add k*y*2^sc to x. The result is assumed to fit in the array of - * size xlen (truncation is applied if necessary). - * Scale factor 'sc' is provided as sch and scl, such that: - * sch = sc / 31 - * scl = sc % 31 - * xlen MUST NOT be lower than ylen. - * - * x[] and y[] are both signed integers, using two's complement for - * negative values. - */ -static void -zint_add_scaled_mul_small(uint32_t *restrict x, size_t xlen, - const uint32_t *restrict y, size_t ylen, int32_t k, - uint32_t sch, uint32_t scl) -{ - size_t u; - uint32_t ysign, tw; - int32_t cc; - - if (ylen == 0) { - return; - } - - ysign = -(y[ylen - 1] >> 30) >> 1; - tw = 0; - cc = 0; - for (u = sch; u < xlen; u ++) { - size_t v; - uint32_t wy, wys, ccu; - uint64_t z; - - /* - * Get the next word of y (scaled). - */ - v = u - sch; - wy = v < ylen ? y[v] : ysign; - wys = ((wy << scl) & 0x7FFFFFFF) | tw; - tw = wy >> (31 - scl); - - /* - * The expression below does not overflow. 
- */ - z = (uint64_t)((int64_t)wys * (int64_t)k + (int64_t)x[u] + cc); - x[u] = (uint32_t)z & 0x7FFFFFFF; - - /* - * Right-shifting the signed value z would yield - * implementation-defined results (arithmetic shift is - * not guaranteed). However, we can cast to unsigned, - * and get the next carry as an unsigned word. We can - * then convert it back to signed by using the guaranteed - * fact that 'int32_t' uses two's complement with no - * trap representation or padding bit, and with a layout - * compatible with that of 'uint32_t'. - */ - ccu = (uint32_t)(z >> 31); - cc = *(int32_t *)&ccu; - } -} - -/* - * Subtract y*2^sc from x. The result is assumed to fit in the array of - * size xlen (truncation is applied if necessary). - * Scale factor 'sc' is provided as sch and scl, such that: - * sch = sc / 31 - * scl = sc % 31 - * xlen MUST NOT be lower than ylen. - * - * x[] and y[] are both signed integers, using two's complement for - * negative values. - */ -static void -zint_sub_scaled(uint32_t *restrict x, size_t xlen, - const uint32_t *restrict y, size_t ylen, uint32_t sch, uint32_t scl) -{ - size_t u; - uint32_t ysign, tw; - uint32_t cc; - - if (ylen == 0) { - return; - } - - ysign = -(y[ylen - 1] >> 30) >> 1; - tw = 0; - cc = 0; - for (u = sch; u < xlen; u ++) { - size_t v; - uint32_t w, wy, wys; - - /* - * Get the next word of y (scaled). - */ - v = u - sch; - wy = v < ylen ? y[v] : ysign; - wys = ((wy << scl) & 0x7FFFFFFF) | tw; - tw = wy >> (31 - scl); - - w = x[u] - wys - cc; - x[u] = w & 0x7FFFFFFF; - cc = w >> 31; - } -} - -/* - * Convert a one-word signed big integer into a signed value. - */ -static inline int32_t -zint_one_to_plain(const uint32_t *x) -{ - uint32_t w; - - w = x[0]; - w |= (w & 0x40000000) << 1; - return *(int32_t *)&w; -} - -/* ==================================================================== */ - -/* - * Convert a polynomial to floating-point values. 
- * - * Each coefficient has length flen words, and starts fstride words after - * the previous. - * - * IEEE-754 binary64 values can represent values in a finite range, - * roughly 2^(-1023) to 2^(+1023); thus, if coefficients are too large, - * they should be "trimmed" by pointing not to the lowest word of each, - * but upper. - */ -static void -poly_big_to_fp(fpr *d, const uint32_t *f, size_t flen, size_t fstride, - unsigned logn) -{ - size_t n, u; - - n = MKN(logn); - if (flen == 0) { - for (u = 0; u < n; u ++) { - d[u] = fpr_zero; - } - return; - } - for (u = 0; u < n; u ++, f += fstride) { - size_t v; - uint32_t neg, cc, xm; - fpr x, fsc; - - /* - * Get sign of the integer; if it is negative, then we - * will load its absolute value instead, and negate the - * result. - */ - neg = -(f[flen - 1] >> 30); - xm = neg >> 1; - cc = neg & 1; - x = fpr_zero; - fsc = fpr_one; - for (v = 0; v < flen; v ++, fsc = fpr_mul(fsc, fpr_ptwo31)) { - uint32_t w; - - w = (f[v] ^ xm) + cc; - cc = w >> 31; - w &= 0x7FFFFFFF; - w -= (w << 1) & neg; - x = fpr_add(x, fpr_mul(fpr_of(*(int32_t *)&w), fsc)); - } - d[u] = x; - } -} - -/* - * Convert a polynomial to small integers. Source values are supposed - * to be one-word integers, signed over 31 bits. Returned value is 0 - * if any of the coefficients exceeds the provided limit (in absolute - * value), or 1 on success. - * - * This is not constant-time; this is not a problem here, because on - * any failure, the NTRU-solving process will be deemed to have failed - * and the (f,g) polynomials will be discarded. - */ -static int -poly_big_to_small(int8_t *d, const uint32_t *s, int lim, unsigned logn) -{ - size_t n, u; - - n = MKN(logn); - for (u = 0; u < n; u ++) { - int32_t z; - - z = zint_one_to_plain(s + u); - if (z < -lim || z > lim) { - return 0; - } - d[u] = (int8_t)z; - } - return 1; -} - -/* - * Subtract k*f from F, where F, f and k are polynomials modulo X^N+1. 
- * Coefficients of polynomial k are small integers (signed values in the - * -2^31..2^31 range) scaled by 2^sc. Value sc is provided as sch = sc / 31 - * and scl = sc % 31. - * - * This function implements the basic quadratic multiplication algorithm, - * which is efficient in space (no extra buffer needed) but slow at - * high degree. - */ -static void -poly_sub_scaled(uint32_t *restrict F, size_t Flen, size_t Fstride, - const uint32_t *restrict f, size_t flen, size_t fstride, - const int32_t *restrict k, uint32_t sch, uint32_t scl, unsigned logn) -{ - size_t n, u; - - n = MKN(logn); - for (u = 0; u < n; u ++) { - int32_t kf; - size_t v; - uint32_t *x; - const uint32_t *y; - - kf = -k[u]; - x = F + u * Fstride; - y = f; - for (v = 0; v < n; v ++) { - zint_add_scaled_mul_small( - x, Flen, y, flen, kf, sch, scl); - if (u + v == n - 1) { - x = F; - kf = -kf; - } else { - x += Fstride; - } - y += fstride; - } - } -} - -/* - * Subtract k*f from F. Coefficients of polynomial k are small integers - * (signed values in the -2^31..2^31 range) scaled by 2^sc. This function - * assumes that the degree is large, and integers relatively small. - * The value sc is provided as sch = sc / 31 and scl = sc % 31. - */ -static void -poly_sub_scaled_ntt(uint32_t *restrict F, size_t Flen, size_t Fstride, - const uint32_t *restrict f, size_t flen, size_t fstride, - const int32_t *restrict k, uint32_t sch, uint32_t scl, unsigned logn, - uint32_t *restrict tmp) -{ - uint32_t *gm, *igm, *fk, *t1, *x; - const uint32_t *y; - size_t n, u, tlen; - const small_prime *primes; - - n = MKN(logn); - tlen = flen + 1; - gm = tmp; - igm = gm + MKN(logn); - fk = igm + MKN(logn); - t1 = fk + n * tlen; - - primes = PRIMES; - - /* - * Compute k*f in fk[], in RNS notation. 
- */ - for (u = 0; u < tlen; u ++) { - uint32_t p, p0i, R2, Rx; - size_t v; - - p = primes[u].p; - p0i = modp_ninv31(p); - R2 = modp_R2(p, p0i); - Rx = modp_Rx((unsigned)flen, p, p0i, R2); - modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i); - - for (v = 0; v < n; v ++) { - t1[v] = modp_set(k[v], p); - } - modp_NTT2(t1, gm, logn, p, p0i); - for (v = 0, y = f, x = fk + u; - v < n; v ++, y += fstride, x += tlen) - { - *x = zint_mod_small_signed(y, flen, p, p0i, R2, Rx); - } - modp_NTT2_ext(fk + u, tlen, gm, logn, p, p0i); - for (v = 0, x = fk + u; v < n; v ++, x += tlen) { - *x = modp_montymul( - modp_montymul(t1[v], *x, p, p0i), R2, p, p0i); - } - modp_iNTT2_ext(fk + u, tlen, igm, logn, p, p0i); - } - - /* - * Rebuild k*f. - */ - zint_rebuild_CRT(fk, tlen, tlen, n, primes, 1, t1); - - /* - * Subtract k*f, scaled, from F. - */ - for (u = 0, x = F, y = fk; u < n; u ++, x += Fstride, y += tlen) { - zint_sub_scaled(x, Flen, y, tlen, sch, scl); - } -} - -/* ==================================================================== */ - -#if FALCON_KG_CHACHA20 // yyyKG_CHACHA20+1 - -#define RNG_CONTEXT prng -#define get_rng_u64 prng_get_u64 - -#else // yyyKG_CHACHA20+0 - -#define RNG_CONTEXT inner_shake256_context - -/* - * Get a random 8-byte integer from a SHAKE-based RNG. This function - * ensures consistent interpretation of the SHAKE output so that - * the same values will be obtained over different platforms, in case - * a known seed is used. - */ -static inline uint64_t -get_rng_u64(inner_shake256_context *rng) -{ - /* - * We enforce little-endian representation. - */ - -#if FALCON_LE // yyyLE+1 - /* - * On little-endian systems we just interpret the bytes "as is" - * (this is correct because the exact-width types such as - * 'uint64_t' are guaranteed to have no padding and no trap - * representation). 
- */ - uint64_t r; - - inner_shake256_extract(rng, (uint8_t *)&r, sizeof r); - return r; -#else // yyyLE+0 - uint8_t tmp[8]; - - inner_shake256_extract(rng, tmp, sizeof tmp); - return (uint64_t)tmp[0] - | ((uint64_t)tmp[1] << 8) - | ((uint64_t)tmp[2] << 16) - | ((uint64_t)tmp[3] << 24) - | ((uint64_t)tmp[4] << 32) - | ((uint64_t)tmp[5] << 40) - | ((uint64_t)tmp[6] << 48) - | ((uint64_t)tmp[7] << 56); -#endif // yyyLE- -} - -#endif // yyyKG_CHACHA20- - -/* - * Table below incarnates a discrete Gaussian distribution: - * D(x) = exp(-(x^2)/(2*sigma^2)) - * where sigma = 1.17*sqrt(q/(2*N)), q = 12289, and N = 1024. - * Element 0 of the table is P(x = 0). - * For k > 0, element k is P(x >= k+1 | x > 0). - * Probabilities are scaled up by 2^63. - */ -static const uint64_t gauss_1024_12289[] = { - 1283868770400643928u, 6416574995475331444u, 4078260278032692663u, - 2353523259288686585u, 1227179971273316331u, 575931623374121527u, - 242543240509105209u, 91437049221049666u, 30799446349977173u, - 9255276791179340u, 2478152334826140u, 590642893610164u, - 125206034929641u, 23590435911403u, 3948334035941u, - 586753615614u, 77391054539u, 9056793210u, - 940121950u, 86539696u, 7062824u, - 510971u, 32764u, 1862u, - 94u, 4u, 0u -}; - -/* - * Generate a random value with a Gaussian distribution centered on 0. - * The RNG must be ready for extraction (already flipped). - * - * Distribution has standard deviation 1.17*sqrt(q/(2*N)). The - * precomputed table is for N = 1024. Since the sum of two independent - * values of standard deviation sigma has standard deviation - * sigma*sqrt(2), then we can just generate more values and add them - * together for lower dimensions. - */ -static int -mkgauss(RNG_CONTEXT *rng, unsigned logn) -{ - unsigned u, g; - int val; - - g = 1U << (10 - logn); - val = 0; - for (u = 0; u < g; u ++) { - /* - * Each iteration generates one value with the - * Gaussian distribution for N = 1024. - * - * We use two random 64-bit values. 
First value - * decides on whether the generated value is 0, and, - * if not, the sign of the value. Second random 64-bit - * word is used to generate the non-zero value. - * - * For constant-time code we have to read the complete - * table. This has negligible cost, compared with the - * remainder of the keygen process (solving the NTRU - * equation). - */ - uint64_t r; - uint32_t f, v, k, neg; - - /* - * First value: - * - flag 'neg' is randomly selected to be 0 or 1. - * - flag 'f' is set to 1 if the generated value is zero, - * or set to 0 otherwise. - */ - r = get_rng_u64(rng); - neg = (uint32_t)(r >> 63); - r &= ~((uint64_t)1 << 63); - f = (uint32_t)((r - gauss_1024_12289[0]) >> 63); - - /* - * We produce a new random 63-bit integer r, and go over - * the array, starting at index 1. We store in v the - * index of the first array element which is not greater - * than r, unless the flag f was already 1. - */ - v = 0; - r = get_rng_u64(rng); - r &= ~((uint64_t)1 << 63); - for (k = 1; k < (sizeof gauss_1024_12289) - / (sizeof gauss_1024_12289[0]); k ++) - { - uint32_t t; - - t = (uint32_t)((r - gauss_1024_12289[k]) >> 63) ^ 1; - v |= k & -(t & (f ^ 1)); - f |= t; - } - - /* - * We apply the sign ('neg' flag). If the value is zero, - * the sign has no effect. - */ - v = (v ^ -neg) + neg; - - /* - * Generated value is added to val. - */ - val += *(int32_t *)&v; - } - return val; -} - -/* - * The MAX_BL_SMALL[] and MAX_BL_LARGE[] contain the lengths, in 31-bit - * words, of intermediate values in the computation: - * - * MAX_BL_SMALL[depth]: length for the input f and g at that depth - * MAX_BL_LARGE[depth]: length for the unreduced F and G at that depth - * - * Rules: - * - * - Within an array, values grow. - * - * - The 'SMALL' array must have an entry for maximum depth, corresponding - * to the size of values used in the binary GCD. There is no such value - * for the 'LARGE' array (the binary GCD yields already reduced - * coefficients). 
- * - * - MAX_BL_LARGE[depth] >= MAX_BL_SMALL[depth + 1]. - * - * - Values must be large enough to handle the common cases, with some - * margins. - * - * - Values must not be "too large" either because we will convert some - * integers into floating-point values by considering the top 10 words, - * i.e. 310 bits; hence, for values of length more than 10 words, we - * should take care to have the length centered on the expected size. - * - * The following average lengths, in bits, have been measured on thousands - * of random keys (fg = max length of the absolute value of coefficients - * of f and g at that depth; FG = idem for the unreduced F and G; for the - * maximum depth, F and G are the output of binary GCD, multiplied by q; - * for each value, the average and standard deviation are provided). - * - * Binary case: - * depth: 10 fg: 6307.52 (24.48) FG: 6319.66 (24.51) - * depth: 9 fg: 3138.35 (12.25) FG: 9403.29 (27.55) - * depth: 8 fg: 1576.87 ( 7.49) FG: 4703.30 (14.77) - * depth: 7 fg: 794.17 ( 4.98) FG: 2361.84 ( 9.31) - * depth: 6 fg: 400.67 ( 3.10) FG: 1188.68 ( 6.04) - * depth: 5 fg: 202.22 ( 1.87) FG: 599.81 ( 3.87) - * depth: 4 fg: 101.62 ( 1.02) FG: 303.49 ( 2.38) - * depth: 3 fg: 50.37 ( 0.53) FG: 153.65 ( 1.39) - * depth: 2 fg: 24.07 ( 0.25) FG: 78.20 ( 0.73) - * depth: 1 fg: 10.99 ( 0.08) FG: 39.82 ( 0.41) - * depth: 0 fg: 4.00 ( 0.00) FG: 19.61 ( 0.49) - * - * Integers are actually represented either in binary notation over - * 31-bit words (signed, using two's complement), or in RNS, modulo - * many small primes. These small primes are close to, but slightly - * lower than, 2^31. Use of RNS loses less than two bits, even for - * the largest values. - * - * IMPORTANT: if these values are modified, then the temporary buffer - * sizes (FALCON_KEYGEN_TEMP_*, in inner.h) must be recomputed - * accordingly. 
- */ - -static const size_t MAX_BL_SMALL[] = { - 1, 1, 2, 2, 4, 7, 14, 27, 53, 106, 209 -}; - -static const size_t MAX_BL_LARGE[] = { - 2, 2, 5, 7, 12, 21, 40, 78, 157, 308 -}; - -/* - * Average and standard deviation for the maximum size (in bits) of - * coefficients of (f,g), depending on depth. These values are used - * to compute bounds for Babai's reduction. - */ -static const struct { - int avg; - int std; -} BITLENGTH[] = { - { 4, 0 }, - { 11, 1 }, - { 24, 1 }, - { 50, 1 }, - { 102, 1 }, - { 202, 2 }, - { 401, 4 }, - { 794, 5 }, - { 1577, 8 }, - { 3138, 13 }, - { 6308, 25 } -}; - -/* - * Minimal recursion depth at which we rebuild intermediate values - * when reconstructing f and g. - */ -#define DEPTH_INT_FG 4 - -/* - * Compute squared norm of a short vector. Returned value is saturated to - * 2^32-1 if it is not lower than 2^31. - */ -static uint32_t -poly_small_sqnorm(const int8_t *f, unsigned logn) -{ - size_t n, u; - uint32_t s, ng; - - n = MKN(logn); - s = 0; - ng = 0; - for (u = 0; u < n; u ++) { - int32_t z; - - z = f[u]; - s += (uint32_t)(z * z); - ng |= s; - } - return s | -(ng >> 31); -} - -/* - * Align (upwards) the provided 'data' pointer with regards to 'base' - * so that the offset is a multiple of the size of 'fpr'. - */ -static fpr * -align_fpr(void *base, void *data) -{ - uint8_t *cb, *cd; - size_t k, km; - - cb = base; - cd = data; - k = (size_t)(cd - cb); - km = k % sizeof(fpr); - if (km) { - k += (sizeof(fpr)) - km; - } - return (fpr *)(cb + k); -} - -/* - * Align (upwards) the provided 'data' pointer with regards to 'base' - * so that the offset is a multiple of the size of 'uint32_t'. - */ -static uint32_t * -align_u32(void *base, void *data) -{ - uint8_t *cb, *cd; - size_t k, km; - - cb = base; - cd = data; - k = (size_t)(cd - cb); - km = k % sizeof(uint32_t); - if (km) { - k += (sizeof(uint32_t)) - km; - } - return (uint32_t *)(cb + k); -} - -/* - * Convert a small vector to floating point. 
- */ -static void -poly_small_to_fp(fpr *x, const int8_t *f, unsigned logn) -{ - size_t n, u; - - n = MKN(logn); - for (u = 0; u < n; u ++) { - x[u] = fpr_of(f[u]); - } -} - -/* - * Input: f,g of degree N = 2^logn; 'depth' is used only to get their - * individual length. - * - * Output: f',g' of degree N/2, with the length for 'depth+1'. - * - * Values are in RNS; input and/or output may also be in NTT. - */ -static void -make_fg_step(uint32_t *data, unsigned logn, unsigned depth, - int in_ntt, int out_ntt) -{ - size_t n, hn, u; - size_t slen, tlen; - uint32_t *fd, *gd, *fs, *gs, *gm, *igm, *t1; - const small_prime *primes; - - n = (size_t)1 << logn; - hn = n >> 1; - slen = MAX_BL_SMALL[depth]; - tlen = MAX_BL_SMALL[depth + 1]; - primes = PRIMES; - - /* - * Prepare room for the result. - */ - fd = data; - gd = fd + hn * tlen; - fs = gd + hn * tlen; - gs = fs + n * slen; - gm = gs + n * slen; - igm = gm + n; - t1 = igm + n; - memmove(fs, data, 2 * n * slen * sizeof *data); - - /* - * First slen words: we use the input values directly, and apply - * inverse NTT as we go. 
- */ - for (u = 0; u < slen; u ++) { - uint32_t p, p0i, R2; - size_t v; - uint32_t *x; - - p = primes[u].p; - p0i = modp_ninv31(p); - R2 = modp_R2(p, p0i); - modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i); - - for (v = 0, x = fs + u; v < n; v ++, x += slen) { - t1[v] = *x; - } - if (!in_ntt) { - modp_NTT2(t1, gm, logn, p, p0i); - } - for (v = 0, x = fd + u; v < hn; v ++, x += tlen) { - uint32_t w0, w1; - - w0 = t1[(v << 1) + 0]; - w1 = t1[(v << 1) + 1]; - *x = modp_montymul( - modp_montymul(w0, w1, p, p0i), R2, p, p0i); - } - if (in_ntt) { - modp_iNTT2_ext(fs + u, slen, igm, logn, p, p0i); - } - - for (v = 0, x = gs + u; v < n; v ++, x += slen) { - t1[v] = *x; - } - if (!in_ntt) { - modp_NTT2(t1, gm, logn, p, p0i); - } - for (v = 0, x = gd + u; v < hn; v ++, x += tlen) { - uint32_t w0, w1; - - w0 = t1[(v << 1) + 0]; - w1 = t1[(v << 1) + 1]; - *x = modp_montymul( - modp_montymul(w0, w1, p, p0i), R2, p, p0i); - } - if (in_ntt) { - modp_iNTT2_ext(gs + u, slen, igm, logn, p, p0i); - } - - if (!out_ntt) { - modp_iNTT2_ext(fd + u, tlen, igm, logn - 1, p, p0i); - modp_iNTT2_ext(gd + u, tlen, igm, logn - 1, p, p0i); - } - } - - /* - * Since the fs and gs words have been de-NTTized, we can use the - * CRT to rebuild the values. - */ - zint_rebuild_CRT(fs, slen, slen, n, primes, 1, gm); - zint_rebuild_CRT(gs, slen, slen, n, primes, 1, gm); - - /* - * Remaining words: use modular reductions to extract the values. 
- */ - for (u = slen; u < tlen; u ++) { - uint32_t p, p0i, R2, Rx; - size_t v; - uint32_t *x; - - p = primes[u].p; - p0i = modp_ninv31(p); - R2 = modp_R2(p, p0i); - Rx = modp_Rx((unsigned)slen, p, p0i, R2); - modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i); - for (v = 0, x = fs; v < n; v ++, x += slen) { - t1[v] = zint_mod_small_signed(x, slen, p, p0i, R2, Rx); - } - modp_NTT2(t1, gm, logn, p, p0i); - for (v = 0, x = fd + u; v < hn; v ++, x += tlen) { - uint32_t w0, w1; - - w0 = t1[(v << 1) + 0]; - w1 = t1[(v << 1) + 1]; - *x = modp_montymul( - modp_montymul(w0, w1, p, p0i), R2, p, p0i); - } - for (v = 0, x = gs; v < n; v ++, x += slen) { - t1[v] = zint_mod_small_signed(x, slen, p, p0i, R2, Rx); - } - modp_NTT2(t1, gm, logn, p, p0i); - for (v = 0, x = gd + u; v < hn; v ++, x += tlen) { - uint32_t w0, w1; - - w0 = t1[(v << 1) + 0]; - w1 = t1[(v << 1) + 1]; - *x = modp_montymul( - modp_montymul(w0, w1, p, p0i), R2, p, p0i); - } - - if (!out_ntt) { - modp_iNTT2_ext(fd + u, tlen, igm, logn - 1, p, p0i); - modp_iNTT2_ext(gd + u, tlen, igm, logn - 1, p, p0i); - } - } -} - -/* - * Compute f and g at a specific depth, in RNS notation. - * - * Returned values are stored in the data[] array, at slen words per integer. - * - * Conditions: - * 0 <= depth <= logn - * - * Space use in data[]: enough room for any two successive values (f', g', - * f and g). 
- */ -static void -make_fg(uint32_t *data, const int8_t *f, const int8_t *g, - unsigned logn, unsigned depth, int out_ntt) -{ - size_t n, u; - uint32_t *ft, *gt, p0; - unsigned d; - const small_prime *primes; - - n = MKN(logn); - ft = data; - gt = ft + n; - primes = PRIMES; - p0 = primes[0].p; - for (u = 0; u < n; u ++) { - ft[u] = modp_set(f[u], p0); - gt[u] = modp_set(g[u], p0); - } - - if (depth == 0 && out_ntt) { - uint32_t *gm, *igm; - uint32_t p, p0i; - - p = primes[0].p; - p0i = modp_ninv31(p); - gm = gt + n; - igm = gm + MKN(logn); - modp_mkgm2(gm, igm, logn, primes[0].g, p, p0i); - modp_NTT2(ft, gm, logn, p, p0i); - modp_NTT2(gt, gm, logn, p, p0i); - return; - } - - for (d = 0; d < depth; d ++) { - make_fg_step(data, logn - d, d, - d != 0, (d + 1) < depth || out_ntt); - } -} - -/* - * Solving the NTRU equation, deepest level: compute the resultants of - * f and g with X^N+1, and use binary GCD. The F and G values are - * returned in tmp[]. - * - * Returned value: 1 on success, 0 on error. - */ -static int -solve_NTRU_deepest(unsigned logn_top, - const int8_t *f, const int8_t *g, uint32_t *tmp) -{ - size_t len; - uint32_t *Fp, *Gp, *fp, *gp, *t1, q; - const small_prime *primes; - - len = MAX_BL_SMALL[logn_top]; - primes = PRIMES; - - Fp = tmp; - Gp = Fp + len; - fp = Gp + len; - gp = fp + len; - t1 = gp + len; - - make_fg(fp, f, g, logn_top, logn_top, 0); - - /* - * We use the CRT to rebuild the resultants as big integers. - * There are two such big integers. The resultants are always - * nonnegative. - */ - zint_rebuild_CRT(fp, len, len, 2, primes, 0, t1); - - /* - * Apply the binary GCD. The zint_bezout() function works only - * if both inputs are odd. - * - * We can test on the result and return 0 because that would - * imply failure of the NTRU solving equation, and the (f,g) - * values will be abandoned in that case. - */ - if (!zint_bezout(Gp, Fp, fp, gp, len, t1)) { - return 0; - } - - /* - * Multiply the two values by the target value q. 
Values must - * fit in the destination arrays. - * We can again test on the returned words: a non-zero output - * of zint_mul_small() means that we exceeded our array - * capacity, and that implies failure and rejection of (f,g). - */ - q = 12289; - if (zint_mul_small(Fp, len, q) != 0 - || zint_mul_small(Gp, len, q) != 0) - { - return 0; - } - - return 1; -} - -/* - * Solving the NTRU equation, intermediate level. Upon entry, the F and G - * from the previous level should be in the tmp[] array. - * This function MAY be invoked for the top-level (in which case depth = 0). - * - * Returned value: 1 on success, 0 on error. - */ -static int -solve_NTRU_intermediate(unsigned logn_top, - const int8_t *f, const int8_t *g, unsigned depth, uint32_t *tmp) -{ - /* - * In this function, 'logn' is the log2 of the degree for - * this step. If N = 2^logn, then: - * - the F and G values already in fk->tmp (from the deeper - * levels) have degree N/2; - * - this function should return F and G of degree N. - */ - unsigned logn; - size_t n, hn, slen, dlen, llen, rlen, FGlen, u; - uint32_t *Fd, *Gd, *Ft, *Gt, *ft, *gt, *t1; - fpr *rt1, *rt2, *rt3, *rt4, *rt5; - int scale_fg, minbl_fg, maxbl_fg, maxbl_FG, scale_k; - uint32_t *x, *y; - int32_t *k; - const small_prime *primes; - - logn = logn_top - depth; - n = (size_t)1 << logn; - hn = n >> 1; - - /* - * slen = size for our input f and g; also size of the reduced - * F and G we return (degree N) - * - * dlen = size of the F and G obtained from the deeper level - * (degree N/2 or N/3) - * - * llen = size for intermediary F and G before reduction (degree N) - * - * We build our non-reduced F and G as two independent halves each, - * of degree N/2 (F = F0 + X*F1, G = G0 + X*G1). - */ - slen = MAX_BL_SMALL[depth]; - dlen = MAX_BL_SMALL[depth + 1]; - llen = MAX_BL_LARGE[depth]; - primes = PRIMES; - - /* - * Fd and Gd are the F and G from the deeper level. 
- */ - Fd = tmp; - Gd = Fd + dlen * hn; - - /* - * Compute the input f and g for this level. Note that we get f - * and g in RNS + NTT representation. - */ - ft = Gd + dlen * hn; - make_fg(ft, f, g, logn_top, depth, 1); - - /* - * Move the newly computed f and g to make room for our candidate - * F and G (unreduced). - */ - Ft = tmp; - Gt = Ft + n * llen; - t1 = Gt + n * llen; - memmove(t1, ft, 2 * n * slen * sizeof *ft); - ft = t1; - gt = ft + slen * n; - t1 = gt + slen * n; - - /* - * Move Fd and Gd _after_ f and g. - */ - memmove(t1, Fd, 2 * hn * dlen * sizeof *Fd); - Fd = t1; - Gd = Fd + hn * dlen; - - /* - * We reduce Fd and Gd modulo all the small primes we will need, - * and store the values in Ft and Gt (only n/2 values in each). - */ - for (u = 0; u < llen; u ++) { - uint32_t p, p0i, R2, Rx; - size_t v; - uint32_t *xs, *ys, *xd, *yd; - - p = primes[u].p; - p0i = modp_ninv31(p); - R2 = modp_R2(p, p0i); - Rx = modp_Rx((unsigned)dlen, p, p0i, R2); - for (v = 0, xs = Fd, ys = Gd, xd = Ft + u, yd = Gt + u; - v < hn; - v ++, xs += dlen, ys += dlen, xd += llen, yd += llen) - { - *xd = zint_mod_small_signed(xs, dlen, p, p0i, R2, Rx); - *yd = zint_mod_small_signed(ys, dlen, p, p0i, R2, Rx); - } - } - - /* - * We do not need Fd and Gd after that point. - */ - - /* - * Compute our F and G modulo sufficiently many small primes. - */ - for (u = 0; u < llen; u ++) { - uint32_t p, p0i, R2; - uint32_t *gm, *igm, *fx, *gx, *Fp, *Gp; - size_t v; - - /* - * All computations are done modulo p. - */ - p = primes[u].p; - p0i = modp_ninv31(p); - R2 = modp_R2(p, p0i); - - /* - * If we processed slen words, then f and g have been - * de-NTTized, and are in RNS; we can rebuild them. 
- */ - if (u == slen) { - zint_rebuild_CRT(ft, slen, slen, n, primes, 1, t1); - zint_rebuild_CRT(gt, slen, slen, n, primes, 1, t1); - } - - gm = t1; - igm = gm + n; - fx = igm + n; - gx = fx + n; - - modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i); - - if (u < slen) { - for (v = 0, x = ft + u, y = gt + u; - v < n; v ++, x += slen, y += slen) - { - fx[v] = *x; - gx[v] = *y; - } - modp_iNTT2_ext(ft + u, slen, igm, logn, p, p0i); - modp_iNTT2_ext(gt + u, slen, igm, logn, p, p0i); - } else { - uint32_t Rx; - - Rx = modp_Rx((unsigned)slen, p, p0i, R2); - for (v = 0, x = ft, y = gt; - v < n; v ++, x += slen, y += slen) - { - fx[v] = zint_mod_small_signed(x, slen, - p, p0i, R2, Rx); - gx[v] = zint_mod_small_signed(y, slen, - p, p0i, R2, Rx); - } - modp_NTT2(fx, gm, logn, p, p0i); - modp_NTT2(gx, gm, logn, p, p0i); - } - - /* - * Get F' and G' modulo p and in NTT representation - * (they have degree n/2). These values were computed in - * a previous step, and stored in Ft and Gt. - */ - Fp = gx + n; - Gp = Fp + hn; - for (v = 0, x = Ft + u, y = Gt + u; - v < hn; v ++, x += llen, y += llen) - { - Fp[v] = *x; - Gp[v] = *y; - } - modp_NTT2(Fp, gm, logn - 1, p, p0i); - modp_NTT2(Gp, gm, logn - 1, p, p0i); - - /* - * Compute our F and G modulo p. - * - * General case: - * - * we divide degree by d = 2 or 3 - * f'(x^d) = N(f)(x^d) = f * adj(f) - * g'(x^d) = N(g)(x^d) = g * adj(g) - * f'*G' - g'*F' = q - * F = F'(x^d) * adj(g) - * G = G'(x^d) * adj(f) - * - * We compute things in the NTT. We group roots of phi - * such that all roots x in a group share the same x^d. - * If the roots in a group are x_1, x_2... x_d, then: - * - * N(f)(x_1^d) = f(x_1)*f(x_2)*...*f(x_d) - * - * Thus, we have: - * - * G(x_1) = f(x_2)*f(x_3)*...*f(x_d)*G'(x_1^d) - * G(x_2) = f(x_1)*f(x_3)*...*f(x_d)*G'(x_1^d) - * ... - * G(x_d) = f(x_1)*f(x_2)*...*f(x_{d-1})*G'(x_1^d) - * - * In all cases, we can thus compute F and G in NTT - * representation by a few simple multiplications. 
- * Moreover, in our chosen NTT representation, roots - * from the same group are consecutive in RAM. - */ - for (v = 0, x = Ft + u, y = Gt + u; v < hn; - v ++, x += (llen << 1), y += (llen << 1)) - { - uint32_t ftA, ftB, gtA, gtB; - uint32_t mFp, mGp; - - ftA = fx[(v << 1) + 0]; - ftB = fx[(v << 1) + 1]; - gtA = gx[(v << 1) + 0]; - gtB = gx[(v << 1) + 1]; - mFp = modp_montymul(Fp[v], R2, p, p0i); - mGp = modp_montymul(Gp[v], R2, p, p0i); - x[0] = modp_montymul(gtB, mFp, p, p0i); - x[llen] = modp_montymul(gtA, mFp, p, p0i); - y[0] = modp_montymul(ftB, mGp, p, p0i); - y[llen] = modp_montymul(ftA, mGp, p, p0i); - } - modp_iNTT2_ext(Ft + u, llen, igm, logn, p, p0i); - modp_iNTT2_ext(Gt + u, llen, igm, logn, p, p0i); - } - - /* - * Rebuild F and G with the CRT. - */ - zint_rebuild_CRT(Ft, llen, llen, n, primes, 1, t1); - zint_rebuild_CRT(Gt, llen, llen, n, primes, 1, t1); - - /* - * At that point, Ft, Gt, ft and gt are consecutive in RAM (in that - * order). - */ - - /* - * Apply Babai reduction to bring back F and G to size slen. - * - * We use the FFT to compute successive approximations of the - * reduction coefficient. We first isolate the top bits of - * the coefficients of f and g, and convert them to floating - * point; with the FFT, we compute adj(f), adj(g), and - * 1/(f*adj(f)+g*adj(g)). - * - * Then, we repeatedly apply the following: - * - * - Get the top bits of the coefficients of F and G into - * floating point, and use the FFT to compute: - * (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g)) - * - * - Convert back that value into normal representation, and - * round it to the nearest integers, yielding a polynomial k. - * Proper scaling is applied to f, g, F and G so that the - * coefficients fit on 32 bits (signed). - * - * - Subtract k*f from F and k*g from G. - * - * Under normal conditions, this process reduces the size of F - * and G by some bits at each iteration. 
For constant-time - * operation, we do not want to measure the actual length of - * F and G; instead, we do the following: - * - * - f and g are converted to floating-point, with some scaling - * if necessary to keep values in the representable range. - * - * - For each iteration, we _assume_ a maximum size for F and G, - * and use the values at that size. If we overreach, then - * we get zeros, which is harmless: the resulting coefficients - * of k will be 0 and the value won't be reduced. - * - * - We conservatively assume that F and G will be reduced by - * at least 25 bits at each iteration. - * - * Even when reaching the bottom of the reduction, reduction - * coefficient will remain low. If it goes out-of-range, then - * something wrong occurred and the whole NTRU solving fails. - */ - - /* - * Memory layout: - * - We need to compute and keep adj(f), adj(g), and - * 1/(f*adj(f)+g*adj(g)) (sizes N, N and N/2 fp numbers, - * respectively). - * - At each iteration we need two extra fp buffer (N fp values), - * and produce a k (N 32-bit words). k will be shared with one - * of the fp buffers. - * - To compute k*f and k*g efficiently (with the NTT), we need - * some extra room; we reuse the space of the temporary buffers. - * - * Arrays of 'fpr' are obtained from the temporary array itself. - * We ensure that the base is at a properly aligned offset (the - * source array tmp[] is supposed to be already aligned). - */ - - rt3 = align_fpr(tmp, t1); - rt4 = rt3 + n; - rt5 = rt4 + n; - rt1 = rt5 + (n >> 1); - k = (int32_t *)align_u32(tmp, rt1); - rt2 = align_fpr(tmp, k + n); - if (rt2 < (rt1 + n)) { - rt2 = rt1 + n; - } - t1 = (uint32_t *)k + n; - - /* - * Get f and g into rt3 and rt4 as floating-point approximations. - * - * We need to "scale down" the floating-point representation of - * coefficients when they are too big. We want to keep the value - * below 2^310 or so. Thus, when values are larger than 10 words, - * we consider only the top 10 words. 
Array lengths have been - * computed so that average maximum length will fall in the - * middle or the upper half of these top 10 words. - */ - rlen = (slen > 10) ? 10 : slen; - poly_big_to_fp(rt3, ft + slen - rlen, rlen, slen, logn); - poly_big_to_fp(rt4, gt + slen - rlen, rlen, slen, logn); - - /* - * Values in rt3 and rt4 are downscaled by 2^(scale_fg). - */ - scale_fg = 31 * (int)(slen - rlen); - - /* - * Estimated boundaries for the maximum size (in bits) of the - * coefficients of (f,g). We use the measured average, and - * allow for a deviation of at most six times the standard - * deviation. - */ - minbl_fg = BITLENGTH[depth].avg - 6 * BITLENGTH[depth].std; - maxbl_fg = BITLENGTH[depth].avg + 6 * BITLENGTH[depth].std; - - /* - * Compute 1/(f*adj(f)+g*adj(g)) in rt5. We also keep adj(f) - * and adj(g) in rt3 and rt4, respectively. - */ - Zf(FFT)(rt3, logn); - Zf(FFT)(rt4, logn); - Zf(poly_invnorm2_fft)(rt5, rt3, rt4, logn); - Zf(poly_adj_fft)(rt3, logn); - Zf(poly_adj_fft)(rt4, logn); - - /* - * Reduce F and G repeatedly. - * - * The expected maximum bit length of coefficients of F and G - * is kept in maxbl_FG, with the corresponding word length in - * FGlen. - */ - FGlen = llen; - maxbl_FG = 31 * (int)llen; - - /* - * Each reduction operation computes the reduction polynomial - * "k". We need that polynomial to have coefficients that fit - * on 32-bit signed integers, with some scaling; thus, we use - * a descending sequence of scaling values, down to zero. - * - * The size of the coefficients of k is (roughly) the difference - * between the size of the coefficients of (F,G) and the size - * of the coefficients of (f,g). Thus, the maximum size of the - * coefficients of k is, at the start, maxbl_FG - minbl_fg; - * this is our starting scale value for k. - * - * We need to estimate the size of (F,G) during the execution of - * the algorithm; we are allowed some overestimation but not too - * much (poly_big_to_fp() uses a 310-bit window). 
Generally - * speaking, after applying a reduction with k scaled to - * scale_k, the size of (F,G) will be size(f,g) + scale_k + dd, - * where 'dd' is a few bits to account for the fact that the - * reduction is never perfect (intuitively, dd is on the order - * of sqrt(N), so at most 5 bits; we here allow for 10 extra - * bits). - * - * The size of (f,g) is not known exactly, but maxbl_fg is an - * upper bound. - */ - scale_k = maxbl_FG - minbl_fg; - - for (;;) { - int scale_FG, dc, new_maxbl_FG; - uint32_t scl, sch; - fpr pdc, pt; - - /* - * Convert current F and G into floating-point. We apply - * scaling if the current length is more than 10 words. - */ - rlen = (FGlen > 10) ? 10 : FGlen; - scale_FG = 31 * (int)(FGlen - rlen); - poly_big_to_fp(rt1, Ft + FGlen - rlen, rlen, llen, logn); - poly_big_to_fp(rt2, Gt + FGlen - rlen, rlen, llen, logn); - - /* - * Compute (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g)) in rt2. - */ - Zf(FFT)(rt1, logn); - Zf(FFT)(rt2, logn); - Zf(poly_mul_fft)(rt1, rt3, logn); - Zf(poly_mul_fft)(rt2, rt4, logn); - Zf(poly_add)(rt2, rt1, logn); - Zf(poly_mul_autoadj_fft)(rt2, rt5, logn); - Zf(iFFT)(rt2, logn); - - /* - * (f,g) are scaled by 'scale_fg', meaning that the - * numbers in rt3/rt4 should be multiplied by 2^(scale_fg) - * to have their true mathematical value. - * - * (F,G) are similarly scaled by 'scale_FG'. Therefore, - * the value we computed in rt2 is scaled by - * 'scale_FG-scale_fg'. - * - * We want that value to be scaled by 'scale_k', hence we - * apply a corrective scaling. After scaling, the values - * should fit in -2^31-1..+2^31-1. - */ - dc = scale_k - scale_FG + scale_fg; - - /* - * We will need to multiply values by 2^(-dc). The value - * 'dc' is not secret, so we can compute 2^(-dc) with a - * non-constant-time process. - * (We could use ldexp(), but we prefer to avoid any - * dependency on libm. When using FP emulation, we could - * use our fpr_ldexp(), which is constant-time.) 
- */ - if (dc < 0) { - dc = -dc; - pt = fpr_two; - } else { - pt = fpr_onehalf; - } - pdc = fpr_one; - while (dc != 0) { - if ((dc & 1) != 0) { - pdc = fpr_mul(pdc, pt); - } - dc >>= 1; - pt = fpr_sqr(pt); - } - - for (u = 0; u < n; u ++) { - fpr xv; - - xv = fpr_mul(rt2[u], pdc); - - /* - * Sometimes the values can be out-of-bounds if - * the algorithm fails; we must not call - * fpr_rint() (and cast to int32_t) if the value - * is not in-bounds. Note that the test does not - * break constant-time discipline, since any - * failure here implies that we discard the current - * secret key (f,g). - */ - if (!fpr_lt(fpr_mtwo31m1, xv) - || !fpr_lt(xv, fpr_ptwo31m1)) - { - return 0; - } - k[u] = (int32_t)fpr_rint(xv); - } - - /* - * Values in k[] are integers. They really are scaled - * down by maxbl_FG - minbl_fg bits. - * - * If we are at low depth, then we use the NTT to - * compute k*f and k*g. - */ - sch = (uint32_t)(scale_k / 31); - scl = (uint32_t)(scale_k % 31); - if (depth <= DEPTH_INT_FG) { - poly_sub_scaled_ntt(Ft, FGlen, llen, ft, slen, slen, - k, sch, scl, logn, t1); - poly_sub_scaled_ntt(Gt, FGlen, llen, gt, slen, slen, - k, sch, scl, logn, t1); - } else { - poly_sub_scaled(Ft, FGlen, llen, ft, slen, slen, - k, sch, scl, logn); - poly_sub_scaled(Gt, FGlen, llen, gt, slen, slen, - k, sch, scl, logn); - } - - /* - * We compute the new maximum size of (F,G), assuming that - * (f,g) has _maximal_ length (i.e. that reduction is - * "late" instead of "early". We also adjust FGlen - * accordingly. - */ - new_maxbl_FG = scale_k + maxbl_fg + 10; - if (new_maxbl_FG < maxbl_FG) { - maxbl_FG = new_maxbl_FG; - if ((int)FGlen * 31 >= maxbl_FG + 31) { - FGlen --; - } - } - - /* - * We suppose that scaling down achieves a reduction by - * at least 25 bits per iteration. We stop when we have - * done the loop with an unscaled k. 
- */ - if (scale_k <= 0) { - break; - } - scale_k -= 25; - if (scale_k < 0) { - scale_k = 0; - } - } - - /* - * If (F,G) length was lowered below 'slen', then we must take - * care to re-extend the sign. - */ - if (FGlen < slen) { - for (u = 0; u < n; u ++, Ft += llen, Gt += llen) { - size_t v; - uint32_t sw; - - sw = -(Ft[FGlen - 1] >> 30) >> 1; - for (v = FGlen; v < slen; v ++) { - Ft[v] = sw; - } - sw = -(Gt[FGlen - 1] >> 30) >> 1; - for (v = FGlen; v < slen; v ++) { - Gt[v] = sw; - } - } - } - - /* - * Compress encoding of all values to 'slen' words (this is the - * expected output format). - */ - for (u = 0, x = tmp, y = tmp; - u < (n << 1); u ++, x += slen, y += llen) - { - memmove(x, y, slen * sizeof *y); - } - return 1; -} - -/* - * Solving the NTRU equation, binary case, depth = 1. Upon entry, the - * F and G from the previous level should be in the tmp[] array. - * - * Returned value: 1 on success, 0 on error. - */ -static int -solve_NTRU_binary_depth1(unsigned logn_top, - const int8_t *f, const int8_t *g, uint32_t *tmp) -{ - /* - * The first half of this function is a copy of the corresponding - * part in solve_NTRU_intermediate(), for the reconstruction of - * the unreduced F and G. The second half (Babai reduction) is - * done differently, because the unreduced F and G fit in 53 bits - * of precision, allowing a much simpler process with lower RAM - * usage. 
- */ - unsigned depth, logn; - size_t n_top, n, hn, slen, dlen, llen, u; - uint32_t *Fd, *Gd, *Ft, *Gt, *ft, *gt, *t1; - fpr *rt1, *rt2, *rt3, *rt4, *rt5, *rt6; - uint32_t *x, *y; - - depth = 1; - n_top = (size_t)1 << logn_top; - logn = logn_top - depth; - n = (size_t)1 << logn; - hn = n >> 1; - - /* - * Equations are: - * - * f' = f0^2 - X^2*f1^2 - * g' = g0^2 - X^2*g1^2 - * F' and G' are a solution to f'G' - g'F' = q (from deeper levels) - * F = F'*(g0 - X*g1) - * G = G'*(f0 - X*f1) - * - * f0, f1, g0, g1, f', g', F' and G' are all "compressed" to - * degree N/2 (their odd-indexed coefficients are all zero). - */ - - /* - * slen = size for our input f and g; also size of the reduced - * F and G we return (degree N) - * - * dlen = size of the F and G obtained from the deeper level - * (degree N/2) - * - * llen = size for intermediary F and G before reduction (degree N) - * - * We build our non-reduced F and G as two independent halves each, - * of degree N/2 (F = F0 + X*F1, G = G0 + X*G1). - */ - slen = MAX_BL_SMALL[depth]; - dlen = MAX_BL_SMALL[depth + 1]; - llen = MAX_BL_LARGE[depth]; - - /* - * Fd and Gd are the F and G from the deeper level. Ft and Gt - * are the destination arrays for the unreduced F and G. - */ - Fd = tmp; - Gd = Fd + dlen * hn; - Ft = Gd + dlen * hn; - Gt = Ft + llen * n; - - /* - * We reduce Fd and Gd modulo all the small primes we will need, - * and store the values in Ft and Gt. - */ - for (u = 0; u < llen; u ++) { - uint32_t p, p0i, R2, Rx; - size_t v; - uint32_t *xs, *ys, *xd, *yd; - - p = PRIMES[u].p; - p0i = modp_ninv31(p); - R2 = modp_R2(p, p0i); - Rx = modp_Rx((unsigned)dlen, p, p0i, R2); - for (v = 0, xs = Fd, ys = Gd, xd = Ft + u, yd = Gt + u; - v < hn; - v ++, xs += dlen, ys += dlen, xd += llen, yd += llen) - { - *xd = zint_mod_small_signed(xs, dlen, p, p0i, R2, Rx); - *yd = zint_mod_small_signed(ys, dlen, p, p0i, R2, Rx); - } - } - - /* - * Now Fd and Gd are not needed anymore; we can squeeze them out. 
- */ - memmove(tmp, Ft, llen * n * sizeof(uint32_t)); - Ft = tmp; - memmove(Ft + llen * n, Gt, llen * n * sizeof(uint32_t)); - Gt = Ft + llen * n; - ft = Gt + llen * n; - gt = ft + slen * n; - - t1 = gt + slen * n; - - /* - * Compute our F and G modulo sufficiently many small primes. - */ - for (u = 0; u < llen; u ++) { - uint32_t p, p0i, R2; - uint32_t *gm, *igm, *fx, *gx, *Fp, *Gp; - unsigned e; - size_t v; - - /* - * All computations are done modulo p. - */ - p = PRIMES[u].p; - p0i = modp_ninv31(p); - R2 = modp_R2(p, p0i); - - /* - * We recompute things from the source f and g, of full - * degree. However, we will need only the n first elements - * of the inverse NTT table (igm); the call to modp_mkgm() - * below will fill n_top elements in igm[] (thus overflowing - * into fx[]) but later code will overwrite these extra - * elements. - */ - gm = t1; - igm = gm + n_top; - fx = igm + n; - gx = fx + n_top; - modp_mkgm2(gm, igm, logn_top, PRIMES[u].g, p, p0i); - - /* - * Set ft and gt to f and g modulo p, respectively. - */ - for (v = 0; v < n_top; v ++) { - fx[v] = modp_set(f[v], p); - gx[v] = modp_set(g[v], p); - } - - /* - * Convert to NTT and compute our f and g. - */ - modp_NTT2(fx, gm, logn_top, p, p0i); - modp_NTT2(gx, gm, logn_top, p, p0i); - for (e = logn_top; e > logn; e --) { - modp_poly_rec_res(fx, e, p, p0i, R2); - modp_poly_rec_res(gx, e, p, p0i, R2); - } - - /* - * From that point onward, we only need tables for - * degree n, so we can save some space. - */ - if (depth > 0) { /* always true */ - memmove(gm + n, igm, n * sizeof *igm); - igm = gm + n; - memmove(igm + n, fx, n * sizeof *ft); - fx = igm + n; - memmove(fx + n, gx, n * sizeof *gt); - gx = fx + n; - } - - /* - * Get F' and G' modulo p and in NTT representation - * (they have degree n/2). These values were computed - * in a previous step, and stored in Ft and Gt. 
- */ - Fp = gx + n; - Gp = Fp + hn; - for (v = 0, x = Ft + u, y = Gt + u; - v < hn; v ++, x += llen, y += llen) - { - Fp[v] = *x; - Gp[v] = *y; - } - modp_NTT2(Fp, gm, logn - 1, p, p0i); - modp_NTT2(Gp, gm, logn - 1, p, p0i); - - /* - * Compute our F and G modulo p. - * - * Equations are: - * - * f'(x^2) = N(f)(x^2) = f * adj(f) - * g'(x^2) = N(g)(x^2) = g * adj(g) - * - * f'*G' - g'*F' = q - * - * F = F'(x^2) * adj(g) - * G = G'(x^2) * adj(f) - * - * The NTT representation of f is f(w) for all w which - * are roots of phi. In the binary case, as well as in - * the ternary case for all depth except the deepest, - * these roots can be grouped in pairs (w,-w), and we - * then have: - * - * f(w) = adj(f)(-w) - * f(-w) = adj(f)(w) - * - * and w^2 is then a root for phi at the half-degree. - * - * At the deepest level in the ternary case, this still - * holds, in the following sense: the roots of x^2-x+1 - * are (w,-w^2) (for w^3 = -1, and w != -1), and we - * have: - * - * f(w) = adj(f)(-w^2) - * f(-w^2) = adj(f)(w) - * - * In all case, we can thus compute F and G in NTT - * representation by a few simple multiplications. - * Moreover, the two roots for each pair are consecutive - * in our bit-reversal encoding. - */ - for (v = 0, x = Ft + u, y = Gt + u; - v < hn; v ++, x += (llen << 1), y += (llen << 1)) - { - uint32_t ftA, ftB, gtA, gtB; - uint32_t mFp, mGp; - - ftA = fx[(v << 1) + 0]; - ftB = fx[(v << 1) + 1]; - gtA = gx[(v << 1) + 0]; - gtB = gx[(v << 1) + 1]; - mFp = modp_montymul(Fp[v], R2, p, p0i); - mGp = modp_montymul(Gp[v], R2, p, p0i); - x[0] = modp_montymul(gtB, mFp, p, p0i); - x[llen] = modp_montymul(gtA, mFp, p, p0i); - y[0] = modp_montymul(ftB, mGp, p, p0i); - y[llen] = modp_montymul(ftA, mGp, p, p0i); - } - modp_iNTT2_ext(Ft + u, llen, igm, logn, p, p0i); - modp_iNTT2_ext(Gt + u, llen, igm, logn, p, p0i); - - /* - * Also save ft and gt (only up to size slen). 
- */ - if (u < slen) { - modp_iNTT2(fx, igm, logn, p, p0i); - modp_iNTT2(gx, igm, logn, p, p0i); - for (v = 0, x = ft + u, y = gt + u; - v < n; v ++, x += slen, y += slen) - { - *x = fx[v]; - *y = gx[v]; - } - } - } - - /* - * Rebuild f, g, F and G with the CRT. Note that the elements of F - * and G are consecutive, and thus can be rebuilt in a single - * loop; similarly, the elements of f and g are consecutive. - */ - zint_rebuild_CRT(Ft, llen, llen, n << 1, PRIMES, 1, t1); - zint_rebuild_CRT(ft, slen, slen, n << 1, PRIMES, 1, t1); - - /* - * Here starts the Babai reduction, specialized for depth = 1. - * - * Candidates F and G (from Ft and Gt), and base f and g (ft and gt), - * are converted to floating point. There is no scaling, and a - * single pass is sufficient. - */ - - /* - * Convert F and G into floating point (rt1 and rt2). - */ - rt1 = align_fpr(tmp, gt + slen * n); - rt2 = rt1 + n; - poly_big_to_fp(rt1, Ft, llen, llen, logn); - poly_big_to_fp(rt2, Gt, llen, llen, logn); - - /* - * Integer representation of F and G is no longer needed, we - * can remove it. - */ - memmove(tmp, ft, 2 * slen * n * sizeof *ft); - ft = tmp; - gt = ft + slen * n; - rt3 = align_fpr(tmp, gt + slen * n); - memmove(rt3, rt1, 2 * n * sizeof *rt1); - rt1 = rt3; - rt2 = rt1 + n; - rt3 = rt2 + n; - rt4 = rt3 + n; - - /* - * Convert f and g into floating point (rt3 and rt4). - */ - poly_big_to_fp(rt3, ft, slen, slen, logn); - poly_big_to_fp(rt4, gt, slen, slen, logn); - - /* - * Remove unneeded ft and gt. - */ - memmove(tmp, rt1, 4 * n * sizeof *rt1); - rt1 = (fpr *)tmp; - rt2 = rt1 + n; - rt3 = rt2 + n; - rt4 = rt3 + n; - - /* - * We now have: - * rt1 = F - * rt2 = G - * rt3 = f - * rt4 = g - * in that order in RAM. We convert all of them to FFT. - */ - Zf(FFT)(rt1, logn); - Zf(FFT)(rt2, logn); - Zf(FFT)(rt3, logn); - Zf(FFT)(rt4, logn); - - /* - * Compute: - * rt5 = F*adj(f) + G*adj(g) - * rt6 = 1 / (f*adj(f) + g*adj(g)) - * (Note that rt6 is half-length.) 
- */ - rt5 = rt4 + n; - rt6 = rt5 + n; - Zf(poly_add_muladj_fft)(rt5, rt1, rt2, rt3, rt4, logn); - Zf(poly_invnorm2_fft)(rt6, rt3, rt4, logn); - - /* - * Compute: - * rt5 = (F*adj(f)+G*adj(g)) / (f*adj(f)+g*adj(g)) - */ - Zf(poly_mul_autoadj_fft)(rt5, rt6, logn); - - /* - * Compute k as the rounded version of rt5. Check that none of - * the values is larger than 2^63-1 (in absolute value) - * because that would make the fpr_rint() do something undefined; - * note that any out-of-bounds value here implies a failure and - * (f,g) will be discarded, so we can make a simple test. - */ - Zf(iFFT)(rt5, logn); - for (u = 0; u < n; u ++) { - fpr z; - - z = rt5[u]; - if (!fpr_lt(z, fpr_ptwo63m1) || !fpr_lt(fpr_mtwo63m1, z)) { - return 0; - } - rt5[u] = fpr_of(fpr_rint(z)); - } - Zf(FFT)(rt5, logn); - - /* - * Subtract k*f from F, and k*g from G. - */ - Zf(poly_mul_fft)(rt3, rt5, logn); - Zf(poly_mul_fft)(rt4, rt5, logn); - Zf(poly_sub)(rt1, rt3, logn); - Zf(poly_sub)(rt2, rt4, logn); - Zf(iFFT)(rt1, logn); - Zf(iFFT)(rt2, logn); - - /* - * Convert back F and G to integers, and return. - */ - Ft = tmp; - Gt = Ft + n; - rt3 = align_fpr(tmp, Gt + n); - memmove(rt3, rt1, 2 * n * sizeof *rt1); - rt1 = rt3; - rt2 = rt1 + n; - for (u = 0; u < n; u ++) { - Ft[u] = (uint32_t)fpr_rint(rt1[u]); - Gt[u] = (uint32_t)fpr_rint(rt2[u]); - } - - return 1; -} - -/* - * Solving the NTRU equation, top level. Upon entry, the F and G - * from the previous level should be in the tmp[] array. - * - * Returned value: 1 on success, 0 on error. 
- */ -static int -solve_NTRU_binary_depth0(unsigned logn, - const int8_t *f, const int8_t *g, uint32_t *tmp) -{ - size_t n, hn, u; - uint32_t p, p0i, R2; - uint32_t *Fp, *Gp, *t1, *t2, *t3, *t4, *t5; - uint32_t *gm, *igm, *ft, *gt; - fpr *rt2, *rt3; - - n = (size_t)1 << logn; - hn = n >> 1; - - /* - * Equations are: - * - * f' = f0^2 - X^2*f1^2 - * g' = g0^2 - X^2*g1^2 - * F' and G' are a solution to f'G' - g'F' = q (from deeper levels) - * F = F'*(g0 - X*g1) - * G = G'*(f0 - X*f1) - * - * f0, f1, g0, g1, f', g', F' and G' are all "compressed" to - * degree N/2 (their odd-indexed coefficients are all zero). - * - * Everything should fit in 31-bit integers, hence we can just use - * the first small prime p = 2147473409. - */ - p = PRIMES[0].p; - p0i = modp_ninv31(p); - R2 = modp_R2(p, p0i); - - Fp = tmp; - Gp = Fp + hn; - ft = Gp + hn; - gt = ft + n; - gm = gt + n; - igm = gm + n; - - modp_mkgm2(gm, igm, logn, PRIMES[0].g, p, p0i); - - /* - * Convert F' anf G' in NTT representation. - */ - for (u = 0; u < hn; u ++) { - Fp[u] = modp_set(zint_one_to_plain(Fp + u), p); - Gp[u] = modp_set(zint_one_to_plain(Gp + u), p); - } - modp_NTT2(Fp, gm, logn - 1, p, p0i); - modp_NTT2(Gp, gm, logn - 1, p, p0i); - - /* - * Load f and g and convert them to NTT representation. - */ - for (u = 0; u < n; u ++) { - ft[u] = modp_set(f[u], p); - gt[u] = modp_set(g[u], p); - } - modp_NTT2(ft, gm, logn, p, p0i); - modp_NTT2(gt, gm, logn, p, p0i); - - /* - * Build the unreduced F,G in ft and gt. 
- */ - for (u = 0; u < n; u += 2) { - uint32_t ftA, ftB, gtA, gtB; - uint32_t mFp, mGp; - - ftA = ft[u + 0]; - ftB = ft[u + 1]; - gtA = gt[u + 0]; - gtB = gt[u + 1]; - mFp = modp_montymul(Fp[u >> 1], R2, p, p0i); - mGp = modp_montymul(Gp[u >> 1], R2, p, p0i); - ft[u + 0] = modp_montymul(gtB, mFp, p, p0i); - ft[u + 1] = modp_montymul(gtA, mFp, p, p0i); - gt[u + 0] = modp_montymul(ftB, mGp, p, p0i); - gt[u + 1] = modp_montymul(ftA, mGp, p, p0i); - } - modp_iNTT2(ft, igm, logn, p, p0i); - modp_iNTT2(gt, igm, logn, p, p0i); - - Gp = Fp + n; - t1 = Gp + n; - memmove(Fp, ft, 2 * n * sizeof *ft); - - /* - * We now need to apply the Babai reduction. At that point, - * we have F and G in two n-word arrays. - * - * We can compute F*adj(f)+G*adj(g) and f*adj(f)+g*adj(g) - * modulo p, using the NTT. We still move memory around in - * order to save RAM. - */ - t2 = t1 + n; - t3 = t2 + n; - t4 = t3 + n; - t5 = t4 + n; - - /* - * Compute the NTT tables in t1 and t2. We do not keep t2 - * (we'll recompute it later on). - */ - modp_mkgm2(t1, t2, logn, PRIMES[0].g, p, p0i); - - /* - * Convert F and G to NTT. - */ - modp_NTT2(Fp, t1, logn, p, p0i); - modp_NTT2(Gp, t1, logn, p, p0i); - - /* - * Load f and adj(f) in t4 and t5, and convert them to NTT - * representation. - */ - t4[0] = t5[0] = modp_set(f[0], p); - for (u = 1; u < n; u ++) { - t4[u] = modp_set(f[u], p); - t5[n - u] = modp_set(-f[u], p); - } - modp_NTT2(t4, t1, logn, p, p0i); - modp_NTT2(t5, t1, logn, p, p0i); - - /* - * Compute F*adj(f) in t2, and f*adj(f) in t3. - */ - for (u = 0; u < n; u ++) { - uint32_t w; - - w = modp_montymul(t5[u], R2, p, p0i); - t2[u] = modp_montymul(w, Fp[u], p, p0i); - t3[u] = modp_montymul(w, t4[u], p, p0i); - } - - /* - * Load g and adj(g) in t4 and t5, and convert them to NTT - * representation. 
- */ - t4[0] = t5[0] = modp_set(g[0], p); - for (u = 1; u < n; u ++) { - t4[u] = modp_set(g[u], p); - t5[n - u] = modp_set(-g[u], p); - } - modp_NTT2(t4, t1, logn, p, p0i); - modp_NTT2(t5, t1, logn, p, p0i); - - /* - * Add G*adj(g) to t2, and g*adj(g) to t3. - */ - for (u = 0; u < n; u ++) { - uint32_t w; - - w = modp_montymul(t5[u], R2, p, p0i); - t2[u] = modp_add(t2[u], - modp_montymul(w, Gp[u], p, p0i), p); - t3[u] = modp_add(t3[u], - modp_montymul(w, t4[u], p, p0i), p); - } - - /* - * Convert back t2 and t3 to normal representation (normalized - * around 0), and then - * move them to t1 and t2. We first need to recompute the - * inverse table for NTT. - */ - modp_mkgm2(t1, t4, logn, PRIMES[0].g, p, p0i); - modp_iNTT2(t2, t4, logn, p, p0i); - modp_iNTT2(t3, t4, logn, p, p0i); - for (u = 0; u < n; u ++) { - t1[u] = (uint32_t)modp_norm(t2[u], p); - t2[u] = (uint32_t)modp_norm(t3[u], p); - } - - /* - * At that point, array contents are: - * - * F (NTT representation) (Fp) - * G (NTT representation) (Gp) - * F*adj(f)+G*adj(g) (t1) - * f*adj(f)+g*adj(g) (t2) - * - * We want to divide t1 by t2. The result is not integral; it - * must be rounded. We thus need to use the FFT. - */ - - /* - * Get f*adj(f)+g*adj(g) in FFT representation. Since this - * polynomial is auto-adjoint, all its coordinates in FFT - * representation are actually real, so we can truncate off - * the imaginary parts. - */ - rt3 = align_fpr(tmp, t3); - for (u = 0; u < n; u ++) { - rt3[u] = fpr_of(((int32_t *)t2)[u]); - } - Zf(FFT)(rt3, logn); - rt2 = align_fpr(tmp, t2); - memmove(rt2, rt3, hn * sizeof *rt3); - - /* - * Convert F*adj(f)+G*adj(g) in FFT representation. - */ - rt3 = rt2 + hn; - for (u = 0; u < n; u ++) { - rt3[u] = fpr_of(((int32_t *)t1)[u]); - } - Zf(FFT)(rt3, logn); - - /* - * Compute (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g)) and get - * its rounded normal representation in t1. 
- */ - Zf(poly_div_autoadj_fft)(rt3, rt2, logn); - Zf(iFFT)(rt3, logn); - for (u = 0; u < n; u ++) { - t1[u] = modp_set((int32_t)fpr_rint(rt3[u]), p); - } - - /* - * RAM contents are now: - * - * F (NTT representation) (Fp) - * G (NTT representation) (Gp) - * k (t1) - * - * We want to compute F-k*f, and G-k*g. - */ - t2 = t1 + n; - t3 = t2 + n; - t4 = t3 + n; - t5 = t4 + n; - modp_mkgm2(t2, t3, logn, PRIMES[0].g, p, p0i); - for (u = 0; u < n; u ++) { - t4[u] = modp_set(f[u], p); - t5[u] = modp_set(g[u], p); - } - modp_NTT2(t1, t2, logn, p, p0i); - modp_NTT2(t4, t2, logn, p, p0i); - modp_NTT2(t5, t2, logn, p, p0i); - for (u = 0; u < n; u ++) { - uint32_t kw; - - kw = modp_montymul(t1[u], R2, p, p0i); - Fp[u] = modp_sub(Fp[u], - modp_montymul(kw, t4[u], p, p0i), p); - Gp[u] = modp_sub(Gp[u], - modp_montymul(kw, t5[u], p, p0i), p); - } - modp_iNTT2(Fp, t3, logn, p, p0i); - modp_iNTT2(Gp, t3, logn, p, p0i); - for (u = 0; u < n; u ++) { - Fp[u] = (uint32_t)modp_norm(Fp[u], p); - Gp[u] = (uint32_t)modp_norm(Gp[u], p); - } - - return 1; -} - -/* - * Solve the NTRU equation. Returned value is 1 on success, 0 on error. - * G can be NULL, in which case that value is computed but not returned. - * If any of the coefficients of F and G exceeds lim (in absolute value), - * then 0 is returned. - */ -static int -solve_NTRU(unsigned logn, int8_t *F, int8_t *G, - const int8_t *f, const int8_t *g, int lim, uint32_t *tmp) -{ - size_t n, u; - uint32_t *ft, *gt, *Ft, *Gt, *gm; - uint32_t p, p0i, r; - const small_prime *primes; - - n = MKN(logn); - - if (!solve_NTRU_deepest(logn, f, g, tmp)) { - return 0; - } - - /* - * For logn <= 2, we need to use solve_NTRU_intermediate() - * directly, because coefficients are a bit too large and - * do not fit the hypotheses in solve_NTRU_binary_depth0(). 
- */ - if (logn <= 2) { - unsigned depth; - - depth = logn; - while (depth -- > 0) { - if (!solve_NTRU_intermediate(logn, f, g, depth, tmp)) { - return 0; - } - } - } else { - unsigned depth; - - depth = logn; - while (depth -- > 2) { - if (!solve_NTRU_intermediate(logn, f, g, depth, tmp)) { - return 0; - } - } - if (!solve_NTRU_binary_depth1(logn, f, g, tmp)) { - return 0; - } - if (!solve_NTRU_binary_depth0(logn, f, g, tmp)) { - return 0; - } - } - - /* - * If no buffer has been provided for G, use a temporary one. - */ - if (G == NULL) { - G = (int8_t *)(tmp + 2 * n); - } - - /* - * Final F and G are in fk->tmp, one word per coefficient - * (signed value over 31 bits). - */ - if (!poly_big_to_small(F, tmp, lim, logn) - || !poly_big_to_small(G, tmp + n, lim, logn)) - { - return 0; - } - - /* - * Verify that the NTRU equation is fulfilled. Since all elements - * have short lengths, verifying modulo a small prime p works, and - * allows using the NTT. - * - * We put Gt[] first in tmp[], and process it first, so that it does - * not overlap with G[] in case we allocated it ourselves. - */ - Gt = tmp; - ft = Gt + n; - gt = ft + n; - Ft = gt + n; - gm = Ft + n; - - primes = PRIMES; - p = primes[0].p; - p0i = modp_ninv31(p); - modp_mkgm2(gm, tmp, logn, primes[0].g, p, p0i); - for (u = 0; u < n; u ++) { - Gt[u] = modp_set(G[u], p); - } - for (u = 0; u < n; u ++) { - ft[u] = modp_set(f[u], p); - gt[u] = modp_set(g[u], p); - Ft[u] = modp_set(F[u], p); - } - modp_NTT2(ft, gm, logn, p, p0i); - modp_NTT2(gt, gm, logn, p, p0i); - modp_NTT2(Ft, gm, logn, p, p0i); - modp_NTT2(Gt, gm, logn, p, p0i); - r = modp_montymul(12289, 1, p, p0i); - for (u = 0; u < n; u ++) { - uint32_t z; - - z = modp_sub(modp_montymul(ft[u], Gt[u], p, p0i), - modp_montymul(gt[u], Ft[u], p, p0i), p); - if (z != r) { - return 0; - } - } - - return 1; -} - -/* - * Generate a random polynomial with a Gaussian distribution. 
This function - * also makes sure that the resultant of the polynomial with phi is odd. - */ -static void -poly_small_mkgauss(RNG_CONTEXT *rng, int8_t *f, unsigned logn) -{ - size_t n, u; - unsigned mod2; - - n = MKN(logn); - mod2 = 0; - for (u = 0; u < n; u ++) { - int s; - - restart: - s = mkgauss(rng, logn); - - /* - * We need the coefficient to fit within -127..+127; - * realistically, this is always the case except for - * the very low degrees (N = 2 or 4), for which there - * is no real security anyway. - */ - if (s < -127 || s > 127) { - goto restart; - } - - /* - * We need the sum of all coefficients to be 1; otherwise, - * the resultant of the polynomial with X^N+1 will be even, - * and the binary GCD will fail. - */ - if (u == n - 1) { - if ((mod2 ^ (unsigned)(s & 1)) == 0) { - goto restart; - } - } else { - mod2 ^= (unsigned)(s & 1); - } - f[u] = (int8_t)s; - } -} - -/* see falcon.h */ -void -Zf(keygen)(inner_shake256_context *rng, - int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h, - unsigned logn, uint8_t *tmp) -{ - /* - * Algorithm is the following: - * - * - Generate f and g with the Gaussian distribution. - * - * - If either Res(f,phi) or Res(g,phi) is even, try again. - * - * - If ||(f,g)|| is too large, try again. - * - * - If ||B~_{f,g}|| is too large, try again. - * - * - If f is not invertible mod phi mod q, try again. - * - * - Compute h = g/f mod phi mod q. - * - * - Solve the NTRU equation fG - gF = q; if the solving fails, - * try again. Usual failure condition is when Res(f,phi) - * and Res(g,phi) are not prime to each other. 
- */ - size_t n, u; - uint16_t *h2, *tmp2; - RNG_CONTEXT *rc; -#if FALCON_KG_CHACHA20 // yyyKG_CHACHA20+1 - prng p; -#endif // yyyKG_CHACHA20- - - n = MKN(logn); -#if FALCON_KG_CHACHA20 // yyyKG_CHACHA20+1 - Zf(prng_init)(&p, rng); - rc = &p; -#else // yyyKG_CHACHA20+0 - rc = rng; -#endif // yyyKG_CHACHA20- - - /* - * We need to generate f and g randomly, until we find values - * such that the norm of (g,-f), and of the orthogonalized - * vector, are satisfying. The orthogonalized vector is: - * (q*adj(f)/(f*adj(f)+g*adj(g)), q*adj(g)/(f*adj(f)+g*adj(g))) - * (it is actually the (N+1)-th row of the Gram-Schmidt basis). - * - * In the binary case, coefficients of f and g are generated - * independently of each other, with a discrete Gaussian - * distribution of standard deviation 1.17*sqrt(q/(2*N)). Then, - * the two vectors have expected norm 1.17*sqrt(q), which is - * also our acceptance bound: we require both vectors to be no - * larger than that (this will be satisfied about 1/4th of the - * time, thus we expect sampling new (f,g) about 4 times for that - * step). - * - * We require that Res(f,phi) and Res(g,phi) are both odd (the - * NTRU equation solver requires it). - */ - for (;;) { - fpr *rt1, *rt2, *rt3; - fpr bnorm; - uint32_t normf, normg, norm; - int lim; - - /* - * The poly_small_mkgauss() function makes sure - * that the sum of coefficients is 1 modulo 2 - * (i.e. the resultant of the polynomial with phi - * will be odd). - */ - poly_small_mkgauss(rc, f, logn); - poly_small_mkgauss(rc, g, logn); - - /* - * Verify that all coefficients are within the bounds - * defined in max_fg_bits. This is the case with - * overwhelming probability; this guarantees that the - * key will be encodable with FALCON_COMP_TRIM. - */ - lim = 1 << (Zf(max_fg_bits)[logn] - 1); - for (u = 0; u < n; u ++) { - /* - * We can use non-CT tests since on any failure - * we will discard f and g. 
- */ - if (f[u] >= lim || f[u] <= -lim - || g[u] >= lim || g[u] <= -lim) - { - lim = -1; - break; - } - } - if (lim < 0) { - continue; - } - - /* - * Bound is 1.17*sqrt(q). We compute the squared - * norms. With q = 12289, the squared bound is: - * (1.17^2)* 12289 = 16822.4121 - * Since f and g are integral, the squared norm - * of (g,-f) is an integer. - */ - normf = poly_small_sqnorm(f, logn); - normg = poly_small_sqnorm(g, logn); - norm = (normf + normg) | -((normf | normg) >> 31); - if (norm >= 16823) { - continue; - } - - /* - * We compute the orthogonalized vector norm. - */ - rt1 = (fpr *)tmp; - rt2 = rt1 + n; - rt3 = rt2 + n; - poly_small_to_fp(rt1, f, logn); - poly_small_to_fp(rt2, g, logn); - Zf(FFT)(rt1, logn); - Zf(FFT)(rt2, logn); - Zf(poly_invnorm2_fft)(rt3, rt1, rt2, logn); - Zf(poly_adj_fft)(rt1, logn); - Zf(poly_adj_fft)(rt2, logn); - Zf(poly_mulconst)(rt1, fpr_q, logn); - Zf(poly_mulconst)(rt2, fpr_q, logn); - Zf(poly_mul_autoadj_fft)(rt1, rt3, logn); - Zf(poly_mul_autoadj_fft)(rt2, rt3, logn); - Zf(iFFT)(rt1, logn); - Zf(iFFT)(rt2, logn); - bnorm = fpr_zero; - for (u = 0; u < n; u ++) { - bnorm = fpr_add(bnorm, fpr_sqr(rt1[u])); - bnorm = fpr_add(bnorm, fpr_sqr(rt2[u])); - } - if (!fpr_lt(bnorm, fpr_bnorm_max)) { - continue; - } - - /* - * Compute public key h = g/f mod X^N+1 mod q. If this - * fails, we must restart. - */ - if (h == NULL) { - h2 = (uint16_t *)tmp; - tmp2 = h2 + n; - } else { - h2 = h; - tmp2 = (uint16_t *)tmp; - } - if (!Zf(compute_public)(h2, f, g, logn, (uint8_t *)tmp2)) { - continue; - } - - /* - * Solve the NTRU equation to get F and G. - */ - lim = (1 << (Zf(max_FG_bits)[logn] - 1)) - 1; - if (!solve_NTRU(logn, F, G, f, g, lim, (uint32_t *)tmp)) { - continue; - } - - /* - * Key pair is generated. 
- */ - break; - } -} diff --git a/crypto_sign/falcon-1024/m4-ct/pqm4.c b/crypto_sign/falcon-1024/m4-ct/pqm4.c deleted file mode 100644 index 74b83a8b..00000000 --- a/crypto_sign/falcon-1024/m4-ct/pqm4.c +++ /dev/null @@ -1,347 +0,0 @@ -#include -#include - -#include "api.h" -#include "inner.h" -#include "randombytes.h" - -/* ==================================================================== */ - -/* - * Falcon degree is N = 2^LOGN, where LOGN=9 (for Falcon-512) or 10 - * (for Falcon-1024). We use the advertised public key size to know - * which degree is used. - */ -#if CRYPTO_PUBLICKEYBYTES == 897 -#define LOGN 9 -#elif CRYPTO_PUBLICKEYBYTES == 1793 -#define LOGN 10 -#else -#error Unknown Falcon degree (unexpected public key size) -#endif - -#define N ((size_t)1 << LOGN) -#define NONCELEN 40 -#define SEEDLEN 48 - -/* - * If the private key length is larger than 10000, then this is the - * variant with precomputed expanded keys. - */ -#if CRYPTO_SECRETKEYBYTES > 10000 -#define KG_EXPAND 1 -#else -#define KG_EXPAND 0 -#endif - -/* - * Common buffer, to avoid bulky stack allocation. The buffer sizes are - * all expressed in bytes, but the buffer must be suitably aligned for - * 64-bit integers and floating-point values. 
- * - * Required size (in bytes): - * - * With expanded key: - * keygen: 48*N + 6*N = 54*N - * sign: 48*N + 2*N = 50*N - * vrfy: 8*N - * - * Without expanded key: - * keygen: 28*N + 5*N = 33*N - * sign: 72*N + 6*N = 78*N - * vrfy: 8*N - */ -static union { -#if KG_EXPAND - uint8_t b[54 * N]; -#else - uint8_t b[78 * N]; -#endif - uint64_t dummy_u64; - fpr dummy_fp; -} tmp; - -int -crypto_sign_keypair(unsigned char *pk, unsigned char *sk) -{ - int8_t *f, *g, *F, *G; - uint16_t *h; - inner_shake256_context rng; - unsigned char seed[SEEDLEN]; -#if KG_EXPAND - size_t v; -#else - size_t u, v; -#endif - unsigned sav_cw; - -#if KG_EXPAND - f = (int8_t *)&tmp.b[48 * N]; - g = f + N; - F = g + N; - G = F + N; - h = (uint16_t *)(G + N); -#else - f = (int8_t *)&tmp.b[28 * N]; - g = f + N; - F = g + N; - G = NULL; - h = (uint16_t *)(F + N); -#endif - - randombytes(seed, SEEDLEN); - inner_shake256_init(&rng); - inner_shake256_inject(&rng, seed, SEEDLEN); - inner_shake256_flip(&rng); - sav_cw = set_fpu_cw(2); - Zf(keygen)(&rng, f, g, F, G, h, LOGN, tmp.b); - -#if KG_EXPAND - /* - * Expand private key. - */ - Zf(expand_privkey)((fpr *)sk, f, g, F, G, LOGN, tmp.b); - set_fpu_cw(sav_cw); -#else - set_fpu_cw(sav_cw); - - /* - * Encode private key. - */ - sk[0] = 0x50 + LOGN; - u = 1; - v = Zf(trim_i8_encode)(sk + u, CRYPTO_SECRETKEYBYTES - u, - f, LOGN, Zf(max_fg_bits)[LOGN]); - if (v == 0) { - return -1; - } - u += v; - v = Zf(trim_i8_encode)(sk + u, CRYPTO_SECRETKEYBYTES - u, - g, LOGN, Zf(max_fg_bits)[LOGN]); - if (v == 0) { - return -1; - } - u += v; - v = Zf(trim_i8_encode)(sk + u, CRYPTO_SECRETKEYBYTES - u, - F, LOGN, Zf(max_FG_bits)[LOGN]); - if (v == 0) { - return -1; - } - u += v; - if (u != CRYPTO_SECRETKEYBYTES) { - return -1; - } -#endif - - /* - * Encode public key. 
- */ - pk[0] = 0x00 + LOGN; - v = Zf(modq_encode)(pk + 1, CRYPTO_PUBLICKEYBYTES - 1, h, LOGN); - if (v != CRYPTO_PUBLICKEYBYTES - 1) { - return -1; - } - - return 0; -} - -int -crypto_sign(unsigned char *sm, size_t *smlen, - const unsigned char *m, size_t mlen, - const unsigned char *sk) -{ -#if KG_EXPAND - const fpr *expanded_key; -#else - int8_t *f, *g, *F, *G; - size_t u, v; -#endif - int16_t *sig; - uint16_t *hm; - unsigned char seed[SEEDLEN], nonce[NONCELEN]; - unsigned char *esig; - inner_shake256_context sc; - size_t sig_len; - unsigned sav_cw; - -#if KG_EXPAND - sig = (int16_t *)&tmp.b[48 * N]; -#else - f = (int8_t *)&tmp.b[72 * N]; - g = f + N; - F = g + N; - G = F + N; - sig = (int16_t *)(G + N); -#endif - hm = (uint16_t *)sig; /* hm[] is shared with sig[] */ - esig = (unsigned char *)tmp.b; - -#if KG_EXPAND - /* - * Expanded key is provided "as is". - */ - expanded_key = (const fpr *)sk; -#else - /* - * Decode the private key. - */ - if (sk[0] != 0x50 + LOGN) { - return -1; - } - u = 1; - v = Zf(trim_i8_decode)(f, LOGN, Zf(max_fg_bits)[LOGN], - sk + u, CRYPTO_SECRETKEYBYTES - u); - if (v == 0) { - return -1; - } - u += v; - v = Zf(trim_i8_decode)(g, LOGN, Zf(max_fg_bits)[LOGN], - sk + u, CRYPTO_SECRETKEYBYTES - u); - if (v == 0) { - return -1; - } - u += v; - v = Zf(trim_i8_decode)(F, LOGN, Zf(max_FG_bits)[LOGN], - sk + u, CRYPTO_SECRETKEYBYTES - u); - if (v == 0) { - return -1; - } - u += v; - if (u != CRYPTO_SECRETKEYBYTES) { - return -1; - } - if (!Zf(complete_private)(G, f, g, F, LOGN, tmp.b)) { - return -1; - } -#endif - - /* - * Create a random nonce (40 bytes). - */ - randombytes(nonce, NONCELEN); - - /* - * Hash message nonce + message into a vector. - */ - inner_shake256_init(&sc); - inner_shake256_inject(&sc, nonce, NONCELEN); - inner_shake256_inject(&sc, m, mlen); - inner_shake256_flip(&sc); - Zf(hash_to_point_vartime)(&sc, hm, LOGN); - - /* - * Initialize a RNG. 
- */ - randombytes(seed, SEEDLEN); - inner_shake256_init(&sc); - inner_shake256_inject(&sc, seed, SEEDLEN); - inner_shake256_flip(&sc); - - /* - * Compute the signature. - */ - sav_cw = set_fpu_cw(2); -#if KG_EXPAND - Zf(sign_tree)(sig, &sc, expanded_key, hm, LOGN, tmp.b); -#else - Zf(sign_dyn)(sig, &sc, f, g, F, G, hm, LOGN, tmp.b); -#endif - set_fpu_cw(sav_cw); - - /* - * Encode the signature and bundle it with the message. Format is: - * signature length 2 bytes, big-endian - * nonce 40 bytes - * message mlen bytes - * signature slen bytes - */ - esig[0] = 0x20 + LOGN; - sig_len = Zf(comp_encode)(esig + 1, CRYPTO_BYTES - 1, sig, LOGN); - if (sig_len == 0) { - return -1; - } - sig_len ++; - memmove(sm + 2 + NONCELEN, m, mlen); - sm[0] = (unsigned char)(sig_len >> 8); - sm[1] = (unsigned char)sig_len; - memcpy(sm + 2, nonce, NONCELEN); - memcpy(sm + 2 + NONCELEN + mlen, esig, sig_len); - *smlen = 2 + NONCELEN + mlen + sig_len; - return 0; -} - -int -crypto_sign_open(unsigned char *m, size_t *mlen, - const unsigned char *sm, size_t smlen, - const unsigned char *pk) -{ - uint16_t *h, *hm; - int16_t *sig; - const unsigned char *esig; - inner_shake256_context sc; - size_t sig_len, msg_len; - - h = (uint16_t *)&tmp.b[2 * N]; - hm = h + N; - sig = (int16_t *)(hm + N); - - /* - * Decode public key. - */ - if (pk[0] != 0x00 + LOGN) { - return -1; - } - if (Zf(modq_decode)(h, LOGN, pk + 1, CRYPTO_PUBLICKEYBYTES - 1) - != CRYPTO_PUBLICKEYBYTES - 1) - { - return -1; - } - Zf(to_ntt_monty)(h, LOGN); - - /* - * Find nonce, signature, message length. - */ - if (smlen < 2 + NONCELEN) { - return -1; - } - sig_len = ((size_t)sm[0] << 8) | (size_t)sm[1]; - if (sig_len > (smlen - 2 - NONCELEN)) { - return -1; - } - msg_len = smlen - 2 - NONCELEN - sig_len; - - /* - * Decode signature. 
- */ - esig = sm + 2 + NONCELEN + msg_len; - if (sig_len < 1 || esig[0] != 0x20 + LOGN) { - return -1; - } - if (Zf(comp_decode)(sig, LOGN, - esig + 1, sig_len - 1) != sig_len - 1) - { - return -1; - } - - /* - * Hash nonce + message into a vector. - */ - inner_shake256_init(&sc); - inner_shake256_inject(&sc, sm + 2, NONCELEN + msg_len); - inner_shake256_flip(&sc); - Zf(hash_to_point_vartime)(&sc, hm, LOGN); - - /* - * Verify signature. - */ - if (!Zf(verify_raw)(hm, sig, h, LOGN, tmp.b)) { - return -1; - } - - /* - * Return plaintext. - */ - memmove(m, sm + 2 + NONCELEN, msg_len); - *mlen = msg_len; - return 0; -} diff --git a/crypto_sign/falcon-1024/m4-ct/rng.c b/crypto_sign/falcon-1024/m4-ct/rng.c deleted file mode 100644 index d2ecb7af..00000000 --- a/crypto_sign/falcon-1024/m4-ct/rng.c +++ /dev/null @@ -1,379 +0,0 @@ -/* - * PRNG and interface to the system RNG. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include - -#include "inner.h" - -// yyyNIST+0 yyyPQCLEAN+0 -/* - * Include relevant system header files. For Win32, this will also need - * linking with advapi32.dll, which we trigger with an appropriate #pragma. - */ -#if FALCON_RAND_GETENTROPY -#include -#endif -#if FALCON_RAND_URANDOM -#include -#if !FALCON_RAND_GETENTROPY -#include -#endif -#include -#include -#endif -#if FALCON_RAND_WIN32 -#include -#include -#pragma comment(lib, "advapi32") -#endif - -/* see inner.h */ -int -Zf(get_seed)(void *seed, size_t len) -{ - (void)seed; - if (len == 0) { - return 1; - } -#if FALCON_RAND_GETENTROPY - if (getentropy(seed, len) == 0) { - return 1; - } -#endif -#if FALCON_RAND_URANDOM - { - int f; - - f = open("/dev/urandom", O_RDONLY); - if (f >= 0) { - while (len > 0) { - ssize_t rlen; - - rlen = read(f, seed, len); - if (rlen < 0) { - if (errno == EINTR) { - continue; - } - break; - } - seed = (uint8_t *)seed + rlen; - len -= (size_t)rlen; - } - close(f); - if (len == 0) { - return 1; - } - } - } -#endif -#if FALCON_RAND_WIN32 - { - HCRYPTPROV hp; - - if (CryptAcquireContext(&hp, 0, 0, PROV_RSA_FULL, - CRYPT_VERIFYCONTEXT | CRYPT_SILENT)) - { - BOOL r; - - r = CryptGenRandom(hp, (DWORD)len, seed); - CryptReleaseContext(hp, 0); - if (r) { - return 1; - } - } - } -#endif - return 0; -} -// yyyNIST- yyyPQCLEAN- - -/* see inner.h */ -void -Zf(prng_init)(prng *p, inner_shake256_context *src) -{ -#if FALCON_LE // yyyLE+1 - inner_shake256_extract(src, p->state.d, 56); -#else // yyyLE+0 - /* - * To ensure reproducibility for a given seed, we - * must enforce little-endian interpretation of - 
* the state words. - */ - uint8_t tmp[56]; - uint64_t th, tl; - int i; - - inner_shake256_extract(src, tmp, 56); - for (i = 0; i < 14; i ++) { - uint32_t w; - - w = (uint32_t)tmp[(i << 2) + 0] - | ((uint32_t)tmp[(i << 2) + 1] << 8) - | ((uint32_t)tmp[(i << 2) + 2] << 16) - | ((uint32_t)tmp[(i << 2) + 3] << 24); - *(uint32_t *)(p->state.d + (i << 2)) = w; - } - tl = *(uint32_t *)(p->state.d + 48); - th = *(uint32_t *)(p->state.d + 52); - *(uint64_t *)(p->state.d + 48) = tl + (th << 32); -#endif // yyyLE- - Zf(prng_refill)(p); -} - -/* - * PRNG based on ChaCha20. - * - * State consists in key (32 bytes) then IV (16 bytes) and block counter - * (8 bytes). Normally, we should not care about local endianness (this - * is for a PRNG), but for the NIST competition we need reproducible KAT - * vectors that work across architectures, so we enforce little-endian - * interpretation where applicable. Moreover, output words are "spread - * out" over the output buffer with the interleaving pattern that is - * naturally obtained from the AVX2 implementation that runs eight - * ChaCha20 instances in parallel. - * - * The block counter is XORed into the first 8 bytes of the IV. - */ -TARGET_AVX2 -void -Zf(prng_refill)(prng *p) -{ -#if FALCON_AVX2 // yyyAVX2+1 - - static const uint32_t CW[] = { - 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 - }; - - uint64_t cc; - size_t u; - int i; - uint32_t *sw; - union { - uint32_t w[16]; - __m256i y[2]; /* for alignment */ - } t; - __m256i state[16], init[16]; - - sw = (uint32_t *)p->state.d; - - /* - * XOR next counter values into state. - */ - cc = *(uint64_t *)(p->state.d + 48); - for (u = 0; u < 8; u ++) { - t.w[u] = (uint32_t)(cc + u); - t.w[u + 8] = (uint32_t)((cc + u) >> 32); - } - *(uint64_t *)(p->state.d + 48) = cc + 8; - - /* - * Load state. 
- */ - for (u = 0; u < 4; u ++) { - state[u] = init[u] = - _mm256_broadcastd_epi32(_mm_cvtsi32_si128(CW[u])); - } - for (u = 0; u < 10; u ++) { - state[u + 4] = init[u + 4] = - _mm256_broadcastd_epi32(_mm_cvtsi32_si128(sw[u])); - } - state[14] = init[14] = _mm256_xor_si256( - _mm256_broadcastd_epi32(_mm_cvtsi32_si128(sw[10])), - _mm256_loadu_si256((__m256i *)&t.w[0])); - state[15] = init[15] = _mm256_xor_si256( - _mm256_broadcastd_epi32(_mm_cvtsi32_si128(sw[11])), - _mm256_loadu_si256((__m256i *)&t.w[8])); - - /* - * Do all rounds. - */ - for (i = 0; i < 10; i ++) { - -#define QROUND(a, b, c, d) do { \ - state[a] = _mm256_add_epi32(state[a], state[b]); \ - state[d] = _mm256_xor_si256(state[d], state[a]); \ - state[d] = _mm256_or_si256( \ - _mm256_slli_epi32(state[d], 16), \ - _mm256_srli_epi32(state[d], 16)); \ - state[c] = _mm256_add_epi32(state[c], state[d]); \ - state[b] = _mm256_xor_si256(state[b], state[c]); \ - state[b] = _mm256_or_si256( \ - _mm256_slli_epi32(state[b], 12), \ - _mm256_srli_epi32(state[b], 20)); \ - state[a] = _mm256_add_epi32(state[a], state[b]); \ - state[d] = _mm256_xor_si256(state[d], state[a]); \ - state[d] = _mm256_or_si256( \ - _mm256_slli_epi32(state[d], 8), \ - _mm256_srli_epi32(state[d], 24)); \ - state[c] = _mm256_add_epi32(state[c], state[d]); \ - state[b] = _mm256_xor_si256(state[b], state[c]); \ - state[b] = _mm256_or_si256( \ - _mm256_slli_epi32(state[b], 7), \ - _mm256_srli_epi32(state[b], 25)); \ - } while (0) - - QROUND( 0, 4, 8, 12); - QROUND( 1, 5, 9, 13); - QROUND( 2, 6, 10, 14); - QROUND( 3, 7, 11, 15); - QROUND( 0, 5, 10, 15); - QROUND( 1, 6, 11, 12); - QROUND( 2, 7, 8, 13); - QROUND( 3, 4, 9, 14); - -#undef QROUND - - } - - /* - * Add initial state back and encode the result in the destination - * buffer. We can dump the AVX2 values "as is" because the non-AVX2 - * code uses a compatible order of values. 
- */ - for (u = 0; u < 16; u ++) { - _mm256_storeu_si256((__m256i *)&p->buf.d[u << 5], - _mm256_add_epi32(state[u], init[u])); - } - -#else // yyyAVX2+0 - - static const uint32_t CW[] = { - 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 - }; - - uint64_t cc; - size_t u; - - /* - * State uses local endianness. Only the output bytes must be - * converted to little endian (if used on a big-endian machine). - */ - cc = *(uint64_t *)(p->state.d + 48); - for (u = 0; u < 8; u ++) { - uint32_t state[16]; - size_t v; - int i; - - memcpy(&state[0], CW, sizeof CW); - memcpy(&state[4], p->state.d, 48); - state[14] ^= (uint32_t)cc; - state[15] ^= (uint32_t)(cc >> 32); - for (i = 0; i < 10; i ++) { - -#define QROUND(a, b, c, d) do { \ - state[a] += state[b]; \ - state[d] ^= state[a]; \ - state[d] = (state[d] << 16) | (state[d] >> 16); \ - state[c] += state[d]; \ - state[b] ^= state[c]; \ - state[b] = (state[b] << 12) | (state[b] >> 20); \ - state[a] += state[b]; \ - state[d] ^= state[a]; \ - state[d] = (state[d] << 8) | (state[d] >> 24); \ - state[c] += state[d]; \ - state[b] ^= state[c]; \ - state[b] = (state[b] << 7) | (state[b] >> 25); \ - } while (0) - - QROUND( 0, 4, 8, 12); - QROUND( 1, 5, 9, 13); - QROUND( 2, 6, 10, 14); - QROUND( 3, 7, 11, 15); - QROUND( 0, 5, 10, 15); - QROUND( 1, 6, 11, 12); - QROUND( 2, 7, 8, 13); - QROUND( 3, 4, 9, 14); - -#undef QROUND - - } - - for (v = 0; v < 4; v ++) { - state[v] += CW[v]; - } - for (v = 4; v < 14; v ++) { - state[v] += ((uint32_t *)p->state.d)[v - 4]; - } - state[14] += ((uint32_t *)p->state.d)[10] - ^ (uint32_t)cc; - state[15] += ((uint32_t *)p->state.d)[11] - ^ (uint32_t)(cc >> 32); - cc ++; - - /* - * We mimic the interleaving that is used in the AVX2 - * implementation. 
- */ - for (v = 0; v < 16; v ++) { -#if FALCON_LE // yyyLE+1 - ((uint32_t *)p->buf.d)[u + (v << 3)] = state[v]; -#else // yyyLE+0 - p->buf.d[(u << 2) + (v << 5) + 0] = - (uint8_t)state[v]; - p->buf.d[(u << 2) + (v << 5) + 1] = - (uint8_t)(state[v] >> 8); - p->buf.d[(u << 2) + (v << 5) + 2] = - (uint8_t)(state[v] >> 16); - p->buf.d[(u << 2) + (v << 5) + 3] = - (uint8_t)(state[v] >> 24); -#endif // yyyLE- - } - } - *(uint64_t *)(p->state.d + 48) = cc; - -#endif // yyyAVX2- - - p->ptr = 0; -} - -/* see inner.h */ -void -Zf(prng_get_bytes)(prng *p, void *dst, size_t len) -{ - uint8_t *buf; - - buf = dst; - while (len > 0) { - size_t clen; - - clen = (sizeof p->buf.d) - p->ptr; - if (clen > len) { - clen = len; - } - memcpy(buf, p->buf.d, clen); - buf += clen; - len -= clen; - p->ptr += clen; - if (p->ptr == sizeof p->buf.d) { - Zf(prng_refill)(p); - } - } -} diff --git a/crypto_sign/falcon-1024/m4-ct/sign.c b/crypto_sign/falcon-1024/m4-ct/sign.c deleted file mode 100644 index 752fb8ba..00000000 --- a/crypto_sign/falcon-1024/m4-ct/sign.c +++ /dev/null @@ -1,1532 +0,0 @@ -/* - * Falcon signature generation. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include "inner.h" - -/* =================================================================== */ - -/* - * Compute degree N from logarithm 'logn'. - */ -#define MKN(logn) ((size_t)1 << (logn)) - -/* =================================================================== */ -/* - * Binary case: - * N = 2^logn - * phi = X^N+1 - */ - -/* - * Get the size of the LDL tree for an input with polynomials of size - * 2^logn. The size is expressed in the number of elements. - */ -static inline unsigned -ffLDL_treesize(unsigned logn) -{ - /* - * For logn = 0 (polynomials are constant), the "tree" is a - * single element. Otherwise, the tree node has size 2^logn, and - * has two child trees for size logn-1 each. Thus, treesize s() - * must fulfill these two relations: - * - * s(0) = 1 - * s(logn) = (2^logn) + 2*s(logn-1) - */ - return (logn + 1) << logn; -} - -/* - * Inner function for ffLDL_fft(). It expects the matrix to be both - * auto-adjoint and quasicyclic; also, it uses the source operands - * as modifiable temporaries. - * - * tmp[] must have room for at least one polynomial. 
- */ -static void -ffLDL_fft_inner(fpr *restrict tree, - fpr *restrict g0, fpr *restrict g1, unsigned logn, fpr *restrict tmp) -{ - size_t n, hn; - - n = MKN(logn); - if (n == 1) { - tree[0] = g0[0]; - return; - } - hn = n >> 1; - - /* - * The LDL decomposition yields L (which is written in the tree) - * and the diagonal of D. Since d00 = g0, we just write d11 - * into tmp. - */ - Zf(poly_LDLmv_fft)(tmp, tree, g0, g1, g0, logn); - - /* - * Split d00 (currently in g0) and d11 (currently in tmp). We - * reuse g0 and g1 as temporary storage spaces: - * d00 splits into g1, g1+hn - * d11 splits into g0, g0+hn - */ - Zf(poly_split_fft)(g1, g1 + hn, g0, logn); - Zf(poly_split_fft)(g0, g0 + hn, tmp, logn); - - /* - * Each split result is the first row of a new auto-adjoint - * quasicyclic matrix for the next recursive step. - */ - ffLDL_fft_inner(tree + n, - g1, g1 + hn, logn - 1, tmp); - ffLDL_fft_inner(tree + n + ffLDL_treesize(logn - 1), - g0, g0 + hn, logn - 1, tmp); -} - -/* - * Compute the ffLDL tree of an auto-adjoint matrix G. The matrix - * is provided as three polynomials (FFT representation). - * - * The "tree" array is filled with the computed tree, of size - * (logn+1)*(2^logn) elements (see ffLDL_treesize()). - * - * Input arrays MUST NOT overlap, except possibly the three unmodified - * arrays g00, g01 and g11. tmp[] should have room for at least three - * polynomials of 2^logn elements each. 
- */ -static void -ffLDL_fft(fpr *restrict tree, const fpr *restrict g00, - const fpr *restrict g01, const fpr *restrict g11, - unsigned logn, fpr *restrict tmp) -{ - size_t n, hn; - fpr *d00, *d11; - - n = MKN(logn); - if (n == 1) { - tree[0] = g00[0]; - return; - } - hn = n >> 1; - d00 = tmp; - d11 = tmp + n; - tmp += n << 1; - - memcpy(d00, g00, n * sizeof *g00); - Zf(poly_LDLmv_fft)(d11, tree, g00, g01, g11, logn); - - Zf(poly_split_fft)(tmp, tmp + hn, d00, logn); - Zf(poly_split_fft)(d00, d00 + hn, d11, logn); - memcpy(d11, tmp, n * sizeof *tmp); - ffLDL_fft_inner(tree + n, - d11, d11 + hn, logn - 1, tmp); - ffLDL_fft_inner(tree + n + ffLDL_treesize(logn - 1), - d00, d00 + hn, logn - 1, tmp); -} - -/* - * Normalize an ffLDL tree: each leaf of value x is replaced with - * sigma / sqrt(x). - */ -static void -ffLDL_binary_normalize(fpr *tree, unsigned logn) -{ - /* - * TODO: make an iterative version. - */ - size_t n; - - n = MKN(logn); - if (n == 1) { - /* - * We actually store in the tree leaf the inverse of - * the value mandated by the specification: this - * saves a division both here and in the sampler. - */ - tree[0] = fpr_mul(fpr_sqrt(tree[0]), fpr_inv_sigma); - } else { - ffLDL_binary_normalize(tree + n, logn - 1); - ffLDL_binary_normalize(tree + n + ffLDL_treesize(logn - 1), - logn - 1); - } -} - -/* =================================================================== */ - -/* - * Convert an integer polynomial (with small values) into the - * representation with complex numbers. 
- */ -static void -smallints_to_fpr(fpr *r, const int8_t *t, unsigned logn) -{ - size_t n, u; - - n = MKN(logn); - for (u = 0; u < n; u ++) { - r[u] = fpr_of(t[u]); - } -} - -/* - * The expanded private key contains: - * - The B0 matrix (four elements) - * - The ffLDL tree - */ - -static inline size_t -skoff_b00(unsigned logn) -{ - (void)logn; - return 0; -} - -static inline size_t -skoff_b01(unsigned logn) -{ - return MKN(logn); -} - -static inline size_t -skoff_b10(unsigned logn) -{ - return 2 * MKN(logn); -} - -static inline size_t -skoff_b11(unsigned logn) -{ - return 3 * MKN(logn); -} - -static inline size_t -skoff_tree(unsigned logn) -{ - return 4 * MKN(logn); -} - -/* see inner.h */ -void -Zf(expand_privkey)(fpr *restrict expanded_key, - const int8_t *f, const int8_t *g, - const int8_t *F, const int8_t *G, - unsigned logn, uint8_t *restrict tmp) -{ - size_t n; - fpr *rf, *rg, *rF, *rG; - fpr *b00, *b01, *b10, *b11; - fpr *g00, *g01, *g11, *gxx; - fpr *tree; - - n = MKN(logn); - b00 = expanded_key + skoff_b00(logn); - b01 = expanded_key + skoff_b01(logn); - b10 = expanded_key + skoff_b10(logn); - b11 = expanded_key + skoff_b11(logn); - tree = expanded_key + skoff_tree(logn); - - /* - * We load the private key elements directly into the B0 matrix, - * since B0 = [[g, -f], [G, -F]]. - */ - rf = b01; - rg = b00; - rF = b11; - rG = b10; - - smallints_to_fpr(rf, f, logn); - smallints_to_fpr(rg, g, logn); - smallints_to_fpr(rF, F, logn); - smallints_to_fpr(rG, G, logn); - - /* - * Compute the FFT for the key elements, and negate f and F. - */ - Zf(FFT)(rf, logn); - Zf(FFT)(rg, logn); - Zf(FFT)(rF, logn); - Zf(FFT)(rG, logn); - Zf(poly_neg)(rf, logn); - Zf(poly_neg)(rF, logn); - - /* - * The Gram matrix is G = B·B*. 
Formulas are: - * g00 = b00*adj(b00) + b01*adj(b01) - * g01 = b00*adj(b10) + b01*adj(b11) - * g10 = b10*adj(b00) + b11*adj(b01) - * g11 = b10*adj(b10) + b11*adj(b11) - * - * For historical reasons, this implementation uses - * g00, g01 and g11 (upper triangle). - */ - g00 = (fpr *)tmp; - g01 = g00 + n; - g11 = g01 + n; - gxx = g11 + n; - - memcpy(g00, b00, n * sizeof *b00); - Zf(poly_mulselfadj_fft)(g00, logn); - memcpy(gxx, b01, n * sizeof *b01); - Zf(poly_mulselfadj_fft)(gxx, logn); - Zf(poly_add)(g00, gxx, logn); - - memcpy(g01, b00, n * sizeof *b00); - Zf(poly_muladj_fft)(g01, b10, logn); - memcpy(gxx, b01, n * sizeof *b01); - Zf(poly_muladj_fft)(gxx, b11, logn); - Zf(poly_add)(g01, gxx, logn); - - memcpy(g11, b10, n * sizeof *b10); - Zf(poly_mulselfadj_fft)(g11, logn); - memcpy(gxx, b11, n * sizeof *b11); - Zf(poly_mulselfadj_fft)(gxx, logn); - Zf(poly_add)(g11, gxx, logn); - - /* - * Compute the Falcon tree. - */ - ffLDL_fft(tree, g00, g01, g11, logn, gxx); - - /* - * Normalize tree. - */ - ffLDL_binary_normalize(tree, logn); -} - -typedef int (*samplerZ)(void *ctx, fpr mu, fpr sigma); - -/* - * Perform Fast Fourier Sampling for target vector t. The Gram matrix - * is provided (G = [[g00, g01], [adj(g01), g11]]). The sampled vector - * is written over (t0,t1). The Gram matrix is modified as well. The - * tmp[] buffer must have room for four polynomials. - */ -TARGET_AVX2 -static void -ffSampling_fft_dyntree(samplerZ samp, void *samp_ctx, - fpr *restrict t0, fpr *restrict t1, - fpr *restrict g00, fpr *restrict g01, fpr *restrict g11, - unsigned logn, fpr *restrict tmp) -{ - size_t n, hn; - fpr *z0, *z1; - - /* - * Deepest level: the LDL tree leaf value is just g00 (the - * array has length only 1 at this point); we normalize it - * with regards to sigma, then use it for sampling. 
- */ - if (logn == 0) { - fpr leaf; - - leaf = g00[0]; - leaf = fpr_mul(fpr_sqrt(leaf), fpr_inv_sigma); - t0[0] = fpr_of(samp(samp_ctx, t0[0], leaf)); - t1[0] = fpr_of(samp(samp_ctx, t1[0], leaf)); - return; - } - - n = (size_t)1 << logn; - hn = n >> 1; - - /* - * Decompose G into LDL. We only need d00 (identical to g00), - * d11, and l10; we do that in place. - */ - Zf(poly_LDL_fft)(g00, g01, g11, logn); - - /* - * Split d00 and d11 and expand them into half-size quasi-cyclic - * Gram matrices. We also save l10 in tmp[]. - */ - Zf(poly_split_fft)(tmp, tmp + hn, g00, logn); - memcpy(g00, tmp, n * sizeof *tmp); - Zf(poly_split_fft)(tmp, tmp + hn, g11, logn); - memcpy(g11, tmp, n * sizeof *tmp); - memcpy(tmp, g01, n * sizeof *g01); - memcpy(g01, g00, hn * sizeof *g00); - memcpy(g01 + hn, g11, hn * sizeof *g00); - - /* - * The half-size Gram matrices for the recursive LDL tree - * building are now: - * - left sub-tree: g00, g00+hn, g01 - * - right sub-tree: g11, g11+hn, g01+hn - * l10 is in tmp[]. - */ - - /* - * We split t1 and use the first recursive call on the two - * halves, using the right sub-tree. The result is merged - * back into tmp + 2*n. - */ - z1 = tmp + n; - Zf(poly_split_fft)(z1, z1 + hn, t1, logn); - ffSampling_fft_dyntree(samp, samp_ctx, z1, z1 + hn, - g11, g11 + hn, g01 + hn, logn - 1, z1 + n); - Zf(poly_merge_fft)(tmp + (n << 1), z1, z1 + hn, logn); - - /* - * Compute tb0 = t0 + (t1 - z1) * l10. - * At that point, l10 is in tmp, t1 is unmodified, and z1 is - * in tmp + (n << 1). The buffer in z1 is free. - * - * In the end, z1 is written over t1, and tb0 is in t0. - */ - memcpy(z1, t1, n * sizeof *t1); - Zf(poly_sub)(z1, tmp + (n << 1), logn); - memcpy(t1, tmp + (n << 1), n * sizeof *tmp); - Zf(poly_mul_fft)(tmp, z1, logn); - Zf(poly_add)(t0, tmp, logn); - - /* - * Second recursive invocation, on the split tb0 (currently in t0) - * and the left sub-tree. 
- */ - z0 = tmp; - Zf(poly_split_fft)(z0, z0 + hn, t0, logn); - ffSampling_fft_dyntree(samp, samp_ctx, z0, z0 + hn, - g00, g00 + hn, g01, logn - 1, z0 + n); - Zf(poly_merge_fft)(t0, z0, z0 + hn, logn); -} - -/* - * Perform Fast Fourier Sampling for target vector t and LDL tree T. - * tmp[] must have size for at least two polynomials of size 2^logn. - */ -TARGET_AVX2 -static void -ffSampling_fft(samplerZ samp, void *samp_ctx, - fpr *restrict z0, fpr *restrict z1, - const fpr *restrict tree, - const fpr *restrict t0, const fpr *restrict t1, unsigned logn, - fpr *restrict tmp) -{ - size_t n, hn; - const fpr *tree0, *tree1; - - /* - * When logn == 2, we inline the last two recursion levels. - */ - if (logn == 2) { -#if FALCON_AVX2 // yyyAVX2+1 - fpr w0, w1, w2, w3, sigma; - __m128d ww0, ww1, wa, wb, wc, wd; - __m128d wy0, wy1, wz0, wz1; - __m128d half, invsqrt8, invsqrt2, neghi, neglo; - int si0, si1, si2, si3; - - tree0 = tree + 4; - tree1 = tree + 8; - - half = _mm_set1_pd(0.5); - invsqrt8 = _mm_set1_pd(0.353553390593273762200422181052); - invsqrt2 = _mm_set1_pd(0.707106781186547524400844362105); - neghi = _mm_set_pd(-0.0, 0.0); - neglo = _mm_set_pd(0.0, -0.0); - - /* - * We split t1 into w*, then do the recursive invocation, - * with output in w*. We finally merge back into z1. 
- */ - ww0 = _mm_loadu_pd(&t1[0].v); - ww1 = _mm_loadu_pd(&t1[2].v); - wa = _mm_unpacklo_pd(ww0, ww1); - wb = _mm_unpackhi_pd(ww0, ww1); - wc = _mm_add_pd(wa, wb); - ww0 = _mm_mul_pd(wc, half); - wc = _mm_sub_pd(wa, wb); - wd = _mm_xor_pd(_mm_permute_pd(wc, 1), neghi); - ww1 = _mm_mul_pd(_mm_add_pd(wc, wd), invsqrt8); - - w2.v = _mm_cvtsd_f64(ww1); - w3.v = _mm_cvtsd_f64(_mm_permute_pd(ww1, 1)); - wa = ww1; - sigma = tree1[3]; - si2 = samp(samp_ctx, w2, sigma); - si3 = samp(samp_ctx, w3, sigma); - ww1 = _mm_set_pd((double)si3, (double)si2); - wa = _mm_sub_pd(wa, ww1); - wb = _mm_loadu_pd(&tree1[0].v); - wc = _mm_mul_pd(wa, wb); - wd = _mm_mul_pd(wa, _mm_permute_pd(wb, 1)); - wa = _mm_unpacklo_pd(wc, wd); - wb = _mm_unpackhi_pd(wc, wd); - ww0 = _mm_add_pd(ww0, _mm_add_pd(wa, _mm_xor_pd(wb, neglo))); - w0.v = _mm_cvtsd_f64(ww0); - w1.v = _mm_cvtsd_f64(_mm_permute_pd(ww0, 1)); - sigma = tree1[2]; - si0 = samp(samp_ctx, w0, sigma); - si1 = samp(samp_ctx, w1, sigma); - ww0 = _mm_set_pd((double)si1, (double)si0); - - wc = _mm_mul_pd( - _mm_set_pd((double)(si2 + si3), (double)(si2 - si3)), - invsqrt2); - wa = _mm_add_pd(ww0, wc); - wb = _mm_sub_pd(ww0, wc); - ww0 = _mm_unpacklo_pd(wa, wb); - ww1 = _mm_unpackhi_pd(wa, wb); - _mm_storeu_pd(&z1[0].v, ww0); - _mm_storeu_pd(&z1[2].v, ww1); - - /* - * Compute tb0 = t0 + (t1 - z1) * L. Value tb0 ends up in w*. - */ - wy0 = _mm_sub_pd(_mm_loadu_pd(&t1[0].v), ww0); - wy1 = _mm_sub_pd(_mm_loadu_pd(&t1[2].v), ww1); - wz0 = _mm_loadu_pd(&tree[0].v); - wz1 = _mm_loadu_pd(&tree[2].v); - ww0 = _mm_sub_pd(_mm_mul_pd(wy0, wz0), _mm_mul_pd(wy1, wz1)); - ww1 = _mm_add_pd(_mm_mul_pd(wy0, wz1), _mm_mul_pd(wy1, wz0)); - ww0 = _mm_add_pd(ww0, _mm_loadu_pd(&t0[0].v)); - ww1 = _mm_add_pd(ww1, _mm_loadu_pd(&t0[2].v)); - - /* - * Second recursive invocation. 
- */ - wa = _mm_unpacklo_pd(ww0, ww1); - wb = _mm_unpackhi_pd(ww0, ww1); - wc = _mm_add_pd(wa, wb); - ww0 = _mm_mul_pd(wc, half); - wc = _mm_sub_pd(wa, wb); - wd = _mm_xor_pd(_mm_permute_pd(wc, 1), neghi); - ww1 = _mm_mul_pd(_mm_add_pd(wc, wd), invsqrt8); - - w2.v = _mm_cvtsd_f64(ww1); - w3.v = _mm_cvtsd_f64(_mm_permute_pd(ww1, 1)); - wa = ww1; - sigma = tree0[3]; - si2 = samp(samp_ctx, w2, sigma); - si3 = samp(samp_ctx, w3, sigma); - ww1 = _mm_set_pd((double)si3, (double)si2); - wa = _mm_sub_pd(wa, ww1); - wb = _mm_loadu_pd(&tree0[0].v); - wc = _mm_mul_pd(wa, wb); - wd = _mm_mul_pd(wa, _mm_permute_pd(wb, 1)); - wa = _mm_unpacklo_pd(wc, wd); - wb = _mm_unpackhi_pd(wc, wd); - ww0 = _mm_add_pd(ww0, _mm_add_pd(wa, _mm_xor_pd(wb, neglo))); - w0.v = _mm_cvtsd_f64(ww0); - w1.v = _mm_cvtsd_f64(_mm_permute_pd(ww0, 1)); - sigma = tree0[2]; - si0 = samp(samp_ctx, w0, sigma); - si1 = samp(samp_ctx, w1, sigma); - ww0 = _mm_set_pd((double)si1, (double)si0); - - wc = _mm_mul_pd( - _mm_set_pd((double)(si2 + si3), (double)(si2 - si3)), - invsqrt2); - wa = _mm_add_pd(ww0, wc); - wb = _mm_sub_pd(ww0, wc); - ww0 = _mm_unpacklo_pd(wa, wb); - ww1 = _mm_unpackhi_pd(wa, wb); - _mm_storeu_pd(&z0[0].v, ww0); - _mm_storeu_pd(&z0[2].v, ww1); - - return; -#else // yyyAVX2+0 - fpr x0, x1, y0, y1, w0, w1, w2, w3, sigma; - fpr a_re, a_im, b_re, b_im, c_re, c_im; - - tree0 = tree + 4; - tree1 = tree + 8; - - /* - * We split t1 into w*, then do the recursive invocation, - * with output in w*. We finally merge back into z1. 
- */ - a_re = t1[0]; - a_im = t1[2]; - b_re = t1[1]; - b_im = t1[3]; - c_re = fpr_add(a_re, b_re); - c_im = fpr_add(a_im, b_im); - w0 = fpr_half(c_re); - w1 = fpr_half(c_im); - c_re = fpr_sub(a_re, b_re); - c_im = fpr_sub(a_im, b_im); - w2 = fpr_mul(fpr_add(c_re, c_im), fpr_invsqrt8); - w3 = fpr_mul(fpr_sub(c_im, c_re), fpr_invsqrt8); - - x0 = w2; - x1 = w3; - sigma = tree1[3]; - w2 = fpr_of(samp(samp_ctx, x0, sigma)); - w3 = fpr_of(samp(samp_ctx, x1, sigma)); - a_re = fpr_sub(x0, w2); - a_im = fpr_sub(x1, w3); - b_re = tree1[0]; - b_im = tree1[1]; - c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im)); - c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re)); - x0 = fpr_add(c_re, w0); - x1 = fpr_add(c_im, w1); - sigma = tree1[2]; - w0 = fpr_of(samp(samp_ctx, x0, sigma)); - w1 = fpr_of(samp(samp_ctx, x1, sigma)); - - a_re = w0; - a_im = w1; - b_re = w2; - b_im = w3; - c_re = fpr_mul(fpr_sub(b_re, b_im), fpr_invsqrt2); - c_im = fpr_mul(fpr_add(b_re, b_im), fpr_invsqrt2); - z1[0] = w0 = fpr_add(a_re, c_re); - z1[2] = w2 = fpr_add(a_im, c_im); - z1[1] = w1 = fpr_sub(a_re, c_re); - z1[3] = w3 = fpr_sub(a_im, c_im); - - /* - * Compute tb0 = t0 + (t1 - z1) * L. Value tb0 ends up in w*. - */ - w0 = fpr_sub(t1[0], w0); - w1 = fpr_sub(t1[1], w1); - w2 = fpr_sub(t1[2], w2); - w3 = fpr_sub(t1[3], w3); - - a_re = w0; - a_im = w2; - b_re = tree[0]; - b_im = tree[2]; - w0 = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im)); - w2 = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re)); - a_re = w1; - a_im = w3; - b_re = tree[1]; - b_im = tree[3]; - w1 = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im)); - w3 = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re)); - - w0 = fpr_add(w0, t0[0]); - w1 = fpr_add(w1, t0[1]); - w2 = fpr_add(w2, t0[2]); - w3 = fpr_add(w3, t0[3]); - - /* - * Second recursive invocation. 
- */ - a_re = w0; - a_im = w2; - b_re = w1; - b_im = w3; - c_re = fpr_add(a_re, b_re); - c_im = fpr_add(a_im, b_im); - w0 = fpr_half(c_re); - w1 = fpr_half(c_im); - c_re = fpr_sub(a_re, b_re); - c_im = fpr_sub(a_im, b_im); - w2 = fpr_mul(fpr_add(c_re, c_im), fpr_invsqrt8); - w3 = fpr_mul(fpr_sub(c_im, c_re), fpr_invsqrt8); - - x0 = w2; - x1 = w3; - sigma = tree0[3]; - w2 = y0 = fpr_of(samp(samp_ctx, x0, sigma)); - w3 = y1 = fpr_of(samp(samp_ctx, x1, sigma)); - a_re = fpr_sub(x0, y0); - a_im = fpr_sub(x1, y1); - b_re = tree0[0]; - b_im = tree0[1]; - c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im)); - c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re)); - x0 = fpr_add(c_re, w0); - x1 = fpr_add(c_im, w1); - sigma = tree0[2]; - w0 = fpr_of(samp(samp_ctx, x0, sigma)); - w1 = fpr_of(samp(samp_ctx, x1, sigma)); - - a_re = w0; - a_im = w1; - b_re = w2; - b_im = w3; - c_re = fpr_mul(fpr_sub(b_re, b_im), fpr_invsqrt2); - c_im = fpr_mul(fpr_add(b_re, b_im), fpr_invsqrt2); - z0[0] = fpr_add(a_re, c_re); - z0[2] = fpr_add(a_im, c_im); - z0[1] = fpr_sub(a_re, c_re); - z0[3] = fpr_sub(a_im, c_im); - - return; -#endif // yyyAVX2- - } - - /* - * Case logn == 1 is reachable only when using Falcon-2 (the - * smallest size for which Falcon is mathematically defined, but - * of course way too insecure to be of any use). 
- */ - if (logn == 1) { - fpr x0, x1, y0, y1, sigma; - fpr a_re, a_im, b_re, b_im, c_re, c_im; - - x0 = t1[0]; - x1 = t1[1]; - sigma = tree[3]; - z1[0] = y0 = fpr_of(samp(samp_ctx, x0, sigma)); - z1[1] = y1 = fpr_of(samp(samp_ctx, x1, sigma)); - a_re = fpr_sub(x0, y0); - a_im = fpr_sub(x1, y1); - b_re = tree[0]; - b_im = tree[1]; - c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im)); - c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re)); - x0 = fpr_add(c_re, t0[0]); - x1 = fpr_add(c_im, t0[1]); - sigma = tree[2]; - z0[0] = fpr_of(samp(samp_ctx, x0, sigma)); - z0[1] = fpr_of(samp(samp_ctx, x1, sigma)); - - return; - } - - /* - * Normal end of recursion is for logn == 0. Since the last - * steps of the recursions were inlined in the blocks above - * (when logn == 1 or 2), this case is not reachable, and is - * retained here only for documentation purposes. - - if (logn == 0) { - fpr x0, x1, sigma; - - x0 = t0[0]; - x1 = t1[0]; - sigma = tree[0]; - z0[0] = fpr_of(samp(samp_ctx, x0, sigma)); - z1[0] = fpr_of(samp(samp_ctx, x1, sigma)); - return; - } - - */ - - /* - * General recursive case (logn >= 3). - */ - - n = (size_t)1 << logn; - hn = n >> 1; - tree0 = tree + n; - tree1 = tree + n + ffLDL_treesize(logn - 1); - - /* - * We split t1 into z1 (reused as temporary storage), then do - * the recursive invocation, with output in tmp. We finally - * merge back into z1. - */ - Zf(poly_split_fft)(z1, z1 + hn, t1, logn); - ffSampling_fft(samp, samp_ctx, tmp, tmp + hn, - tree1, z1, z1 + hn, logn - 1, tmp + n); - Zf(poly_merge_fft)(z1, tmp, tmp + hn, logn); - - /* - * Compute tb0 = t0 + (t1 - z1) * L. Value tb0 ends up in tmp[]. - */ - memcpy(tmp, t1, n * sizeof *t1); - Zf(poly_sub)(tmp, z1, logn); - Zf(poly_mul_fft)(tmp, tree, logn); - Zf(poly_add)(tmp, t0, logn); - - /* - * Second recursive invocation. 
- */ - Zf(poly_split_fft)(z0, z0 + hn, tmp, logn); - ffSampling_fft(samp, samp_ctx, tmp, tmp + hn, - tree0, z0, z0 + hn, logn - 1, tmp + n); - Zf(poly_merge_fft)(z0, tmp, tmp + hn, logn); -} - -/* - * Compute a signature: the signature contains two vectors, s1 and s2. - * The s1 vector is not returned. The squared norm of (s1,s2) is - * computed, and if it is short enough, then s2 is returned into the - * s2[] buffer, and 1 is returned; otherwise, s2[] is untouched and 0 is - * returned; the caller should then try again. This function uses an - * expanded key. - * - * tmp[] must have room for at least six polynomials. - */ -static int -do_sign_tree(samplerZ samp, void *samp_ctx, int16_t *s2, - const fpr *restrict expanded_key, - const uint16_t *hm, - unsigned logn, fpr *restrict tmp) -{ - size_t n, u; - fpr *t0, *t1, *tx, *ty; - const fpr *b00, *b01, *b10, *b11, *tree; - fpr ni; - uint32_t sqn, ng; - int16_t *s1tmp, *s2tmp; - - n = MKN(logn); - t0 = tmp; - t1 = t0 + n; - b00 = expanded_key + skoff_b00(logn); - b01 = expanded_key + skoff_b01(logn); - b10 = expanded_key + skoff_b10(logn); - b11 = expanded_key + skoff_b11(logn); - tree = expanded_key + skoff_tree(logn); - - /* - * Set the target vector to [hm, 0] (hm is the hashed message). - */ - for (u = 0; u < n; u ++) { - t0[u] = fpr_of(hm[u]); - /* This is implicit. - t1[u] = fpr_zero; - */ - } - - /* - * Apply the lattice basis to obtain the real target - * vector (after normalization with regards to modulus). - */ - Zf(FFT)(t0, logn); - ni = fpr_inverse_of_q; - memcpy(t1, t0, n * sizeof *t0); - Zf(poly_mul_fft)(t1, b01, logn); - Zf(poly_mulconst)(t1, fpr_neg(ni), logn); - Zf(poly_mul_fft)(t0, b11, logn); - Zf(poly_mulconst)(t0, ni, logn); - - tx = t1 + n; - ty = tx + n; - - /* - * Apply sampling. Output is written back in [tx, ty]. - */ - ffSampling_fft(samp, samp_ctx, tx, ty, tree, t0, t1, logn, ty + n); - - /* - * Get the lattice point corresponding to that tiny vector. 
- */ - memcpy(t0, tx, n * sizeof *tx); - memcpy(t1, ty, n * sizeof *ty); - Zf(poly_mul_fft)(tx, b00, logn); - Zf(poly_mul_fft)(ty, b10, logn); - Zf(poly_add)(tx, ty, logn); - memcpy(ty, t0, n * sizeof *t0); - Zf(poly_mul_fft)(ty, b01, logn); - - memcpy(t0, tx, n * sizeof *tx); - Zf(poly_mul_fft)(t1, b11, logn); - Zf(poly_add)(t1, ty, logn); - - Zf(iFFT)(t0, logn); - Zf(iFFT)(t1, logn); - - /* - * Compute the signature. - */ - s1tmp = (int16_t *)tx; - sqn = 0; - ng = 0; - for (u = 0; u < n; u ++) { - int32_t z; - - z = (int32_t)hm[u] - (int32_t)fpr_rint(t0[u]); - sqn += (uint32_t)(z * z); - ng |= sqn; - s1tmp[u] = (int16_t)z; - } - sqn |= -(ng >> 31); - - /* - * With "normal" degrees (e.g. 512 or 1024), it is very - * improbable that the computed vector is not short enough; - * however, it may happen in practice for the very reduced - * versions (e.g. degree 16 or below). In that case, the caller - * will loop, and we must not write anything into s2[] because - * s2[] may overlap with the hashed message hm[] and we need - * hm[] for the next iteration. - */ - s2tmp = (int16_t *)tmp; - for (u = 0; u < n; u ++) { - s2tmp[u] = (int16_t)-fpr_rint(t1[u]); - } - if (Zf(is_short_half)(sqn, s2tmp, logn)) { - memcpy(s2, s2tmp, n * sizeof *s2); - memcpy(tmp, s1tmp, n * sizeof *s1tmp); - return 1; - } - return 0; -} - -/* - * Compute a signature: the signature contains two vectors, s1 and s2. - * The s1 vector is not returned. The squared norm of (s1,s2) is - * computed, and if it is short enough, then s2 is returned into the - * s2[] buffer, and 1 is returned; otherwise, s2[] is untouched and 0 is - * returned; the caller should then try again. - * - * tmp[] must have room for at least nine polynomials. 
- */ -static int -do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2, - const int8_t *restrict f, const int8_t *restrict g, - const int8_t *restrict F, const int8_t *restrict G, - const uint16_t *hm, unsigned logn, fpr *restrict tmp) -{ - size_t n, u; - fpr *t0, *t1, *tx, *ty; - fpr *b00, *b01, *b10, *b11, *g00, *g01, *g11; - fpr ni; - uint32_t sqn, ng; - int16_t *s1tmp, *s2tmp; - - n = MKN(logn); - - /* - * Lattice basis is B = [[g, -f], [G, -F]]. We convert it to FFT. - */ - b00 = tmp; - b01 = b00 + n; - b10 = b01 + n; - b11 = b10 + n; - smallints_to_fpr(b01, f, logn); - smallints_to_fpr(b00, g, logn); - smallints_to_fpr(b11, F, logn); - smallints_to_fpr(b10, G, logn); - Zf(FFT)(b01, logn); - Zf(FFT)(b00, logn); - Zf(FFT)(b11, logn); - Zf(FFT)(b10, logn); - Zf(poly_neg)(b01, logn); - Zf(poly_neg)(b11, logn); - - /* - * Compute the Gram matrix G = B·B*. Formulas are: - * g00 = b00*adj(b00) + b01*adj(b01) - * g01 = b00*adj(b10) + b01*adj(b11) - * g10 = b10*adj(b00) + b11*adj(b01) - * g11 = b10*adj(b10) + b11*adj(b11) - * - * For historical reasons, this implementation uses - * g00, g01 and g11 (upper triangle). g10 is not kept - * since it is equal to adj(g01). - * - * We _replace_ the matrix B with the Gram matrix, but we - * must keep b01 and b11 for computing the target vector. 
- */ - t0 = b11 + n; - t1 = t0 + n; - - memcpy(t0, b01, n * sizeof *b01); - Zf(poly_mulselfadj_fft)(t0, logn); // t0 <- b01*adj(b01) - - memcpy(t1, b00, n * sizeof *b00); - Zf(poly_muladj_fft)(t1, b10, logn); // t1 <- b00*adj(b10) - Zf(poly_mulselfadj_fft)(b00, logn); // b00 <- b00*adj(b00) - Zf(poly_add)(b00, t0, logn); // b00 <- g00 - memcpy(t0, b01, n * sizeof *b01); - Zf(poly_muladj_fft)(b01, b11, logn); // b01 <- b01*adj(b11) - Zf(poly_add)(b01, t1, logn); // b01 <- g01 - - Zf(poly_mulselfadj_fft)(b10, logn); // b10 <- b10*adj(b10) - memcpy(t1, b11, n * sizeof *b11); - Zf(poly_mulselfadj_fft)(t1, logn); // t1 <- b11*adj(b11) - Zf(poly_add)(b10, t1, logn); // b10 <- g11 - - /* - * We rename variables to make things clearer. The three elements - * of the Gram matrix uses the first 3*n slots of tmp[], followed - * by b11 and b01 (in that order). - */ - g00 = b00; - g01 = b01; - g11 = b10; - b01 = t0; - t0 = b01 + n; - t1 = t0 + n; - - /* - * Memory layout at that point: - * g00 g01 g11 b11 b01 t0 t1 - */ - - /* - * Set the target vector to [hm, 0] (hm is the hashed message). - */ - for (u = 0; u < n; u ++) { - t0[u] = fpr_of(hm[u]); - /* This is implicit. - t1[u] = fpr_zero; - */ - } - - /* - * Apply the lattice basis to obtain the real target - * vector (after normalization with regards to modulus). - */ - Zf(FFT)(t0, logn); - ni = fpr_inverse_of_q; - memcpy(t1, t0, n * sizeof *t0); - Zf(poly_mul_fft)(t1, b01, logn); - Zf(poly_mulconst)(t1, fpr_neg(ni), logn); - Zf(poly_mul_fft)(t0, b11, logn); - Zf(poly_mulconst)(t0, ni, logn); - - /* - * b01 and b11 can be discarded, so we move back (t0,t1). - * Memory layout is now: - * g00 g01 g11 t0 t1 - */ - memcpy(b11, t0, n * 2 * sizeof *t0); - t0 = g11 + n; - t1 = t0 + n; - - /* - * Apply sampling; result is written over (t0,t1). 
- */ - ffSampling_fft_dyntree(samp, samp_ctx, - t0, t1, g00, g01, g11, logn, t1 + n); - - /* - * We arrange the layout back to: - * b00 b01 b10 b11 t0 t1 - * - * We did not conserve the matrix basis, so we must recompute - * it now. - */ - b00 = tmp; - b01 = b00 + n; - b10 = b01 + n; - b11 = b10 + n; - memmove(b11 + n, t0, n * 2 * sizeof *t0); - t0 = b11 + n; - t1 = t0 + n; - smallints_to_fpr(b01, f, logn); - smallints_to_fpr(b00, g, logn); - smallints_to_fpr(b11, F, logn); - smallints_to_fpr(b10, G, logn); - Zf(FFT)(b01, logn); - Zf(FFT)(b00, logn); - Zf(FFT)(b11, logn); - Zf(FFT)(b10, logn); - Zf(poly_neg)(b01, logn); - Zf(poly_neg)(b11, logn); - tx = t1 + n; - ty = tx + n; - - /* - * Get the lattice point corresponding to that tiny vector. - */ - memcpy(tx, t0, n * sizeof *t0); - memcpy(ty, t1, n * sizeof *t1); - Zf(poly_mul_fft)(tx, b00, logn); - Zf(poly_mul_fft)(ty, b10, logn); - Zf(poly_add)(tx, ty, logn); - memcpy(ty, t0, n * sizeof *t0); - Zf(poly_mul_fft)(ty, b01, logn); - - memcpy(t0, tx, n * sizeof *tx); - Zf(poly_mul_fft)(t1, b11, logn); - Zf(poly_add)(t1, ty, logn); - Zf(iFFT)(t0, logn); - Zf(iFFT)(t1, logn); - - s1tmp = (int16_t *)tx; - sqn = 0; - ng = 0; - for (u = 0; u < n; u ++) { - int32_t z; - - z = (int32_t)hm[u] - (int32_t)fpr_rint(t0[u]); - sqn += (uint32_t)(z * z); - ng |= sqn; - s1tmp[u] = (int16_t)z; - } - sqn |= -(ng >> 31); - - /* - * With "normal" degrees (e.g. 512 or 1024), it is very - * improbable that the computed vector is not short enough; - * however, it may happen in practice for the very reduced - * versions (e.g. degree 16 or below). In that case, the caller - * will loop, and we must not write anything into s2[] because - * s2[] may overlap with the hashed message hm[] and we need - * hm[] for the next iteration. 
- */ - s2tmp = (int16_t *)tmp; - for (u = 0; u < n; u ++) { - s2tmp[u] = (int16_t)-fpr_rint(t1[u]); - } - if (Zf(is_short_half)(sqn, s2tmp, logn)) { - memcpy(s2, s2tmp, n * sizeof *s2); - memcpy(tmp, s1tmp, n * sizeof *s1tmp); - return 1; - } - return 0; -} - -/* - * Sample an integer value along a half-gaussian distribution centered - * on zero and standard deviation 1.8205, with a precision of 72 bits. - */ -TARGET_AVX2 -int -Zf(gaussian0_sampler)(prng *p) -{ -#if FALCON_AVX2 // yyyAVX2+1 - - /* - * High words. - */ - static const union { - uint16_t u16[16]; - __m256i ymm[1]; - } rhi15 = { - { - 0x51FB, 0x2A69, 0x113E, 0x0568, - 0x014A, 0x003B, 0x0008, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000 - } - }; - - static const union { - uint64_t u64[20]; - __m256i ymm[5]; - } rlo57 = { - { - 0x1F42ED3AC391802, 0x12B181F3F7DDB82, - 0x1CDD0934829C1FF, 0x1754377C7994AE4, - 0x1846CAEF33F1F6F, 0x14AC754ED74BD5F, - 0x024DD542B776AE4, 0x1A1FFDC65AD63DA, - 0x01F80D88A7B6428, 0x001C3FDB2040C69, - 0x00012CF24D031FB, 0x00000949F8B091F, - 0x0000003665DA998, 0x00000000EBF6EBB, - 0x0000000002F5D7E, 0x000000000007098, - 0x0000000000000C6, 0x000000000000001, - 0x000000000000000, 0x000000000000000 - } - }; - - uint64_t lo; - unsigned hi; - __m256i xhi, rhi, gthi, eqhi, eqm; - __m256i xlo, gtlo0, gtlo1, gtlo2, gtlo3, gtlo4; - __m128i t, zt; - int r; - - /* - * Get a 72-bit random value and split it into a low part - * (57 bits) and a high part (15 bits) - */ - lo = prng_get_u64(p); - hi = prng_get_u8(p); - hi = (hi << 7) | (unsigned)(lo >> 57); - lo &= 0x1FFFFFFFFFFFFFF; - - /* - * Broadcast the high part and compare it with the relevant - * values. We need both a "greater than" and an "equal" - * comparisons. 
- */ - xhi = _mm256_broadcastw_epi16(_mm_cvtsi32_si128(hi)); - rhi = _mm256_loadu_si256(&rhi15.ymm[0]); - gthi = _mm256_cmpgt_epi16(rhi, xhi); - eqhi = _mm256_cmpeq_epi16(rhi, xhi); - - /* - * The result is the number of 72-bit values (among the list of 19) - * which are greater than the 72-bit random value. We first count - * all non-zero 16-bit elements in the first eight of gthi. Such - * elements have value -1 or 0, so we first negate them. - */ - t = _mm_srli_epi16(_mm256_castsi256_si128(gthi), 15); - zt = _mm_setzero_si128(); - t = _mm_hadd_epi16(t, zt); - t = _mm_hadd_epi16(t, zt); - t = _mm_hadd_epi16(t, zt); - r = _mm_cvtsi128_si32(t); - - /* - * We must look at the low bits for all values for which the - * high bits are an "equal" match; values 8-18 all have the - * same high bits (0). - * On 32-bit systems, 'lo' really is two registers, requiring - * some extra code. - */ -#if defined(__x86_64__) || defined(_M_X64) - xlo = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(*(int64_t *)&lo)); -#else - { - uint32_t e0, e1; - int32_t f0, f1; - - e0 = (uint32_t)lo; - e1 = (uint32_t)(lo >> 32); - f0 = *(int32_t *)&e0; - f1 = *(int32_t *)&e1; - xlo = _mm256_set_epi32(f1, f0, f1, f0, f1, f0, f1, f0); - } -#endif - gtlo0 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[0]), xlo); - gtlo1 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[1]), xlo); - gtlo2 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[2]), xlo); - gtlo3 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[3]), xlo); - gtlo4 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[4]), xlo); - - /* - * Keep only comparison results that correspond to the non-zero - * elements in eqhi. 
- */ - gtlo0 = _mm256_and_si256(gtlo0, _mm256_cvtepi16_epi64( - _mm256_castsi256_si128(eqhi))); - gtlo1 = _mm256_and_si256(gtlo1, _mm256_cvtepi16_epi64( - _mm256_castsi256_si128(_mm256_bsrli_epi128(eqhi, 8)))); - eqm = _mm256_permute4x64_epi64(eqhi, 0xFF); - gtlo2 = _mm256_and_si256(gtlo2, eqm); - gtlo3 = _mm256_and_si256(gtlo3, eqm); - gtlo4 = _mm256_and_si256(gtlo4, eqm); - - /* - * Add all values to count the total number of "-1" elements. - * Since the first eight "high" words are all different, only - * one element (at most) in gtlo0:gtlo1 can be non-zero; however, - * if the high word of the random value is zero, then many - * elements of gtlo2:gtlo3:gtlo4 can be non-zero. - */ - gtlo0 = _mm256_or_si256(gtlo0, gtlo1); - gtlo0 = _mm256_add_epi64( - _mm256_add_epi64(gtlo0, gtlo2), - _mm256_add_epi64(gtlo3, gtlo4)); - t = _mm_add_epi64( - _mm256_castsi256_si128(gtlo0), - _mm256_extracti128_si256(gtlo0, 1)); - t = _mm_add_epi64(t, _mm_srli_si128(t, 8)); - r -= _mm_cvtsi128_si32(t); - - return r; - -#else // yyyAVX2+0 - - static const uint32_t dist[] = { - 10745844u, 3068844u, 3741698u, - 5559083u, 1580863u, 8248194u, - 2260429u, 13669192u, 2736639u, - 708981u, 4421575u, 10046180u, - 169348u, 7122675u, 4136815u, - 30538u, 13063405u, 7650655u, - 4132u, 14505003u, 7826148u, - 417u, 16768101u, 11363290u, - 31u, 8444042u, 8086568u, - 1u, 12844466u, 265321u, - 0u, 1232676u, 13644283u, - 0u, 38047u, 9111839u, - 0u, 870u, 6138264u, - 0u, 14u, 12545723u, - 0u, 0u, 3104126u, - 0u, 0u, 28824u, - 0u, 0u, 198u, - 0u, 0u, 1u - }; - - uint32_t v0, v1, v2, hi; - uint64_t lo; - size_t u; - int z; - - /* - * Get a random 72-bit value, into three 24-bit limbs v0..v2. - */ - lo = prng_get_u64(p); - hi = prng_get_u8(p); - v0 = (uint32_t)lo & 0xFFFFFF; - v1 = (uint32_t)(lo >> 24) & 0xFFFFFF; - v2 = (uint32_t)(lo >> 48) | (hi << 16); - - /* - * Sampled value is z, such that v0..v2 is lower than the first - * z elements of the table. 
- */ - z = 0; - for (u = 0; u < (sizeof dist) / sizeof(dist[0]); u += 3) { - uint32_t w0, w1, w2, cc; - - w0 = dist[u + 2]; - w1 = dist[u + 1]; - w2 = dist[u + 0]; - cc = (v0 - w0) >> 31; - cc = (v1 - w1 - cc) >> 31; - cc = (v2 - w2 - cc) >> 31; - z += (int)cc; - } - return z; - -#endif // yyyAVX2- -} - -/* - * Sample a bit with probability exp(-x) for some x >= 0. - */ -TARGET_AVX2 -static int -BerExp(prng *p, fpr x, fpr ccs) -{ - int s, i; - fpr r; - uint32_t sw, w; - uint64_t z; - - /* - * Reduce x modulo log(2): x = s*log(2) + r, with s an integer, - * and 0 <= r < log(2). Since x >= 0, we can use fpr_trunc(). - */ - s = (int)fpr_trunc(fpr_mul(x, fpr_inv_log2)); - r = fpr_sub(x, fpr_mul(fpr_of(s), fpr_log2)); - - /* - * It may happen (quite rarely) that s >= 64; if sigma = 1.2 - * (the minimum value for sigma), r = 0 and b = 1, then we get - * s >= 64 if the half-Gaussian produced a z >= 13, which happens - * with probability about 0.000000000230383991, which is - * approximatively equal to 2^(-32). In any case, if s >= 64, - * then BerExp will be non-zero with probability less than - * 2^(-64), so we can simply saturate s at 63. - */ - sw = (uint32_t)s; - sw ^= (sw ^ 63) & -((63 - sw) >> 31); - s = (int)sw; - - /* - * Compute exp(-r); we know that 0 <= r < log(2) at this point, so - * we can use fpr_expm_p63(), which yields a result scaled to 2^63. - * We scale it up to 2^64, then right-shift it by s bits because - * we really want exp(-x) = 2^(-s)*exp(-r). - * - * The "-1" operation makes sure that the value fits on 64 bits - * (i.e. if r = 0, we may get 2^64, and we prefer 2^64-1 in that - * case). The bias is negligible since fpr_expm_p63() only computes - * with 51 bits of precision or so. - */ - z = ((fpr_expm_p63(r, ccs) << 1) - 1) >> s; - - /* - * Sample a bit with probability exp(-x). 
Since x = s*log(2) + r, - * exp(-x) = 2^-s * exp(-r), we compare lazily exp(-x) with the - * PRNG output to limit its consumption, the sign of the difference - * yields the expected result. - */ - i = 64; - do { - i -= 8; - w = prng_get_u8(p) - ((uint32_t)(z >> i) & 0xFF); - } while (!w && i > 0); - return (int)(w >> 31); -} - -/* - * The sampler produces a random integer that follows a discrete Gaussian - * distribution, centered on mu, and with standard deviation sigma. The - * provided parameter isigma is equal to 1/sigma. - * - * The value of sigma MUST lie between 1 and 2 (i.e. isigma lies between - * 0.5 and 1); in Falcon, sigma should always be between 1.2 and 1.9. - */ -TARGET_AVX2 -int -Zf(sampler)(void *ctx, fpr mu, fpr isigma) -{ - sampler_context *spc; - int s; - fpr r, dss, ccs; - - spc = ctx; - - /* - * Center is mu. We compute mu = s + r where s is an integer - * and 0 <= r < 1. - */ - s = (int)fpr_floor(mu); - r = fpr_sub(mu, fpr_of(s)); - - /* - * dss = 1/(2*sigma^2) = 0.5*(isigma^2). - */ - dss = fpr_half(fpr_sqr(isigma)); - - /* - * ccs = sigma_min / sigma = sigma_min * isigma. - */ - ccs = fpr_mul(isigma, spc->sigma_min); - - /* - * We now need to sample on center r. - */ - for (;;) { - int z0, z, b; - fpr x; - - /* - * Sample z for a Gaussian distribution. Then get a - * random bit b to turn the sampling into a bimodal - * distribution: if b = 1, we use z+1, otherwise we - * use -z. We thus have two situations: - * - * - b = 1: z >= 1 and sampled against a Gaussian - * centered on 1. - * - b = 0: z <= 0 and sampled against a Gaussian - * centered on 0. - */ - z0 = Zf(gaussian0_sampler)(&spc->p); - b = prng_get_u8(&spc->p) & 1; - z = b + ((b << 1) - 1) * z0; - - /* - * Rejection sampling. We want a Gaussian centered on r; - * but we sampled against a Gaussian centered on b (0 or - * 1). But we know that z is always in the range where - * our sampling distribution is greater than the Gaussian - * distribution, so rejection works. 
- * - * We got z with distribution: - * G(z) = exp(-((z-b)^2)/(2*sigma0^2)) - * We target distribution: - * S(z) = exp(-((z-r)^2)/(2*sigma^2)) - * Rejection sampling works by keeping the value z with - * probability S(z)/G(z), and starting again otherwise. - * This requires S(z) <= G(z), which is the case here. - * Thus, we simply need to keep our z with probability: - * P = exp(-x) - * where: - * x = ((z-r)^2)/(2*sigma^2) - ((z-b)^2)/(2*sigma0^2) - * - * Here, we scale up the Bernouilli distribution, which - * makes rejection more probable, but makes rejection - * rate sufficiently decorrelated from the Gaussian - * center and standard deviation that the whole sampler - * can be said to be constant-time. - */ - x = fpr_mul(fpr_sqr(fpr_sub(fpr_of(z), r)), dss); - x = fpr_sub(x, fpr_mul(fpr_of(z0 * z0), fpr_inv_2sqrsigma0)); - if (BerExp(&spc->p, x, ccs)) { - /* - * Rejection sampling was centered on r, but the - * actual center is mu = s + r. - */ - return s + z; - } - } -} - -/* see inner.h */ -void -Zf(sign_tree)(int16_t *sig, inner_shake256_context *rng, - const fpr *restrict expanded_key, - const uint16_t *hm, unsigned logn, uint8_t *tmp) -{ - fpr *ftmp; - - ftmp = (fpr *)tmp; - for (;;) { - /* - * Signature produces short vectors s1 and s2. The - * signature is acceptable only if the aggregate vector - * s1,s2 is short; we must use the same bound as the - * verifier. - * - * If the signature is acceptable, then we return only s2 - * (the verifier recomputes s1 from s2, the hashed message, - * and the public key). - */ - sampler_context spc; - samplerZ samp; - void *samp_ctx; - - /* - * Normal sampling. We use a fast PRNG seeded from our - * SHAKE context ('rng'). - */ - spc.sigma_min = (logn == 10) - ? fpr_sigma_min_10 - : fpr_sigma_min_9; - Zf(prng_init)(&spc.p, rng); - samp = Zf(sampler); - samp_ctx = &spc; - - /* - * Do the actual signature. 
- */ - if (do_sign_tree(samp, samp_ctx, sig, - expanded_key, hm, logn, ftmp)) - { - break; - } - } -} - -/* see inner.h */ -void -Zf(sign_dyn)(int16_t *sig, inner_shake256_context *rng, - const int8_t *restrict f, const int8_t *restrict g, - const int8_t *restrict F, const int8_t *restrict G, - const uint16_t *hm, unsigned logn, uint8_t *tmp) -{ - fpr *ftmp; - - ftmp = (fpr *)tmp; - for (;;) { - /* - * Signature produces short vectors s1 and s2. The - * signature is acceptable only if the aggregate vector - * s1,s2 is short; we must use the same bound as the - * verifier. - * - * If the signature is acceptable, then we return only s2 - * (the verifier recomputes s1 from s2, the hashed message, - * and the public key). - */ - sampler_context spc; - samplerZ samp; - void *samp_ctx; - - /* - * Normal sampling. We use a fast PRNG seeded from our - * SHAKE context ('rng'). - */ - spc.sigma_min = (logn == 10) - ? fpr_sigma_min_10 - : fpr_sigma_min_9; - Zf(prng_init)(&spc.p, rng); - samp = Zf(sampler); - samp_ctx = &spc; - - /* - * Do the actual signature. - */ - if (do_sign_dyn(samp, samp_ctx, sig, - f, g, F, G, hm, logn, ftmp)) - { - break; - } - } -} diff --git a/crypto_sign/falcon-1024/m4-ct/vrfy.c b/crypto_sign/falcon-1024/m4-ct/vrfy.c deleted file mode 100644 index c74a3dd3..00000000 --- a/crypto_sign/falcon-1024/m4-ct/vrfy.c +++ /dev/null @@ -1,871 +0,0 @@ -/* - * Falcon signature verification. 
- * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include "inner.h" - -/* ===================================================================== */ -/* - * Constants for NTT. - * - * n = 2^logn (2 <= n <= 1024) - * phi = X^n + 1 - * q = 12289 - * q0i = -1/q mod 2^16 - * R = 2^16 mod q - * R2 = 2^32 mod q - */ - -#define Q 12289 -#define Q0I 12287 -#define R 4091 -#define R2 10952 - -/* - * Table for NTT, binary case: - * GMb[x] = R*(g^rev(x)) mod q - * where g = 7 (it is a 2048-th primitive root of 1 modulo q) - * and rev() is the bit-reversal function over 10 bits. 
- */ -static const uint16_t GMb[] = { - 4091, 7888, 11060, 11208, 6960, 4342, 6275, 9759, - 1591, 6399, 9477, 5266, 586, 5825, 7538, 9710, - 1134, 6407, 1711, 965, 7099, 7674, 3743, 6442, - 10414, 8100, 1885, 1688, 1364, 10329, 10164, 9180, - 12210, 6240, 997, 117, 4783, 4407, 1549, 7072, - 2829, 6458, 4431, 8877, 7144, 2564, 5664, 4042, - 12189, 432, 10751, 1237, 7610, 1534, 3983, 7863, - 2181, 6308, 8720, 6570, 4843, 1690, 14, 3872, - 5569, 9368, 12163, 2019, 7543, 2315, 4673, 7340, - 1553, 1156, 8401, 11389, 1020, 2967, 10772, 7045, - 3316, 11236, 5285, 11578, 10637, 10086, 9493, 6180, - 9277, 6130, 3323, 883, 10469, 489, 1502, 2851, - 11061, 9729, 2742, 12241, 4970, 10481, 10078, 1195, - 730, 1762, 3854, 2030, 5892, 10922, 9020, 5274, - 9179, 3604, 3782, 10206, 3180, 3467, 4668, 2446, - 7613, 9386, 834, 7703, 6836, 3403, 5351, 12276, - 3580, 1739, 10820, 9787, 10209, 4070, 12250, 8525, - 10401, 2749, 7338, 10574, 6040, 943, 9330, 1477, - 6865, 9668, 3585, 6633, 12145, 4063, 3684, 7680, - 8188, 6902, 3533, 9807, 6090, 727, 10099, 7003, - 6945, 1949, 9731, 10559, 6057, 378, 7871, 8763, - 8901, 9229, 8846, 4551, 9589, 11664, 7630, 8821, - 5680, 4956, 6251, 8388, 10156, 8723, 2341, 3159, - 1467, 5460, 8553, 7783, 2649, 2320, 9036, 6188, - 737, 3698, 4699, 5753, 9046, 3687, 16, 914, - 5186, 10531, 4552, 1964, 3509, 8436, 7516, 5381, - 10733, 3281, 7037, 1060, 2895, 7156, 8887, 5357, - 6409, 8197, 2962, 6375, 5064, 6634, 5625, 278, - 932, 10229, 8927, 7642, 351, 9298, 237, 5858, - 7692, 3146, 12126, 7586, 2053, 11285, 3802, 5204, - 4602, 1748, 11300, 340, 3711, 4614, 300, 10993, - 5070, 10049, 11616, 12247, 7421, 10707, 5746, 5654, - 3835, 5553, 1224, 8476, 9237, 3845, 250, 11209, - 4225, 6326, 9680, 12254, 4136, 2778, 692, 8808, - 6410, 6718, 10105, 10418, 3759, 7356, 11361, 8433, - 6437, 3652, 6342, 8978, 5391, 2272, 6476, 7416, - 8418, 10824, 11986, 5733, 876, 7030, 2167, 2436, - 3442, 9217, 8206, 4858, 5964, 2746, 7178, 1434, - 7389, 8879, 10661, 11457, 4220, 
1432, 10832, 4328, - 8557, 1867, 9454, 2416, 3816, 9076, 686, 5393, - 2523, 4339, 6115, 619, 937, 2834, 7775, 3279, - 2363, 7488, 6112, 5056, 824, 10204, 11690, 1113, - 2727, 9848, 896, 2028, 5075, 2654, 10464, 7884, - 12169, 5434, 3070, 6400, 9132, 11672, 12153, 4520, - 1273, 9739, 11468, 9937, 10039, 9720, 2262, 9399, - 11192, 315, 4511, 1158, 6061, 6751, 11865, 357, - 7367, 4550, 983, 8534, 8352, 10126, 7530, 9253, - 4367, 5221, 3999, 8777, 3161, 6990, 4130, 11652, - 3374, 11477, 1753, 292, 8681, 2806, 10378, 12188, - 5800, 11811, 3181, 1988, 1024, 9340, 2477, 10928, - 4582, 6750, 3619, 5503, 5233, 2463, 8470, 7650, - 7964, 6395, 1071, 1272, 3474, 11045, 3291, 11344, - 8502, 9478, 9837, 1253, 1857, 6233, 4720, 11561, - 6034, 9817, 3339, 1797, 2879, 6242, 5200, 2114, - 7962, 9353, 11363, 5475, 6084, 9601, 4108, 7323, - 10438, 9471, 1271, 408, 6911, 3079, 360, 8276, - 11535, 9156, 9049, 11539, 850, 8617, 784, 7919, - 8334, 12170, 1846, 10213, 12184, 7827, 11903, 5600, - 9779, 1012, 721, 2784, 6676, 6552, 5348, 4424, - 6816, 8405, 9959, 5150, 2356, 5552, 5267, 1333, - 8801, 9661, 7308, 5788, 4910, 909, 11613, 4395, - 8238, 6686, 4302, 3044, 2285, 12249, 1963, 9216, - 4296, 11918, 695, 4371, 9793, 4884, 2411, 10230, - 2650, 841, 3890, 10231, 7248, 8505, 11196, 6688, - 4059, 6060, 3686, 4722, 11853, 5816, 7058, 6868, - 11137, 7926, 4894, 12284, 4102, 3908, 3610, 6525, - 7938, 7982, 11977, 6755, 537, 4562, 1623, 8227, - 11453, 7544, 906, 11816, 9548, 10858, 9703, 2815, - 11736, 6813, 6979, 819, 8903, 6271, 10843, 348, - 7514, 8339, 6439, 694, 852, 5659, 2781, 3716, - 11589, 3024, 1523, 8659, 4114, 10738, 3303, 5885, - 2978, 7289, 11884, 9123, 9323, 11830, 98, 2526, - 2116, 4131, 11407, 1844, 3645, 3916, 8133, 2224, - 10871, 8092, 9651, 5989, 7140, 8480, 1670, 159, - 10923, 4918, 128, 7312, 725, 9157, 5006, 6393, - 3494, 6043, 10972, 6181, 11838, 3423, 10514, 7668, - 3693, 6658, 6905, 11953, 10212, 11922, 9101, 8365, - 5110, 45, 2400, 1921, 4377, 2720, 1695, 51, - 
2808, 650, 1896, 9997, 9971, 11980, 8098, 4833, - 4135, 4257, 5838, 4765, 10985, 11532, 590, 12198, - 482, 12173, 2006, 7064, 10018, 3912, 12016, 10519, - 11362, 6954, 2210, 284, 5413, 6601, 3865, 10339, - 11188, 6231, 517, 9564, 11281, 3863, 1210, 4604, - 8160, 11447, 153, 7204, 5763, 5089, 9248, 12154, - 11748, 1354, 6672, 179, 5532, 2646, 5941, 12185, - 862, 3158, 477, 7279, 5678, 7914, 4254, 302, - 2893, 10114, 6890, 9560, 9647, 11905, 4098, 9824, - 10269, 1353, 10715, 5325, 6254, 3951, 1807, 6449, - 5159, 1308, 8315, 3404, 1877, 1231, 112, 6398, - 11724, 12272, 7286, 1459, 12274, 9896, 3456, 800, - 1397, 10678, 103, 7420, 7976, 936, 764, 632, - 7996, 8223, 8445, 7758, 10870, 9571, 2508, 1946, - 6524, 10158, 1044, 4338, 2457, 3641, 1659, 4139, - 4688, 9733, 11148, 3946, 2082, 5261, 2036, 11850, - 7636, 12236, 5366, 2380, 1399, 7720, 2100, 3217, - 10912, 8898, 7578, 11995, 2791, 1215, 3355, 2711, - 2267, 2004, 8568, 10176, 3214, 2337, 1750, 4729, - 4997, 7415, 6315, 12044, 4374, 7157, 4844, 211, - 8003, 10159, 9290, 11481, 1735, 2336, 5793, 9875, - 8192, 986, 7527, 1401, 870, 3615, 8465, 2756, - 9770, 2034, 10168, 3264, 6132, 54, 2880, 4763, - 11805, 3074, 8286, 9428, 4881, 6933, 1090, 10038, - 2567, 708, 893, 6465, 4962, 10024, 2090, 5718, - 10743, 780, 4733, 4623, 2134, 2087, 4802, 884, - 5372, 5795, 5938, 4333, 6559, 7549, 5269, 10664, - 4252, 3260, 5917, 10814, 5768, 9983, 8096, 7791, - 6800, 7491, 6272, 1907, 10947, 6289, 11803, 6032, - 11449, 1171, 9201, 7933, 2479, 7970, 11337, 7062, - 8911, 6728, 6542, 8114, 8828, 6595, 3545, 4348, - 4610, 2205, 6999, 8106, 5560, 10390, 9321, 2499, - 2413, 7272, 6881, 10582, 9308, 9437, 3554, 3326, - 5991, 11969, 3415, 12283, 9838, 12063, 4332, 7830, - 11329, 6605, 12271, 2044, 11611, 7353, 11201, 11582, - 3733, 8943, 9978, 1627, 7168, 3935, 5050, 2762, - 7496, 10383, 755, 1654, 12053, 4952, 10134, 4394, - 6592, 7898, 7497, 8904, 12029, 3581, 10748, 5674, - 10358, 4901, 7414, 8771, 710, 6764, 8462, 7193, - 5371, 7274, 
11084, 290, 7864, 6827, 11822, 2509, - 6578, 4026, 5807, 1458, 5721, 5762, 4178, 2105, - 11621, 4852, 8897, 2856, 11510, 9264, 2520, 8776, - 7011, 2647, 1898, 7039, 5950, 11163, 5488, 6277, - 9182, 11456, 633, 10046, 11554, 5633, 9587, 2333, - 7008, 7084, 5047, 7199, 9865, 8997, 569, 6390, - 10845, 9679, 8268, 11472, 4203, 1997, 2, 9331, - 162, 6182, 2000, 3649, 9792, 6363, 7557, 6187, - 8510, 9935, 5536, 9019, 3706, 12009, 1452, 3067, - 5494, 9692, 4865, 6019, 7106, 9610, 4588, 10165, - 6261, 5887, 2652, 10172, 1580, 10379, 4638, 9949 -}; - -/* - * Table for inverse NTT, binary case: - * iGMb[x] = R*((1/g)^rev(x)) mod q - * Since g = 7, 1/g = 8778 mod 12289. - */ -static const uint16_t iGMb[] = { - 4091, 4401, 1081, 1229, 2530, 6014, 7947, 5329, - 2579, 4751, 6464, 11703, 7023, 2812, 5890, 10698, - 3109, 2125, 1960, 10925, 10601, 10404, 4189, 1875, - 5847, 8546, 4615, 5190, 11324, 10578, 5882, 11155, - 8417, 12275, 10599, 7446, 5719, 3569, 5981, 10108, - 4426, 8306, 10755, 4679, 11052, 1538, 11857, 100, - 8247, 6625, 9725, 5145, 3412, 7858, 5831, 9460, - 5217, 10740, 7882, 7506, 12172, 11292, 6049, 79, - 13, 6938, 8886, 5453, 4586, 11455, 2903, 4676, - 9843, 7621, 8822, 9109, 2083, 8507, 8685, 3110, - 7015, 3269, 1367, 6397, 10259, 8435, 10527, 11559, - 11094, 2211, 1808, 7319, 48, 9547, 2560, 1228, - 9438, 10787, 11800, 1820, 11406, 8966, 6159, 3012, - 6109, 2796, 2203, 1652, 711, 7004, 1053, 8973, - 5244, 1517, 9322, 11269, 900, 3888, 11133, 10736, - 4949, 7616, 9974, 4746, 10270, 126, 2921, 6720, - 6635, 6543, 1582, 4868, 42, 673, 2240, 7219, - 1296, 11989, 7675, 8578, 11949, 989, 10541, 7687, - 7085, 8487, 1004, 10236, 4703, 163, 9143, 4597, - 6431, 12052, 2991, 11938, 4647, 3362, 2060, 11357, - 12011, 6664, 5655, 7225, 5914, 9327, 4092, 5880, - 6932, 3402, 5133, 9394, 11229, 5252, 9008, 1556, - 6908, 4773, 3853, 8780, 10325, 7737, 1758, 7103, - 11375, 12273, 8602, 3243, 6536, 7590, 8591, 11552, - 6101, 3253, 9969, 9640, 4506, 3736, 6829, 10822, - 9130, 9948, 
3566, 2133, 3901, 6038, 7333, 6609, - 3468, 4659, 625, 2700, 7738, 3443, 3060, 3388, - 3526, 4418, 11911, 6232, 1730, 2558, 10340, 5344, - 5286, 2190, 11562, 6199, 2482, 8756, 5387, 4101, - 4609, 8605, 8226, 144, 5656, 8704, 2621, 5424, - 10812, 2959, 11346, 6249, 1715, 4951, 9540, 1888, - 3764, 39, 8219, 2080, 2502, 1469, 10550, 8709, - 5601, 1093, 3784, 5041, 2058, 8399, 11448, 9639, - 2059, 9878, 7405, 2496, 7918, 11594, 371, 7993, - 3073, 10326, 40, 10004, 9245, 7987, 5603, 4051, - 7894, 676, 11380, 7379, 6501, 4981, 2628, 3488, - 10956, 7022, 6737, 9933, 7139, 2330, 3884, 5473, - 7865, 6941, 5737, 5613, 9505, 11568, 11277, 2510, - 6689, 386, 4462, 105, 2076, 10443, 119, 3955, - 4370, 11505, 3672, 11439, 750, 3240, 3133, 754, - 4013, 11929, 9210, 5378, 11881, 11018, 2818, 1851, - 4966, 8181, 2688, 6205, 6814, 926, 2936, 4327, - 10175, 7089, 6047, 9410, 10492, 8950, 2472, 6255, - 728, 7569, 6056, 10432, 11036, 2452, 2811, 3787, - 945, 8998, 1244, 8815, 11017, 11218, 5894, 4325, - 4639, 3819, 9826, 7056, 6786, 8670, 5539, 7707, - 1361, 9812, 2949, 11265, 10301, 9108, 478, 6489, - 101, 1911, 9483, 3608, 11997, 10536, 812, 8915, - 637, 8159, 5299, 9128, 3512, 8290, 7068, 7922, - 3036, 4759, 2163, 3937, 3755, 11306, 7739, 4922, - 11932, 424, 5538, 6228, 11131, 7778, 11974, 1097, - 2890, 10027, 2569, 2250, 2352, 821, 2550, 11016, - 7769, 136, 617, 3157, 5889, 9219, 6855, 120, - 4405, 1825, 9635, 7214, 10261, 11393, 2441, 9562, - 11176, 599, 2085, 11465, 7233, 6177, 4801, 9926, - 9010, 4514, 9455, 11352, 11670, 6174, 7950, 9766, - 6896, 11603, 3213, 8473, 9873, 2835, 10422, 3732, - 7961, 1457, 10857, 8069, 832, 1628, 3410, 4900, - 10855, 5111, 9543, 6325, 7431, 4083, 3072, 8847, - 9853, 10122, 5259, 11413, 6556, 303, 1465, 3871, - 4873, 5813, 10017, 6898, 3311, 5947, 8637, 5852, - 3856, 928, 4933, 8530, 1871, 2184, 5571, 5879, - 3481, 11597, 9511, 8153, 35, 2609, 5963, 8064, - 1080, 12039, 8444, 3052, 3813, 11065, 6736, 8454, - 2340, 7651, 1910, 10709, 2117, 9637, 
6402, 6028, - 2124, 7701, 2679, 5183, 6270, 7424, 2597, 6795, - 9222, 10837, 280, 8583, 3270, 6753, 2354, 3779, - 6102, 4732, 5926, 2497, 8640, 10289, 6107, 12127, - 2958, 12287, 10292, 8086, 817, 4021, 2610, 1444, - 5899, 11720, 3292, 2424, 5090, 7242, 5205, 5281, - 9956, 2702, 6656, 735, 2243, 11656, 833, 3107, - 6012, 6801, 1126, 6339, 5250, 10391, 9642, 5278, - 3513, 9769, 3025, 779, 9433, 3392, 7437, 668, - 10184, 8111, 6527, 6568, 10831, 6482, 8263, 5711, - 9780, 467, 5462, 4425, 11999, 1205, 5015, 6918, - 5096, 3827, 5525, 11579, 3518, 4875, 7388, 1931, - 6615, 1541, 8708, 260, 3385, 4792, 4391, 5697, - 7895, 2155, 7337, 236, 10635, 11534, 1906, 4793, - 9527, 7239, 8354, 5121, 10662, 2311, 3346, 8556, - 707, 1088, 4936, 678, 10245, 18, 5684, 960, - 4459, 7957, 226, 2451, 6, 8874, 320, 6298, - 8963, 8735, 2852, 2981, 1707, 5408, 5017, 9876, - 9790, 2968, 1899, 6729, 4183, 5290, 10084, 7679, - 7941, 8744, 5694, 3461, 4175, 5747, 5561, 3378, - 5227, 952, 4319, 9810, 4356, 3088, 11118, 840, - 6257, 486, 6000, 1342, 10382, 6017, 4798, 5489, - 4498, 4193, 2306, 6521, 1475, 6372, 9029, 8037, - 1625, 7020, 4740, 5730, 7956, 6351, 6494, 6917, - 11405, 7487, 10202, 10155, 7666, 7556, 11509, 1546, - 6571, 10199, 2265, 7327, 5824, 11396, 11581, 9722, - 2251, 11199, 5356, 7408, 2861, 4003, 9215, 484, - 7526, 9409, 12235, 6157, 9025, 2121, 10255, 2519, - 9533, 3824, 8674, 11419, 10888, 4762, 11303, 4097, - 2414, 6496, 9953, 10554, 808, 2999, 2130, 4286, - 12078, 7445, 5132, 7915, 245, 5974, 4874, 7292, - 7560, 10539, 9952, 9075, 2113, 3721, 10285, 10022, - 9578, 8934, 11074, 9498, 294, 4711, 3391, 1377, - 9072, 10189, 4569, 10890, 9909, 6923, 53, 4653, - 439, 10253, 7028, 10207, 8343, 1141, 2556, 7601, - 8150, 10630, 8648, 9832, 7951, 11245, 2131, 5765, - 10343, 9781, 2718, 1419, 4531, 3844, 4066, 4293, - 11657, 11525, 11353, 4313, 4869, 12186, 1611, 10892, - 11489, 8833, 2393, 15, 10830, 5003, 17, 565, - 5891, 12177, 11058, 10412, 8885, 3974, 10981, 7130, - 5840, 10482, 
8338, 6035, 6964, 1574, 10936, 2020, - 2465, 8191, 384, 2642, 2729, 5399, 2175, 9396, - 11987, 8035, 4375, 6611, 5010, 11812, 9131, 11427, - 104, 6348, 9643, 6757, 12110, 5617, 10935, 541, - 135, 3041, 7200, 6526, 5085, 12136, 842, 4129, - 7685, 11079, 8426, 1008, 2725, 11772, 6058, 1101, - 1950, 8424, 5688, 6876, 12005, 10079, 5335, 927, - 1770, 273, 8377, 2271, 5225, 10283, 116, 11807, - 91, 11699, 757, 1304, 7524, 6451, 8032, 8154, - 7456, 4191, 309, 2318, 2292, 10393, 11639, 9481, - 12238, 10594, 9569, 7912, 10368, 9889, 12244, 7179, - 3924, 3188, 367, 2077, 336, 5384, 5631, 8596, - 4621, 1775, 8866, 451, 6108, 1317, 6246, 8795, - 5896, 7283, 3132, 11564, 4977, 12161, 7371, 1366, - 12130, 10619, 3809, 5149, 6300, 2638, 4197, 1418, - 10065, 4156, 8373, 8644, 10445, 882, 8158, 10173, - 9763, 12191, 459, 2966, 3166, 405, 5000, 9311, - 6404, 8986, 1551, 8175, 3630, 10766, 9265, 700, - 8573, 9508, 6630, 11437, 11595, 5850, 3950, 4775, - 11941, 1446, 6018, 3386, 11470, 5310, 5476, 553, - 9474, 2586, 1431, 2741, 473, 11383, 4745, 836, - 4062, 10666, 7727, 11752, 5534, 312, 4307, 4351, - 5764, 8679, 8381, 8187, 5, 7395, 4363, 1152, - 5421, 5231, 6473, 436, 7567, 8603, 6229, 8230 -}; - -/* - * Reduce a small signed integer modulo q. The source integer MUST - * be between -q/2 and +q/2. - */ -static inline uint32_t -mq_conv_small(int x) -{ - /* - * If x < 0, the cast to uint32_t will set the high bit to 1. - */ - uint32_t y; - - y = (uint32_t)x; - y += Q & -(y >> 31); - return y; -} - -/* - * Addition modulo q. Operands must be in the 0..q-1 range. - */ -static inline uint32_t -mq_add(uint32_t x, uint32_t y) -{ - /* - * We compute x + y - q. If the result is negative, then the - * high bit will be set, and 'd >> 31' will be equal to 1; - * thus '-(d >> 31)' will be an all-one pattern. Otherwise, - * it will be an all-zero pattern. In other words, this - * implements a conditional addition of q. 
- */ - uint32_t d; - - d = x + y - Q; - d += Q & -(d >> 31); - return d; -} - -/* - * Subtraction modulo q. Operands must be in the 0..q-1 range. - */ -static inline uint32_t -mq_sub(uint32_t x, uint32_t y) -{ - /* - * As in mq_add(), we use a conditional addition to ensure the - * result is in the 0..q-1 range. - */ - uint32_t d; - - d = x - y; - d += Q & -(d >> 31); - return d; -} - -/* - * Division by 2 modulo q. Operand must be in the 0..q-1 range. - */ -static inline uint32_t -mq_rshift1(uint32_t x) -{ - x += Q & -(x & 1); - return (x >> 1); -} - -/* - * Montgomery multiplication modulo q. If we set R = 2^16 mod q, then - * this function computes: x * y / R mod q - * Operands must be in the 0..q-1 range. - */ -static inline uint32_t -mq_montymul(uint32_t x, uint32_t y) -{ - uint32_t z, w; - - /* - * We compute x*y + k*q with a value of k chosen so that the 16 - * low bits of the result are 0. We can then shift the value. - * After the shift, result may still be larger than q, but it - * will be lower than 2*q, so a conditional subtraction works. - */ - - z = x * y; - w = ((z * Q0I) & 0xFFFF) * Q; - - /* - * When adding z and w, the result will have its low 16 bits - * equal to 0. Since x, y and z are lower than q, the sum will - * be no more than (2^15 - 1) * q + (q - 1)^2, which will - * fit on 29 bits. - */ - z = (z + w) >> 16; - - /* - * After the shift, analysis shows that the value will be less - * than 2q. We do a subtraction then conditional subtraction to - * ensure the result is in the expected range. - */ - z -= Q; - z += Q & -(z >> 31); - return z; -} - -/* - * Montgomery squaring (computes (x^2)/R). - */ -static inline uint32_t -mq_montysqr(uint32_t x) -{ - return mq_montymul(x, x); -} - -/* - * Divide x by y modulo q = 12289. - */ -static inline uint32_t -mq_div_12289(uint32_t x, uint32_t y) -{ - /* - * We invert y by computing y^(q-2) mod q. 
- * - * We use the following addition chain for exponent e = 12287: - * - * e0 = 1 - * e1 = 2 * e0 = 2 - * e2 = e1 + e0 = 3 - * e3 = e2 + e1 = 5 - * e4 = 2 * e3 = 10 - * e5 = 2 * e4 = 20 - * e6 = 2 * e5 = 40 - * e7 = 2 * e6 = 80 - * e8 = 2 * e7 = 160 - * e9 = e8 + e2 = 163 - * e10 = e9 + e8 = 323 - * e11 = 2 * e10 = 646 - * e12 = 2 * e11 = 1292 - * e13 = e12 + e9 = 1455 - * e14 = 2 * e13 = 2910 - * e15 = 2 * e14 = 5820 - * e16 = e15 + e10 = 6143 - * e17 = 2 * e16 = 12286 - * e18 = e17 + e0 = 12287 - * - * Additions on exponents are converted to Montgomery - * multiplications. We define all intermediate results as so - * many local variables, and let the C compiler work out which - * must be kept around. - */ - uint32_t y0, y1, y2, y3, y4, y5, y6, y7, y8, y9; - uint32_t y10, y11, y12, y13, y14, y15, y16, y17, y18; - - y0 = mq_montymul(y, R2); - y1 = mq_montysqr(y0); - y2 = mq_montymul(y1, y0); - y3 = mq_montymul(y2, y1); - y4 = mq_montysqr(y3); - y5 = mq_montysqr(y4); - y6 = mq_montysqr(y5); - y7 = mq_montysqr(y6); - y8 = mq_montysqr(y7); - y9 = mq_montymul(y8, y2); - y10 = mq_montymul(y9, y8); - y11 = mq_montysqr(y10); - y12 = mq_montysqr(y11); - y13 = mq_montymul(y12, y9); - y14 = mq_montysqr(y13); - y15 = mq_montysqr(y14); - y16 = mq_montymul(y15, y10); - y17 = mq_montysqr(y16); - y18 = mq_montymul(y17, y0); - - /* - * Final multiplication with x, which is not in Montgomery - * representation, computes the correct division result. - */ - return mq_montymul(y18, x); -} - -/* - * Compute NTT on a ring element. 
- */ -static void -mq_NTT(uint16_t *a, unsigned logn) -{ - size_t n, t, m; - - n = (size_t)1 << logn; - t = n; - for (m = 1; m < n; m <<= 1) { - size_t ht, i, j1; - - ht = t >> 1; - for (i = 0, j1 = 0; i < m; i ++, j1 += t) { - size_t j, j2; - uint32_t s; - - s = GMb[m + i]; - j2 = j1 + ht; - for (j = j1; j < j2; j ++) { - uint32_t u, v; - - u = a[j]; - v = mq_montymul(a[j + ht], s); - a[j] = (uint16_t)mq_add(u, v); - a[j + ht] = (uint16_t)mq_sub(u, v); - } - } - t = ht; - } -} - -/* - * Compute the inverse NTT on a ring element, binary case. - */ -static void -mq_iNTT(uint16_t *a, unsigned logn) -{ - size_t n, t, m; - uint32_t ni; - - n = (size_t)1 << logn; - t = 1; - m = n; - while (m > 1) { - size_t hm, dt, i, j1; - - hm = m >> 1; - dt = t << 1; - for (i = 0, j1 = 0; i < hm; i ++, j1 += dt) { - size_t j, j2; - uint32_t s; - - j2 = j1 + t; - s = iGMb[hm + i]; - for (j = j1; j < j2; j ++) { - uint32_t u, v, w; - - u = a[j]; - v = a[j + t]; - a[j] = (uint16_t)mq_add(u, v); - w = mq_sub(u, v); - a[j + t] = (uint16_t) - mq_montymul(w, s); - } - } - t = dt; - m = hm; - } - - /* - * To complete the inverse NTT, we must now divide all values by - * n (the vector size). We thus need the inverse of n, i.e. we - * need to divide 1 by 2 logn times. But we also want it in - * Montgomery representation, i.e. we also want to multiply it - * by R = 2^16. In the common case, this should be a simple right - * shift. The loop below is generic and works also in corner cases; - * its computation time is negligible. - */ - ni = R; - for (m = n; m > 1; m >>= 1) { - ni = mq_rshift1(ni); - } - for (m = 0; m < n; m ++) { - a[m] = (uint16_t)mq_montymul(a[m], ni); - } -} - -/* - * Convert a polynomial (mod q) to Montgomery representation. 
- */ -static void -mq_poly_tomonty(uint16_t *f, unsigned logn) -{ - size_t u, n; - - n = (size_t)1 << logn; - for (u = 0; u < n; u ++) { - f[u] = (uint16_t)mq_montymul(f[u], R2); - } -} - -/* - * Multiply two polynomials together (NTT representation, and using - * a Montgomery multiplication). Result f*g is written over f. - */ -static void -mq_poly_montymul_ntt(uint16_t *f, const uint16_t *g, unsigned logn) -{ - size_t u, n; - - n = (size_t)1 << logn; - for (u = 0; u < n; u ++) { - f[u] = (uint16_t)mq_montymul(f[u], g[u]); - } -} - -/* - * Subtract polynomial g from polynomial f. - */ -static void -mq_poly_sub(uint16_t *f, const uint16_t *g, unsigned logn) -{ - size_t u, n; - - n = (size_t)1 << logn; - for (u = 0; u < n; u ++) { - f[u] = (uint16_t)mq_sub(f[u], g[u]); - } -} - -/* ===================================================================== */ - -/* see inner.h */ -void -Zf(to_ntt_monty)(uint16_t *h, unsigned logn) -{ - mq_NTT(h, logn); - mq_poly_tomonty(h, logn); -} - -/* see inner.h */ -int -Zf(verify_raw)(const uint16_t *c0, const int16_t *s2, - const uint16_t *h, unsigned logn, uint8_t *tmp) -{ - size_t u, n; - uint16_t *tt; - - n = (size_t)1 << logn; - tt = (uint16_t *)tmp; - - /* - * Reduce s2 elements modulo q ([0..q-1] range). - */ - for (u = 0; u < n; u ++) { - uint32_t w; - - w = (uint32_t)s2[u]; - w += Q & -(w >> 31); - tt[u] = (uint16_t)w; - } - - /* - * Compute -s1 = s2*h - c0 mod phi mod q (in tt[]). - */ - mq_NTT(tt, logn); - mq_poly_montymul_ntt(tt, h, logn); - mq_iNTT(tt, logn); - mq_poly_sub(tt, c0, logn); - - /* - * Normalize -s1 elements into the [-q/2..q/2] range. - */ - for (u = 0; u < n; u ++) { - int32_t w; - - w = (int32_t)tt[u]; - w -= (int32_t)(Q & -(((Q >> 1) - (uint32_t)w) >> 31)); - ((int16_t *)tt)[u] = (int16_t)w; - } - - /* - * Signature is valid if and only if the aggregate (-s1,s2) vector - * is short enough. 
- */ - return Zf(is_short)((int16_t *)tt, s2, logn); -} - -/* see inner.h */ -int -Zf(compute_public)(uint16_t *h, - const int8_t *f, const int8_t *g, unsigned logn, uint8_t *tmp) -{ - size_t u, n; - uint16_t *tt; - - n = (size_t)1 << logn; - tt = (uint16_t *)tmp; - for (u = 0; u < n; u ++) { - tt[u] = (uint16_t)mq_conv_small(f[u]); - h[u] = (uint16_t)mq_conv_small(g[u]); - } - mq_NTT(h, logn); - mq_NTT(tt, logn); - for (u = 0; u < n; u ++) { - if (tt[u] == 0) { - return 0; - } - h[u] = (uint16_t)mq_div_12289(h[u], tt[u]); - } - mq_iNTT(h, logn); - return 1; -} - -/* see inner.h */ -int -Zf(complete_private)(int8_t *G, - const int8_t *f, const int8_t *g, const int8_t *F, - unsigned logn, uint8_t *tmp) -{ - size_t u, n; - uint16_t *t1, *t2; - - n = (size_t)1 << logn; - t1 = (uint16_t *)tmp; - t2 = t1 + n; - for (u = 0; u < n; u ++) { - t1[u] = (uint16_t)mq_conv_small(g[u]); - t2[u] = (uint16_t)mq_conv_small(F[u]); - } - mq_NTT(t1, logn); - mq_NTT(t2, logn); - mq_poly_tomonty(t1, logn); - mq_poly_montymul_ntt(t1, t2, logn); - for (u = 0; u < n; u ++) { - t2[u] = (uint16_t)mq_conv_small(f[u]); - } - mq_NTT(t2, logn); - for (u = 0; u < n; u ++) { - if (t2[u] == 0) { - return 0; - } - t1[u] = (uint16_t)mq_div_12289(t1[u], t2[u]); - } - mq_iNTT(t1, logn); - for (u = 0; u < n; u ++) { - uint32_t w; - int32_t gi; - - w = t1[u]; - w -= (Q & ~-((w - (Q >> 1)) >> 31)); - gi = *(int32_t *)&w; - if (gi < -127 || gi > +127) { - return 0; - } - G[u] = (int8_t)gi; - } - return 1; -} - -/* see inner.h */ -int -Zf(is_invertible)( - const int16_t *s2, unsigned logn, uint8_t *tmp) -{ - size_t u, n; - uint16_t *tt; - uint32_t r; - - n = (size_t)1 << logn; - tt = (uint16_t *)tmp; - for (u = 0; u < n; u ++) { - uint32_t w; - - w = (uint32_t)s2[u]; - w += Q & -(w >> 31); - tt[u] = (uint16_t)w; - } - mq_NTT(tt, logn); - r = 0; - for (u = 0; u < n; u ++) { - r |= (uint32_t)(tt[u] - 1); - } - return (int)(1u - (r >> 31)); -} - -/* see inner.h */ -int -Zf(verify_recover)(uint16_t *h, - const 
uint16_t *c0, const int16_t *s1, const int16_t *s2, - unsigned logn, uint8_t *tmp) -{ - size_t u, n; - uint16_t *tt; - uint32_t r; - - n = (size_t)1 << logn; - - /* - * Reduce elements of s1 and s2 modulo q; then write s2 into tt[] - * and c0 - s1 into h[]. - */ - tt = (uint16_t *)tmp; - for (u = 0; u < n; u ++) { - uint32_t w; - - w = (uint32_t)s2[u]; - w += Q & -(w >> 31); - tt[u] = (uint16_t)w; - - w = (uint32_t)s1[u]; - w += Q & -(w >> 31); - w = mq_sub(c0[u], w); - h[u] = (uint16_t)w; - } - - /* - * Compute h = (c0 - s1) / s2. If one of the coefficients of s2 - * is zero (in NTT representation) then the operation fails. We - * keep that information into a flag so that we do not deviate - * from strict constant-time processing; if all coefficients of - * s2 are non-zero, then the high bit of r will be zero. - */ - mq_NTT(tt, logn); - mq_NTT(h, logn); - r = 0; - for (u = 0; u < n; u ++) { - r |= (uint32_t)(tt[u] - 1); - h[u] = (uint16_t)mq_div_12289(h[u], tt[u]); - } - mq_iNTT(h, logn); - - /* - * Signature is acceptable if and only if it is short enough, - * and s2 was invertible mod phi mod q. The caller must still - * check that the rebuilt public key matches the expected - * value (e.g. through a hash). 
- */ - r = ~r & (uint32_t)-Zf(is_short)(s1, s2, logn); - return (int)(r >> 31); -} - -/* see inner.h */ -int -Zf(count_nttzero)(const int16_t *sig, unsigned logn, uint8_t *tmp) -{ - uint16_t *s2; - size_t u, n; - uint32_t r; - - n = (size_t)1 << logn; - s2 = (uint16_t *)tmp; - for (u = 0; u < n; u ++) { - uint32_t w; - - w = (uint32_t)sig[u]; - w += Q & -(w >> 31); - s2[u] = (uint16_t)w; - } - mq_NTT(s2, logn); - r = 0; - for (u = 0; u < n; u ++) { - uint32_t w; - - w = (uint32_t)s2[u] - 1u; - r += (w >> 31); - } - return (int)r; -} diff --git a/crypto_sign/falcon-512-tree/m4-ct/README.txt b/crypto_sign/falcon-512-tree/m4-ct/README.txt deleted file mode 100644 index 7bedf7f1..00000000 --- a/crypto_sign/falcon-512-tree/m4-ct/README.txt +++ /dev/null @@ -1,137 +0,0 @@ -Falcon implementation for PQM4 (or even mupq in general). - - -There are multiple variants. Each variant is selected with the choice of -api.h (four choices: api512dyn.h, api512tree.h, api1024dyn.h, -api1024tree.h), and additional compile-time macro that are documented in -config.h and can be set either in config.h, or through command-line -flags passed to the C compiler. - -Choice of api.h: - - api512dyn.h - "Normal" Falcon-512. Private key is reasonably compact. The - Falcon LDL tree is internally recomputed for each signature. - - api512tree.h - Falcon-512 is key expansion. The Falcon LDL tree is computed - as part of the keygen, and returned as private key. This - speeds up signature generation, but also greatly enlarges - the private key size. - - api1024dyn.h - "Normal" Falcon-1024. - - api1024tree.h - Falcon-1024 with key expansion. - -Compile-time options (config.h): - - FALCON_FPEMU - Set to 1 to enable use of the internal constant-time emulation - of floating-point operations. - - FALCON_FPNATIVE - Set to 1 to use the native 'double' type and floating-point - operations. 
On architectures that lack a FPU, this will use the - compiler-provided floating-point emulation routines, which are - usually not constant-time (and sometimes return values which - do not follow IEEE-754 rounding rules). - - FALCON_ASM_CORTEXM4 - Set to 1 to use the M4 assembly routine for the constant-time - emulation of floating-point operations. These are faster than - the generic routines in C activated by FALCON_FPEMU. - -There is some internal autodetection that tries to select the right -values automatically, but it's safer to explicitly select things: - - To use the native 'double' type: - -DFALCON_FPNATIVE=1 - - To use the generic FP emulation code: - -DFALCON_FPEMU=1 -DFALCON_ASM_CORTEXM4=0 - - To use the M4 assembly code for FP emulation: - -DFALCON_FPEMU=1 -DFALCON_ASM_CORTEXM4=1 - -The code relying on the native 'double' type requires an implementation -that follows IEEE-754 rules with a 64-bit type. It works on 64-bit x86 -and PowerPC / POWER systems. On 32-bit x86, it tends to fail because the -80387 FPU is used with more precision; on such a system, use -'-msse2 -mfpmath=sse' to force use of the SSE2 unit (this might be the -default on some systems, e.g. Darwin / macOS). - - -IMPORTANT NOTES -=============== - - * The PQM4 API is implemented in pqm4.c. Since the M4 stack is usually - small (usual default is 4 kB), temporary buffers are statically - allocated. This implies that the crypto_sign_keypair(), crypto_sign() - and crypto_sign_open() functions are not thread-safe or reentrant. - Also, the static allocation is "forever". - - See the comments for the 'tmp' variable in pqm4.c; this gives the - relevant sizes. - - * When using expanded keys, the private key contains 64-bit values - (floating-point, i.e. 'double' or 'uint64_t' depending on the kind - of floating-point emulation that is used). On many systems, this - implies some alignment requirements. I.e. 
crypto_sign_keypair() and - crypto_sign() then require the 'sk' pointer to be suitably aligned. - On an ARM Cortex M4, 32-bit alignment is required (while the basic - RAM access opcodes tolerate unaligned accesses, the 'ldm' and 'stm' - opcodes need 32-bit aligned pointers). - - * When using the native 'double' type, the code has a dependency on - the sqrt() function. On x86, the relevant SSE2 opcode is inlined, - but the library function is still (potentially) invoked in case the - operand is negative, so that proper error management is performed. - This case does not happen in Falcon, but the library function is - still referenced, and explicitly linking with '-lm' may be - necessary. - - * When using the native 'double' type, do _NOT_ enable -ffast-math. - The internal rounding function relies on the usual trick: - when x >= 0, round(x) = (x + 2**52) - 2**52 - - This trick works only as long as each addition is rounded as per - the IEEE-754 rules to the exact precision of the 64-bit type. - When -ffast-math is enabled, the compiler may assume commutativity - and "optimize" that expression into 'round(x) = x', which does not - work at all. - - -TESTS -===== - -In the 'tests/' directory is a generator for known-answer tests, and the -expected file. The code comes from the NIST, but was modified to avoid a -dependency on OpenSSL. When compiling the C source file against the -selected Falcon implementation, an executable is produced, that, when -executed, generates an '*.req' and an '*.rsp' files. The .req file is -redundant (the .rsp file contains all the information, and some more). - -The expected .rsp files are provided as: - KAT512dyn.rsp Falcon-512, no expanded key - KAT512tree.rsp Falcon-512, with expanded key - KAT1024dyn.rsp Falcon-1024, no expanded key - KAT1024tree.rsp Falcon-1024, with expanded key - - -Normally, all computations are exact and the files are exactly -reproducible. 
However, some discrepancies may occur with the '*tree' -files in the following cases: - - - On big-endian architectures, the bytes in sk[] will be in a - different order. This is a side effect of putting the raw bytes - of the expanded key in sk[] (this could be fixed with some - reencoding pass, but this was not implemented yet). - - - If a non-exact IEEE-754 implementation is used, some of the - low bits of the values may be changed. This may happen if the - underlying implementation is not strictly faithful to rounding. - -As long as only the 'sk' lines are changed, then the public keys -and signature values are unimpacted. diff --git a/crypto_sign/falcon-512-tree/m4-ct/api.h b/crypto_sign/falcon-512-tree/m4-ct/api.h deleted file mode 100644 index 81082b45..00000000 --- a/crypto_sign/falcon-512-tree/m4-ct/api.h +++ /dev/null @@ -1,17 +0,0 @@ -#include - -#define CRYPTO_SECRETKEYBYTES 57344 -#define CRYPTO_PUBLICKEYBYTES 897 -#define CRYPTO_BYTES 690 - -#define CRYPTO_ALGNAME "Falcon-512-tree" - -int crypto_sign_keypair(unsigned char *pk, unsigned char *sk); - -int crypto_sign(unsigned char *sm, size_t *smlen, - const unsigned char *m, size_t mlen, - const unsigned char *sk); - -int crypto_sign_open(unsigned char *m, size_t *mlen, - const unsigned char *sm, size_t smlen, - const unsigned char *pk); diff --git a/crypto_sign/falcon-512-tree/m4-ct/codec.c b/crypto_sign/falcon-512-tree/m4-ct/codec.c deleted file mode 100644 index 5bd61424..00000000 --- a/crypto_sign/falcon-512-tree/m4-ct/codec.c +++ /dev/null @@ -1,559 +0,0 @@ -/* - * Encoding/decoding of keys and signatures. 
- * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include "inner.h" - -/* see inner.h */ -size_t -Zf(modq_encode)( - void *out, size_t max_out_len, - const uint16_t *x, unsigned logn) -{ - size_t n, out_len, u; - uint8_t *buf; - uint32_t acc; - int acc_len; - - n = (size_t)1 << logn; - for (u = 0; u < n; u ++) { - if (x[u] >= 12289) { - return 0; - } - } - out_len = ((n * 14) + 7) >> 3; - if (out == NULL) { - return out_len; - } - if (out_len > max_out_len) { - return 0; - } - buf = out; - acc = 0; - acc_len = 0; - for (u = 0; u < n; u ++) { - acc = (acc << 14) | x[u]; - acc_len += 14; - while (acc_len >= 8) { - acc_len -= 8; - *buf ++ = (uint8_t)(acc >> acc_len); - } - } - if (acc_len > 0) { - *buf = (uint8_t)(acc << (8 - acc_len)); - } - return out_len; -} - -/* see inner.h */ -size_t -Zf(modq_decode)( - uint16_t *x, unsigned logn, - const void *in, size_t max_in_len) -{ - size_t n, in_len, u; - const uint8_t *buf; - uint32_t acc; - int acc_len; - - n = (size_t)1 << logn; - in_len = ((n * 14) + 7) >> 3; - if (in_len > max_in_len) { - return 0; - } - buf = in; - acc = 0; - acc_len = 0; - u = 0; - while (u < n) { - acc = (acc << 8) | (*buf ++); - acc_len += 8; - if (acc_len >= 14) { - unsigned w; - - acc_len -= 14; - w = (acc >> acc_len) & 0x3FFF; - if (w >= 12289) { - return 0; - } - x[u ++] = (uint16_t)w; - } - } - if ((acc & (((uint32_t)1 << acc_len) - 1)) != 0) { - return 0; - } - return in_len; -} - -/* see inner.h */ -size_t -Zf(trim_i16_encode)( - void *out, size_t max_out_len, - const int16_t *x, unsigned logn, unsigned bits) -{ - size_t n, u, out_len; - int minv, maxv; - uint8_t *buf; - uint32_t acc, mask; - unsigned acc_len; - - n = (size_t)1 << logn; - maxv = (1 << (bits - 1)) - 1; - minv = -maxv; - for (u = 0; u < n; u ++) { - if (x[u] < minv || x[u] > maxv) { - return 0; - } - } - out_len = ((n * bits) + 7) >> 3; - if (out == NULL) { - return out_len; - } - if (out_len > max_out_len) { 
- return 0; - } - buf = out; - acc = 0; - acc_len = 0; - mask = ((uint32_t)1 << bits) - 1; - for (u = 0; u < n; u ++) { - acc = (acc << bits) | ((uint16_t)x[u] & mask); - acc_len += bits; - while (acc_len >= 8) { - acc_len -= 8; - *buf ++ = (uint8_t)(acc >> acc_len); - } - } - if (acc_len > 0) { - *buf ++ = (uint8_t)(acc << (8 - acc_len)); - } - return out_len; -} - -/* see inner.h */ -size_t -Zf(trim_i16_decode)( - int16_t *x, unsigned logn, unsigned bits, - const void *in, size_t max_in_len) -{ - size_t n, in_len; - const uint8_t *buf; - size_t u; - uint32_t acc, mask1, mask2; - unsigned acc_len; - - n = (size_t)1 << logn; - in_len = ((n * bits) + 7) >> 3; - if (in_len > max_in_len) { - return 0; - } - buf = in; - u = 0; - acc = 0; - acc_len = 0; - mask1 = ((uint32_t)1 << bits) - 1; - mask2 = (uint32_t)1 << (bits - 1); - while (u < n) { - acc = (acc << 8) | *buf ++; - acc_len += 8; - while (acc_len >= bits && u < n) { - uint32_t w; - - acc_len -= bits; - w = (acc >> acc_len) & mask1; - w |= -(w & mask2); - if (w == -mask2) { - /* - * The -2^(bits-1) value is forbidden. - */ - return 0; - } - w |= -(w & mask2); - x[u ++] = (int16_t)*(int32_t *)&w; - } - } - if ((acc & (((uint32_t)1 << acc_len) - 1)) != 0) { - /* - * Extra bits in the last byte must be zero. 
- */ - return 0; - } - return in_len; -} - -/* see inner.h */ -size_t -Zf(trim_i8_encode)( - void *out, size_t max_out_len, - const int8_t *x, unsigned logn, unsigned bits) -{ - size_t n, u, out_len; - int minv, maxv; - uint8_t *buf; - uint32_t acc, mask; - unsigned acc_len; - - n = (size_t)1 << logn; - maxv = (1 << (bits - 1)) - 1; - minv = -maxv; - for (u = 0; u < n; u ++) { - if (x[u] < minv || x[u] > maxv) { - return 0; - } - } - out_len = ((n * bits) + 7) >> 3; - if (out == NULL) { - return out_len; - } - if (out_len > max_out_len) { - return 0; - } - buf = out; - acc = 0; - acc_len = 0; - mask = ((uint32_t)1 << bits) - 1; - for (u = 0; u < n; u ++) { - acc = (acc << bits) | ((uint8_t)x[u] & mask); - acc_len += bits; - while (acc_len >= 8) { - acc_len -= 8; - *buf ++ = (uint8_t)(acc >> acc_len); - } - } - if (acc_len > 0) { - *buf ++ = (uint8_t)(acc << (8 - acc_len)); - } - return out_len; -} - -/* see inner.h */ -size_t -Zf(trim_i8_decode)( - int8_t *x, unsigned logn, unsigned bits, - const void *in, size_t max_in_len) -{ - size_t n, in_len; - const uint8_t *buf; - size_t u; - uint32_t acc, mask1, mask2; - unsigned acc_len; - - n = (size_t)1 << logn; - in_len = ((n * bits) + 7) >> 3; - if (in_len > max_in_len) { - return 0; - } - buf = in; - u = 0; - acc = 0; - acc_len = 0; - mask1 = ((uint32_t)1 << bits) - 1; - mask2 = (uint32_t)1 << (bits - 1); - while (u < n) { - acc = (acc << 8) | *buf ++; - acc_len += 8; - while (acc_len >= bits && u < n) { - uint32_t w; - - acc_len -= bits; - w = (acc >> acc_len) & mask1; - w |= -(w & mask2); - if (w == -mask2) { - /* - * The -2^(bits-1) value is forbidden. - */ - return 0; - } - x[u ++] = (int8_t)*(int32_t *)&w; - } - } - if ((acc & (((uint32_t)1 << acc_len) - 1)) != 0) { - /* - * Extra bits in the last byte must be zero. 
- */ - return 0; - } - return in_len; -} - -/* see inner.h */ -size_t -Zf(comp_encode)( - void *out, size_t max_out_len, - const int16_t *x, unsigned logn) -{ - uint8_t *buf; - size_t n, u, v; - uint32_t acc; - unsigned acc_len; - - n = (size_t)1 << logn; - buf = out; - - /* - * Make sure that all values are within the -2047..+2047 range. - */ - for (u = 0; u < n; u ++) { - if (x[u] < -2047 || x[u] > +2047) { - return 0; - } - } - - acc = 0; - acc_len = 0; - v = 0; - for (u = 0; u < n; u ++) { - int t; - unsigned w; - - /* - * Get sign and absolute value of next integer; push the - * sign bit. - */ - acc <<= 1; - t = x[u]; - if (t < 0) { - t = -t; - acc |= 1; - } - w = (unsigned)t; - - /* - * Push the low 7 bits of the absolute value. - */ - acc <<= 7; - acc |= w & 127u; - w >>= 7; - - /* - * We pushed exactly 8 bits. - */ - acc_len += 8; - - /* - * Push as many zeros as necessary, then a one. Since the - * absolute value is at most 2047, w can only range up to - * 15 at this point, thus we will add at most 16 bits - * here. With the 8 bits above and possibly up to 7 bits - * from previous iterations, we may go up to 31 bits, which - * will fit in the accumulator, which is an uint32_t. - */ - acc <<= (w + 1); - acc |= 1; - acc_len += w + 1; - - /* - * Produce all full bytes. - */ - while (acc_len >= 8) { - acc_len -= 8; - if (buf != NULL) { - if (v >= max_out_len) { - return 0; - } - buf[v] = (uint8_t)(acc >> acc_len); - } - v ++; - } - } - - /* - * Flush remaining bits (if any). 
- */ - if (acc_len > 0) { - if (buf != NULL) { - if (v >= max_out_len) { - return 0; - } - buf[v] = (uint8_t)(acc << (8 - acc_len)); - } - v ++; - } - - return v; -} - -/* see inner.h */ -size_t -Zf(comp_decode)( - int16_t *x, unsigned logn, - const void *in, size_t max_in_len) -{ - const uint8_t *buf; - size_t n, u, v; - uint32_t acc; - unsigned acc_len; - - n = (size_t)1 << logn; - buf = in; - acc = 0; - acc_len = 0; - v = 0; - for (u = 0; u < n; u ++) { - unsigned b, s, m; - - /* - * Get next eight bits: sign and low seven bits of the - * absolute value. - */ - if (v >= max_in_len) { - return 0; - } - acc = (acc << 8) | (uint32_t)buf[v ++]; - b = acc >> acc_len; - s = b & 128; - m = b & 127; - - /* - * Get next bits until a 1 is reached. - */ - for (;;) { - if (acc_len == 0) { - if (v >= max_in_len) { - return 0; - } - acc = (acc << 8) | (uint32_t)buf[v ++]; - acc_len = 8; - } - acc_len --; - if (((acc >> acc_len) & 1) != 0) { - break; - } - m += 128; - if (m > 2047) { - return 0; - } - } - x[u] = (int16_t)(s ? -(int)m : (int)m); - } - return v; -} - -/* - * Key elements and signatures are polynomials with small integer - * coefficients. Here are some statistics gathered over many - * generated key pairs (10000 or more for each degree): - * - * log(n) n max(f,g) std(f,g) max(F,G) std(F,G) - * 1 2 129 56.31 143 60.02 - * 2 4 123 40.93 160 46.52 - * 3 8 97 28.97 159 38.01 - * 4 16 100 21.48 154 32.50 - * 5 32 71 15.41 151 29.36 - * 6 64 59 11.07 138 27.77 - * 7 128 39 7.91 144 27.00 - * 8 256 32 5.63 148 26.61 - * 9 512 22 4.00 137 26.46 - * 10 1024 15 2.84 146 26.41 - * - * We want a compact storage format for private key, and, as part of - * key generation, we are allowed to reject some keys which would - * otherwise be fine (this does not induce any noticeable vulnerability - * as long as we reject only a small proportion of possible keys). 
- * Hence, we enforce at key generation time maximum values for the - * elements of f, g, F and G, so that their encoding can be expressed - * in fixed-width values. Limits have been chosen so that generated - * keys are almost always within bounds, thus not impacting neither - * security or performance. - * - * IMPORTANT: the code assumes that all coefficients of f, g, F and G - * ultimately fit in the -127..+127 range. Thus, none of the elements - * of max_fg_bits[] and max_FG_bits[] shall be greater than 8. - */ - -const uint8_t Zf(max_fg_bits)[] = { - 0, /* unused */ - 8, - 8, - 8, - 8, - 8, - 7, - 7, - 6, - 6, - 5 -}; - -const uint8_t Zf(max_FG_bits)[] = { - 0, /* unused */ - 8, - 8, - 8, - 8, - 8, - 8, - 8, - 8, - 8, - 8 -}; - -/* - * When generating a new key pair, we can always reject keys which - * feature an abnormally large coefficient. This can also be done for - * signatures, albeit with some care: in case the signature process is - * used in a derandomized setup (explicitly seeded with the message and - * private key), we have to follow the specification faithfully, and the - * specification only enforces a limit on the L2 norm of the signature - * vector. The limit on the L2 norm implies that the absolute value of - * a coefficient of the signature cannot be more than the following: - * - * log(n) n max sig coeff (theoretical) - * 1 2 412 - * 2 4 583 - * 3 8 824 - * 4 16 1166 - * 5 32 1649 - * 6 64 2332 - * 7 128 3299 - * 8 256 4665 - * 9 512 6598 - * 10 1024 9331 - * - * However, the largest observed signature coefficients during our - * experiments was 1077 (in absolute value), hence we can assume that, - * with overwhelming probability, signature coefficients will fit - * in -2047..2047, i.e. 12 bits. 
- */ - -const uint8_t Zf(max_sig_bits)[] = { - 0, /* unused */ - 10, - 11, - 11, - 12, - 12, - 12, - 12, - 12, - 12, - 12 -}; diff --git a/crypto_sign/falcon-512-tree/m4-ct/common.c b/crypto_sign/falcon-512-tree/m4-ct/common.c deleted file mode 100644 index ef30028b..00000000 --- a/crypto_sign/falcon-512-tree/m4-ct/common.c +++ /dev/null @@ -1,297 +0,0 @@ -/* - * Support functions for signatures (hash-to-point, norm). - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include "inner.h" - -/* see inner.h */ -void -Zf(hash_to_point_vartime)( - inner_shake256_context *sc, - uint16_t *x, unsigned logn) -{ - /* - * This is the straightforward per-the-spec implementation. 
It - * is not constant-time, thus it might reveal information on the - * plaintext (at least, enough to check the plaintext against a - * list of potential plaintexts) in a scenario where the - * attacker does not have access to the signature value or to - * the public key, but knows the nonce (without knowledge of the - * nonce, the hashed output cannot be matched against potential - * plaintexts). - */ - size_t n; - - n = (size_t)1 << logn; - while (n > 0) { - uint8_t buf[2]; - uint32_t w; - - inner_shake256_extract(sc, (void *)buf, sizeof buf); - w = ((unsigned)buf[0] << 8) | (unsigned)buf[1]; - if (w < 61445) { - while (w >= 12289) { - w -= 12289; - } - *x ++ = (uint16_t)w; - n --; - } - } -} - -/* see inner.h */ -void -Zf(hash_to_point_ct)( - inner_shake256_context *sc, - uint16_t *x, unsigned logn, uint8_t *tmp) -{ - /* - * Each 16-bit sample is a value in 0..65535. The value is - * kept if it falls in 0..61444 (because 61445 = 5*12289) - * and rejected otherwise; thus, each sample has probability - * about 0.93758 of being selected. - * - * We want to oversample enough to be sure that we will - * have enough values with probability at least 1 - 2^(-256). - * Depending on degree N, this leads to the following - * required oversampling: - * - * logn n oversampling - * 1 2 65 - * 2 4 67 - * 3 8 71 - * 4 16 77 - * 5 32 86 - * 6 64 100 - * 7 128 122 - * 8 256 154 - * 9 512 205 - * 10 1024 287 - * - * If logn >= 7, then the provided temporary buffer is large - * enough. Otherwise, we use a stack buffer of 63 entries - * (i.e. 126 bytes) for the values that do not fit in tmp[]. - */ - - static const uint16_t overtab[] = { - 0, /* unused */ - 65, - 67, - 71, - 77, - 86, - 100, - 122, - 154, - 205, - 287 - }; - - unsigned n, n2, u, m, p, over; - uint16_t *tt1, tt2[63]; - - /* - * We first generate m 16-bit value. Values 0..n-1 go to x[]. - * Values n..2*n-1 go to tt1[]. Values 2*n and later go to tt2[]. 
- * We also reduce modulo q the values; rejected values are set - * to 0xFFFF. - */ - n = 1U << logn; - n2 = n << 1; - over = overtab[logn]; - m = n + over; - tt1 = (uint16_t *)tmp; - for (u = 0; u < m; u ++) { - uint8_t buf[2]; - uint32_t w, wr; - - inner_shake256_extract(sc, buf, sizeof buf); - w = ((uint32_t)buf[0] << 8) | (uint32_t)buf[1]; - wr = w - ((uint32_t)24578 & (((w - 24578) >> 31) - 1)); - wr = wr - ((uint32_t)24578 & (((wr - 24578) >> 31) - 1)); - wr = wr - ((uint32_t)12289 & (((wr - 12289) >> 31) - 1)); - wr |= ((w - 61445) >> 31) - 1; - if (u < n) { - x[u] = (uint16_t)wr; - } else if (u < n2) { - tt1[u - n] = (uint16_t)wr; - } else { - tt2[u - n2] = (uint16_t)wr; - } - } - - /* - * Now we must "squeeze out" the invalid values. We do this in - * a logarithmic sequence of passes; each pass computes where a - * value should go, and moves it down by 'p' slots if necessary, - * where 'p' uses an increasing powers-of-two scale. It can be - * shown that in all cases where the loop decides that a value - * has to be moved down by p slots, the destination slot is - * "free" (i.e. contains an invalid value). - */ - for (p = 1; p <= over; p <<= 1) { - unsigned v; - - /* - * In the loop below: - * - * - v contains the index of the final destination of - * the value; it is recomputed dynamically based on - * whether values are valid or not. - * - * - u is the index of the value we consider ("source"); - * its address is s. - * - * - The loop may swap the value with the one at index - * u-p. The address of the swap destination is d. - */ - v = 0; - for (u = 0; u < m; u ++) { - uint16_t *s, *d; - unsigned j, sv, dv, mk; - - if (u < n) { - s = &x[u]; - } else if (u < n2) { - s = &tt1[u - n]; - } else { - s = &tt2[u - n2]; - } - sv = *s; - - /* - * The value in sv should ultimately go to - * address v, i.e. jump back by u-v slots. - */ - j = u - v; - - /* - * We increment v for the next iteration, but - * only if the source value is valid. 
The mask - * 'mk' is -1 if the value is valid, 0 otherwise, - * so we _subtract_ mk. - */ - mk = (sv >> 15) - 1U; - v -= mk; - - /* - * In this loop we consider jumps by p slots; if - * u < p then there is nothing more to do. - */ - if (u < p) { - continue; - } - - /* - * Destination for the swap: value at address u-p. - */ - if ((u - p) < n) { - d = &x[u - p]; - } else if ((u - p) < n2) { - d = &tt1[(u - p) - n]; - } else { - d = &tt2[(u - p) - n2]; - } - dv = *d; - - /* - * The swap should be performed only if the source - * is valid AND the jump j has its 'p' bit set. - */ - mk &= -(((j & p) + 0x1FF) >> 9); - - *s = (uint16_t)(sv ^ (mk & (sv ^ dv))); - *d = (uint16_t)(dv ^ (mk & (sv ^ dv))); - } - } -} - -/* see inner.h */ -int -Zf(is_short)( - const int16_t *s1, const int16_t *s2, unsigned logn) -{ - /* - * We use the l2-norm. Code below uses only 32-bit operations to - * compute the square of the norm with saturation to 2^32-1 if - * the value exceeds 2^31-1. - */ - size_t n, u; - uint32_t s, ng; - - n = (size_t)1 << logn; - s = 0; - ng = 0; - for (u = 0; u < n; u ++) { - int32_t z; - - z = s1[u]; - s += (uint32_t)(z * z); - ng |= s; - z = s2[u]; - s += (uint32_t)(z * z); - ng |= s; - } - s |= -(ng >> 31); - - /* - * Acceptance bound on the l2-norm is: - * 1.2*1.55*sqrt(q)*sqrt(2*N) - * Value 7085 is floor((1.2^2)*(1.55^2)*2*1024). - */ - return s < (((uint32_t)7085 * (uint32_t)12289) >> (10 - logn)); -} - -/* see inner.h */ -int -Zf(is_short_half)( - uint32_t sqn, const int16_t *s2, unsigned logn) -{ - size_t n, u; - uint32_t ng; - - n = (size_t)1 << logn; - ng = -(sqn >> 31); - for (u = 0; u < n; u ++) { - int32_t z; - - z = s2[u]; - sqn += (uint32_t)(z * z); - ng |= sqn; - } - sqn |= -(ng >> 31); - - /* - * Acceptance bound on the l2-norm is: - * 1.2*1.55*sqrt(q)*sqrt(2*N) - * Value 7085 is floor((1.2^2)*(1.55^2)*2*1024). 
- */ - return sqn < (((uint32_t)7085 * (uint32_t)12289) >> (10 - logn)); -} diff --git a/crypto_sign/falcon-512-tree/m4-ct/config.h b/crypto_sign/falcon-512-tree/m4-ct/config.h deleted file mode 100644 index cd78727e..00000000 --- a/crypto_sign/falcon-512-tree/m4-ct/config.h +++ /dev/null @@ -1,205 +0,0 @@ -/* - * Manual configuration file for the Falcon implementation. Here can - * be set some compilation-time options. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#ifndef FALCON_CONFIG_H__ -#define FALCON_CONFIG_H__ - -/* - * Each option is a macro which should be defined to either 1 or 0. 
- * If any of the options below is left undefined, then a default value - * will be used by the code, possibly using compile-time autodetection - * from compiler-defined macros. - * - * Explicitly setting a parameter can be done by uncommenting/modifying - * its definition below, in this file, or equivalently by setting it as - * a compiler flag. - */ - -/* - * Use the native 'double' C type for floating-point computations. Exact - * reproducibility of all tests requires that type to faithfully follow - * IEEE-754 "round-to-nearest" rules. - * - * Native double support will use the CPU hardware and/or - * compiler-provided functions; the latter is typically NOT - * constant-time, while the former MAY be constant-time, or not. On - * recent x86 CPU in 64-bit mode, SSE2 opcodes are used and they provide - * constant-time operations for all the operations used in Falcon, - * except for some special cases of divisions and square roots, but it - * can be shown that theses cases imply only negligible leak of - * information that cannot be leveraged into a full attack. - * - * If neither FALCON_FPNATIVE nor FALCON_FPEMU is defined, then use of - * the native 'double' C type is the default behaviour unless - * FALCON_ASM_CORTEXM4 is defined to 1, in which case the emulated code - * will be used. - * -#define FALCON_FPNATIVE 1 - */ - -/* - * Use emulated floating-point implementation. - * - * Emulation uses only integer operations with uint32_t and uint64_t - * types. This is constant-time, provided that the underlying platform - * offers constant-time opcodes for the following operations: - * - * - Multiplication of two 32-bit unsigned integers into a 64-bit result. - * - Left-shift or right-shift of a 32-bit unsigned integer by a - * potentially secret shift count in the 0..31 range. - * - * Notably, the ARM Cortex M3 does not fulfill the first condition, - * while the Pentium IV does not fulfill the second. 
- * - * If neither FALCON_FPNATIVE nor FALCON_FPEMU is defined, then use of - * the native 'double' C type is the default behaviour unless - * FALCON_ASM_CORTEXM4 is defined to 1, in which case the emulated code - * will be used. - * -#define FALCON_FPEMU 1 - */ - -/* - * Enable use of assembly for ARM Cortex-M4 CPU. By default, such - * support will be used based on some autodection on the compiler - * version and target architecture. Define this variable to 1 to force - * use of the assembly code, or 0 to disable it regardless of the - * autodetection. - * - * When FALCON_ASM_CORTEXM4 is enabled (whether defined explicitly or - * autodetected), emulated floating-point code will be used, unless - * FALCON_FPNATIVE or FALCON_FPEMU is explicitly set to override the - * choice. Emulated code with ARM assembly is constant-time and provides - * better performance than emulated code with plain C. - * - * The assembly code for the M4 can also work on a Cortex-M3. If the - * compiler is instructed to target the M3 (e.g. '-mcpu=cortex-m3' with - * GCC) then FALCON_ASM_CORTEXM4 won't be autodetected, but it can be - * enabled explicitly. Take care, though, that the M3 multiplication - * opcode (multiplication of two 32-bit unsigned integers with a 64-bit - * result) is NOT constant-time. - * -#define FALCON_ASM_CORTEXM4 1 - */ - -#define FALCON_ASM_CORTEXM4 1 - -/* - * Enable use of AVX2 intrinsics. If enabled, then the code will compile - * only when targeting x86 with a compiler that supports AVX2 intrinsics - * (tested with GCC 7.4.0, Clang 6.0.0, and MSVC 2015, both in 32-bit - * and 64-bit modes), and run only on systems that offer the AVX2 - * opcodes. Some operations leverage AVX2 for better performance. - * -#define FALCON_AVX2 1 - */ - -/* - * Enable use of FMA intrinsics. This setting has any effect only if - * FALCON_AVX2 is also enabled. The FMA intrinsics are normally available - * on any x86 CPU that also has AVX2. 
Note that setting this option will - * slightly modify the values of expanded private keys, but will normally - * not change the values of non-expanded private keys, public keys or - * signatures, for a given keygen/sign seed (non-expanded private keys - * and signatures might theoretically change, but only with low probability, - * less than 2^(-40); produced signatures are still safe and interoperable). - * -#define FALCON_FMA 1 - */ - -/* - * Assert that the platform uses little-endian encoding. If enabled, - * then encoding and decoding of aligned multibyte values will be - * slightly faster (especially for hashing and random number - * generation). If not defined explicitly, then autodetection is - * applied. - * -#define FALCON_LE 1 - */ - -/* - * Assert that the platform tolerates accesses to unaligned multibyte - * values. If enabled, then some operations are slightly faster. Note - * that ARM Cortex M4 do _not_ fully tolerate unaligned accesses; for - * such systems, this option should not be enabled. If not defined - * explicitly, then autodetection is applied. - * -#define FALCON_UNALIGNED 1 - */ - -/* - * Use a PRNG based on ChaCha20 and seeded with SHAKE256, instead of - * SHAKE256 directly, for key pair generation purposes. This speeds up - * key pair generation, especially on platforms where SHAKE256 is - * comparatively slow: on the ARM Cortex M4, average key generation time - * is reduced by 19% with this setting; on a recent x86 Skylake, the - * reduction is smaller (less than 8%). - * - * However, this setting changes the private/public key pair obtained - * from a given seed, thus preventing reproducibility of the - * known-answer tests vectors. For compatibility with existing KAT - * vectors (e.g. in PQClean, pqm4 and NIST implementations), this - * setting is not enabled by default. - * -#define FALCON_KG_CHACHA20 1 - */ - -/* - * Use an explicit OS-provided source of randomness for seeding (for the - * Zf(get_seed)() function implementation). 
Three possible sources are - * defined: - * - * - getentropy() system call - * - /dev/urandom special file - * - CryptGenRandom() function call - * - * More than one source may be enabled, in which case they will be tried - * in the order above, until a success is reached. - * - * By default, sources are enabled at compile-time based on these - * conditions: - * - * - getentropy(): target is one of: Linux with Glibc-2.25+, FreeBSD 12+, - * or OpenBSD. - * - /dev/urandom: target is a Unix-like system (including Linux, - * FreeBSD, NetBSD, OpenBSD, DragonFly, macOS, Android, Solaris, AIX). - * - CryptGenRandom(): target is Windows (Win32 or Win64). - * - * On most small embedded systems, none will be enabled and Zf(get_seed)() - * will always return 0. Applications will need to provide their own seeds. - * -#define FALCON_RAND_GETENTROPY 1 -#define FALCON_RAND_URANDOM 1 -#define FALCON_RAND_WIN32 1 - */ - -#endif diff --git a/crypto_sign/falcon-512-tree/m4-ct/fft.c b/crypto_sign/falcon-512-tree/m4-ct/fft.c deleted file mode 100644 index b1904b24..00000000 --- a/crypto_sign/falcon-512-tree/m4-ct/fft.c +++ /dev/null @@ -1,1412 +0,0 @@ -/* - * FFT code. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include "inner.h" - -/* - * Rules for complex number macros: - * -------------------------------- - * - * Operand order is: destination, source1, source2... - * - * Each operand is a real and an imaginary part. - * - * All overlaps are allowed. - */ - -/* - * Addition of two complex numbers (d = a + b). - */ -#define FPC_ADD(d_re, d_im, a_re, a_im, b_re, b_im) do { \ - fpr fpct_re, fpct_im; \ - fpct_re = fpr_add(a_re, b_re); \ - fpct_im = fpr_add(a_im, b_im); \ - (d_re) = fpct_re; \ - (d_im) = fpct_im; \ - } while (0) - -/* - * Subtraction of two complex numbers (d = a - b). - */ -#define FPC_SUB(d_re, d_im, a_re, a_im, b_re, b_im) do { \ - fpr fpct_re, fpct_im; \ - fpct_re = fpr_sub(a_re, b_re); \ - fpct_im = fpr_sub(a_im, b_im); \ - (d_re) = fpct_re; \ - (d_im) = fpct_im; \ - } while (0) - -/* - * Multplication of two complex numbers (d = a * b). 
- */ -#define FPC_MUL(d_re, d_im, a_re, a_im, b_re, b_im) do { \ - fpr fpct_a_re, fpct_a_im; \ - fpr fpct_b_re, fpct_b_im; \ - fpr fpct_d_re, fpct_d_im; \ - fpct_a_re = (a_re); \ - fpct_a_im = (a_im); \ - fpct_b_re = (b_re); \ - fpct_b_im = (b_im); \ - fpct_d_re = fpr_sub( \ - fpr_mul(fpct_a_re, fpct_b_re), \ - fpr_mul(fpct_a_im, fpct_b_im)); \ - fpct_d_im = fpr_add( \ - fpr_mul(fpct_a_re, fpct_b_im), \ - fpr_mul(fpct_a_im, fpct_b_re)); \ - (d_re) = fpct_d_re; \ - (d_im) = fpct_d_im; \ - } while (0) - -/* - * Squaring of a complex number (d = a * a). - */ -#define FPC_SQR(d_re, d_im, a_re, a_im) do { \ - fpr fpct_a_re, fpct_a_im; \ - fpr fpct_d_re, fpct_d_im; \ - fpct_a_re = (a_re); \ - fpct_a_im = (a_im); \ - fpct_d_re = fpr_sub(fpr_sqr(fpct_a_re), fpr_sqr(fpct_a_im)); \ - fpct_d_im = fpr_double(fpr_mul(fpct_a_re, fpct_a_im)); \ - (d_re) = fpct_d_re; \ - (d_im) = fpct_d_im; \ - } while (0) - -/* - * Inversion of a complex number (d = 1 / a). - */ -#define FPC_INV(d_re, d_im, a_re, a_im) do { \ - fpr fpct_a_re, fpct_a_im; \ - fpr fpct_d_re, fpct_d_im; \ - fpr fpct_m; \ - fpct_a_re = (a_re); \ - fpct_a_im = (a_im); \ - fpct_m = fpr_add(fpr_sqr(fpct_a_re), fpr_sqr(fpct_a_im)); \ - fpct_m = fpr_inv(fpct_m); \ - fpct_d_re = fpr_mul(fpct_a_re, fpct_m); \ - fpct_d_im = fpr_mul(fpr_neg(fpct_a_im), fpct_m); \ - (d_re) = fpct_d_re; \ - (d_im) = fpct_d_im; \ - } while (0) - -/* - * Division of complex numbers (d = a / b). 
- */ -#define FPC_DIV(d_re, d_im, a_re, a_im, b_re, b_im) do { \ - fpr fpct_a_re, fpct_a_im; \ - fpr fpct_b_re, fpct_b_im; \ - fpr fpct_d_re, fpct_d_im; \ - fpr fpct_m; \ - fpct_a_re = (a_re); \ - fpct_a_im = (a_im); \ - fpct_b_re = (b_re); \ - fpct_b_im = (b_im); \ - fpct_m = fpr_add(fpr_sqr(fpct_b_re), fpr_sqr(fpct_b_im)); \ - fpct_m = fpr_inv(fpct_m); \ - fpct_b_re = fpr_mul(fpct_b_re, fpct_m); \ - fpct_b_im = fpr_mul(fpr_neg(fpct_b_im), fpct_m); \ - fpct_d_re = fpr_sub( \ - fpr_mul(fpct_a_re, fpct_b_re), \ - fpr_mul(fpct_a_im, fpct_b_im)); \ - fpct_d_im = fpr_add( \ - fpr_mul(fpct_a_re, fpct_b_im), \ - fpr_mul(fpct_a_im, fpct_b_re)); \ - (d_re) = fpct_d_re; \ - (d_im) = fpct_d_im; \ - } while (0) - -/* - * Let w = exp(i*pi/N); w is a primitive 2N-th root of 1. We define the - * values w_j = w^(2j+1) for all j from 0 to N-1: these are the roots - * of X^N+1 in the field of complex numbers. A crucial property is that - * w_{N-1-j} = conj(w_j) = 1/w_j for all j. - * - * FFT representation of a polynomial f (taken modulo X^N+1) is the - * set of values f(w_j). Since f is real, conj(f(w_j)) = f(conj(w_j)), - * thus f(w_{N-1-j}) = conj(f(w_j)). We thus store only half the values, - * for j = 0 to N/2-1; the other half can be recomputed easily when (if) - * needed. A consequence is that FFT representation has the same size - * as normal representation: N/2 complex numbers use N real numbers (each - * complex number is the combination of a real and an imaginary part). - * - * We use a specific ordering which makes computations easier. Let rev() - * be the bit-reversal function over log(N) bits. For j in 0..N/2-1, we - * store the real and imaginary parts of f(w_j) in slots: - * - * Re(f(w_j)) -> slot rev(j)/2 - * Im(f(w_j)) -> slot rev(j)/2+N/2 - * - * (Note that rev(j) is even for j < N/2.) 
- */ - -/* see inner.h */ -TARGET_AVX2 -void -Zf(FFT)(fpr *f, unsigned logn) -{ - /* - * FFT algorithm in bit-reversal order uses the following - * iterative algorithm: - * - * t = N - * for m = 1; m < N; m *= 2: - * ht = t/2 - * for i1 = 0; i1 < m; i1 ++: - * j1 = i1 * t - * s = GM[m + i1] - * for j = j1; j < (j1 + ht); j ++: - * x = f[j] - * y = s * f[j + ht] - * f[j] = x + y - * f[j + ht] = x - y - * t = ht - * - * GM[k] contains w^rev(k) for primitive root w = exp(i*pi/N). - * - * In the description above, f[] is supposed to contain complex - * numbers. In our in-memory representation, the real and - * imaginary parts of f[k] are in array slots k and k+N/2. - * - * We only keep the first half of the complex numbers. We can - * see that after the first iteration, the first and second halves - * of the array of complex numbers have separate lives, so we - * simply ignore the second part. - */ - - unsigned u; - size_t t, n, hn, m; - - /* - * First iteration: compute f[j] + i * f[j+N/2] for all j < N/2 - * (because GM[1] = w^rev(1) = w^(N/2) = i). - * In our chosen representation, this is a no-op: everything is - * already where it should be. - */ - - /* - * Subsequent iterations are truncated to use only the first - * half of values. 
- */ - n = (size_t)1 << logn; - hn = n >> 1; - t = hn; - for (u = 1, m = 2; u < logn; u ++, m <<= 1) { - size_t ht, hm, i1, j1; - - ht = t >> 1; - hm = m >> 1; - for (i1 = 0, j1 = 0; i1 < hm; i1 ++, j1 += t) { - size_t j, j2; - - j2 = j1 + ht; -#if FALCON_AVX2 // yyyAVX2+1 - if (ht >= 4) { - __m256d s_re, s_im; - - s_re = _mm256_set1_pd( - fpr_gm_tab[((m + i1) << 1) + 0].v); - s_im = _mm256_set1_pd( - fpr_gm_tab[((m + i1) << 1) + 1].v); - for (j = j1; j < j2; j += 4) { - __m256d x_re, x_im, y_re, y_im; - __m256d z_re, z_im; - - x_re = _mm256_loadu_pd(&f[j].v); - x_im = _mm256_loadu_pd(&f[j + hn].v); - z_re = _mm256_loadu_pd(&f[j+ht].v); - z_im = _mm256_loadu_pd(&f[j+ht + hn].v); - y_re = FMSUB(z_re, s_re, - _mm256_mul_pd(z_im, s_im)); - y_im = FMADD(z_re, s_im, - _mm256_mul_pd(z_im, s_re)); - _mm256_storeu_pd(&f[j].v, - _mm256_add_pd(x_re, y_re)); - _mm256_storeu_pd(&f[j + hn].v, - _mm256_add_pd(x_im, y_im)); - _mm256_storeu_pd(&f[j + ht].v, - _mm256_sub_pd(x_re, y_re)); - _mm256_storeu_pd(&f[j + ht + hn].v, - _mm256_sub_pd(x_im, y_im)); - } - } else { - fpr s_re, s_im; - - s_re = fpr_gm_tab[((m + i1) << 1) + 0]; - s_im = fpr_gm_tab[((m + i1) << 1) + 1]; - for (j = j1; j < j2; j ++) { - fpr x_re, x_im, y_re, y_im; - - x_re = f[j]; - x_im = f[j + hn]; - y_re = f[j + ht]; - y_im = f[j + ht + hn]; - FPC_MUL(y_re, y_im, - y_re, y_im, s_re, s_im); - FPC_ADD(f[j], f[j + hn], - x_re, x_im, y_re, y_im); - FPC_SUB(f[j + ht], f[j + ht + hn], - x_re, x_im, y_re, y_im); - } - } -#else // yyyAVX2+0 - fpr s_re, s_im; - - s_re = fpr_gm_tab[((m + i1) << 1) + 0]; - s_im = fpr_gm_tab[((m + i1) << 1) + 1]; - for (j = j1; j < j2; j ++) { - fpr x_re, x_im, y_re, y_im; - - x_re = f[j]; - x_im = f[j + hn]; - y_re = f[j + ht]; - y_im = f[j + ht + hn]; - FPC_MUL(y_re, y_im, y_re, y_im, s_re, s_im); - FPC_ADD(f[j], f[j + hn], - x_re, x_im, y_re, y_im); - FPC_SUB(f[j + ht], f[j + ht + hn], - x_re, x_im, y_re, y_im); - } -#endif // yyyAVX2- - } - t = ht; - } -} - -/* see inner.h */ 
-TARGET_AVX2 -void -Zf(iFFT)(fpr *f, unsigned logn) -{ - /* - * Inverse FFT algorithm in bit-reversal order uses the following - * iterative algorithm: - * - * t = 1 - * for m = N; m > 1; m /= 2: - * hm = m/2 - * dt = t*2 - * for i1 = 0; i1 < hm; i1 ++: - * j1 = i1 * dt - * s = iGM[hm + i1] - * for j = j1; j < (j1 + t); j ++: - * x = f[j] - * y = f[j + t] - * f[j] = x + y - * f[j + t] = s * (x - y) - * t = dt - * for i1 = 0; i1 < N; i1 ++: - * f[i1] = f[i1] / N - * - * iGM[k] contains (1/w)^rev(k) for primitive root w = exp(i*pi/N) - * (actually, iGM[k] = 1/GM[k] = conj(GM[k])). - * - * In the main loop (not counting the final division loop), in - * all iterations except the last, the first and second half of f[] - * (as an array of complex numbers) are separate. In our chosen - * representation, we do not keep the second half. - * - * The last iteration recombines the recomputed half with the - * implicit half, and should yield only real numbers since the - * target polynomial is real; moreover, s = i at that step. - * Thus, when considering x and y: - * y = conj(x) since the final f[j] must be real - * Therefore, f[j] is filled with 2*Re(x), and f[j + t] is - * filled with 2*Im(x). - * But we already have Re(x) and Im(x) in array slots j and j+t - * in our chosen representation. That last iteration is thus a - * simple doubling of the values in all the array. - * - * We make the last iteration a no-op by tweaking the final - * division into a division by N/2, not N. 
- */ - size_t u, n, hn, t, m; - - n = (size_t)1 << logn; - t = 1; - m = n; - hn = n >> 1; - for (u = logn; u > 1; u --) { - size_t hm, dt, i1, j1; - - hm = m >> 1; - dt = t << 1; - for (i1 = 0, j1 = 0; j1 < hn; i1 ++, j1 += dt) { - size_t j, j2; - - j2 = j1 + t; -#if FALCON_AVX2 // yyyAVX2+1 - if (t >= 4) { - __m256d s_re, s_im; - - s_re = _mm256_set1_pd( - fpr_gm_tab[((hm + i1) << 1) + 0].v); - s_im = _mm256_set1_pd( - fpr_gm_tab[((hm + i1) << 1) + 1].v); - for (j = j1; j < j2; j += 4) { - __m256d x_re, x_im, y_re, y_im; - __m256d z_re, z_im; - - x_re = _mm256_loadu_pd(&f[j].v); - x_im = _mm256_loadu_pd(&f[j + hn].v); - y_re = _mm256_loadu_pd(&f[j+t].v); - y_im = _mm256_loadu_pd(&f[j+t + hn].v); - _mm256_storeu_pd(&f[j].v, - _mm256_add_pd(x_re, y_re)); - _mm256_storeu_pd(&f[j + hn].v, - _mm256_add_pd(x_im, y_im)); - x_re = _mm256_sub_pd(y_re, x_re); - x_im = _mm256_sub_pd(x_im, y_im); - z_re = FMSUB(x_im, s_im, - _mm256_mul_pd(x_re, s_re)); - z_im = FMADD(x_re, s_im, - _mm256_mul_pd(x_im, s_re)); - _mm256_storeu_pd(&f[j+t].v, z_re); - _mm256_storeu_pd(&f[j+t + hn].v, z_im); - } - } else { - fpr s_re, s_im; - - s_re = fpr_gm_tab[((hm + i1) << 1)+0]; - s_im = fpr_neg(fpr_gm_tab[((hm + i1) << 1)+1]); - for (j = j1; j < j2; j ++) { - fpr x_re, x_im, y_re, y_im; - - x_re = f[j]; - x_im = f[j + hn]; - y_re = f[j + t]; - y_im = f[j + t + hn]; - FPC_ADD(f[j], f[j + hn], - x_re, x_im, y_re, y_im); - FPC_SUB(x_re, x_im, - x_re, x_im, y_re, y_im); - FPC_MUL(f[j + t], f[j + t + hn], - x_re, x_im, s_re, s_im); - } - } -#else // yyyAVX2+0 - fpr s_re, s_im; - - s_re = fpr_gm_tab[((hm + i1) << 1) + 0]; - s_im = fpr_neg(fpr_gm_tab[((hm + i1) << 1) + 1]); - for (j = j1; j < j2; j ++) { - fpr x_re, x_im, y_re, y_im; - - x_re = f[j]; - x_im = f[j + hn]; - y_re = f[j + t]; - y_im = f[j + t + hn]; - FPC_ADD(f[j], f[j + hn], - x_re, x_im, y_re, y_im); - FPC_SUB(x_re, x_im, x_re, x_im, y_re, y_im); - FPC_MUL(f[j + t], f[j + t + hn], - x_re, x_im, s_re, s_im); - } -#endif // yyyAVX2- - } 
- t = dt; - m = hm; - } - - /* - * Last iteration is a no-op, provided that we divide by N/2 - * instead of N. We need to make a special case for logn = 0. - */ - if (logn > 0) { - fpr ni; - - ni = fpr_p2_tab[logn]; - for (u = 0; u < n; u ++) { - f[u] = fpr_mul(f[u], ni); - } - } -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_add)( - fpr *restrict a, const fpr *restrict b, unsigned logn) -{ - size_t n, u; - - n = (size_t)1 << logn; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 4) { - for (u = 0; u < n; u += 4) { - _mm256_storeu_pd(&a[u].v, - _mm256_add_pd( - _mm256_loadu_pd(&a[u].v), - _mm256_loadu_pd(&b[u].v))); - } - } else { - for (u = 0; u < n; u ++) { - a[u] = fpr_add(a[u], b[u]); - } - } -#else // yyyAVX2+0 - for (u = 0; u < n; u ++) { - a[u] = fpr_add(a[u], b[u]); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_sub)( - fpr *restrict a, const fpr *restrict b, unsigned logn) -{ - size_t n, u; - - n = (size_t)1 << logn; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 4) { - for (u = 0; u < n; u += 4) { - _mm256_storeu_pd(&a[u].v, - _mm256_sub_pd( - _mm256_loadu_pd(&a[u].v), - _mm256_loadu_pd(&b[u].v))); - } - } else { - for (u = 0; u < n; u ++) { - a[u] = fpr_sub(a[u], b[u]); - } - } -#else // yyyAVX2+0 - for (u = 0; u < n; u ++) { - a[u] = fpr_sub(a[u], b[u]); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_neg)(fpr *a, unsigned logn) -{ - size_t n, u; - - n = (size_t)1 << logn; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 4) { - __m256d s; - - s = _mm256_set1_pd(-0.0); - for (u = 0; u < n; u += 4) { - _mm256_storeu_pd(&a[u].v, - _mm256_xor_pd(_mm256_loadu_pd(&a[u].v), s)); - } - } else { - for (u = 0; u < n; u ++) { - a[u] = fpr_neg(a[u]); - } - } -#else // yyyAVX2+0 - for (u = 0; u < n; u ++) { - a[u] = fpr_neg(a[u]); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_adj_fft)(fpr *a, unsigned logn) -{ - size_t n, u; - - n = (size_t)1 << logn; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { 
- __m256d s; - - s = _mm256_set1_pd(-0.0); - for (u = (n >> 1); u < n; u += 4) { - _mm256_storeu_pd(&a[u].v, - _mm256_xor_pd(_mm256_loadu_pd(&a[u].v), s)); - } - } else { - for (u = (n >> 1); u < n; u ++) { - a[u] = fpr_neg(a[u]); - } - } -#else // yyyAVX2+0 - for (u = (n >> 1); u < n; u ++) { - a[u] = fpr_neg(a[u]); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_mul_fft)( - fpr *restrict a, const fpr *restrict b, unsigned logn) -{ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - for (u = 0; u < hn; u += 4) { - __m256d a_re, a_im, b_re, b_im, c_re, c_im; - - a_re = _mm256_loadu_pd(&a[u].v); - a_im = _mm256_loadu_pd(&a[u + hn].v); - b_re = _mm256_loadu_pd(&b[u].v); - b_im = _mm256_loadu_pd(&b[u + hn].v); - c_re = FMSUB( - a_re, b_re, _mm256_mul_pd(a_im, b_im)); - c_im = FMADD( - a_re, b_im, _mm256_mul_pd(a_im, b_re)); - _mm256_storeu_pd(&a[u].v, c_re); - _mm256_storeu_pd(&a[u + hn].v, c_im); - } - } else { - for (u = 0; u < hn; u ++) { - fpr a_re, a_im, b_re, b_im; - - a_re = a[u]; - a_im = a[u + hn]; - b_re = b[u]; - b_im = b[u + hn]; - FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im); - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - fpr a_re, a_im, b_re, b_im; - - a_re = a[u]; - a_im = a[u + hn]; - b_re = b[u]; - b_im = b[u + hn]; - FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_muladj_fft)( - fpr *restrict a, const fpr *restrict b, unsigned logn) -{ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - for (u = 0; u < hn; u += 4) { - __m256d a_re, a_im, b_re, b_im, c_re, c_im; - - a_re = _mm256_loadu_pd(&a[u].v); - a_im = _mm256_loadu_pd(&a[u + hn].v); - b_re = _mm256_loadu_pd(&b[u].v); - b_im = _mm256_loadu_pd(&b[u + hn].v); - c_re = FMADD( - a_re, b_re, _mm256_mul_pd(a_im, b_im)); - c_im = FMSUB( - a_im, b_re, _mm256_mul_pd(a_re, 
b_im)); - _mm256_storeu_pd(&a[u].v, c_re); - _mm256_storeu_pd(&a[u + hn].v, c_im); - } - } else { - for (u = 0; u < hn; u ++) { - fpr a_re, a_im, b_re, b_im; - - a_re = a[u]; - a_im = a[u + hn]; - b_re = b[u]; - b_im = fpr_neg(b[u + hn]); - FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im); - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - fpr a_re, a_im, b_re, b_im; - - a_re = a[u]; - a_im = a[u + hn]; - b_re = b[u]; - b_im = fpr_neg(b[u + hn]); - FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_mulselfadj_fft)(fpr *a, unsigned logn) -{ - /* - * Since each coefficient is multiplied with its own conjugate, - * the result contains only real values. - */ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - __m256d zero; - - zero = _mm256_setzero_pd(); - for (u = 0; u < hn; u += 4) { - __m256d a_re, a_im; - - a_re = _mm256_loadu_pd(&a[u].v); - a_im = _mm256_loadu_pd(&a[u + hn].v); - _mm256_storeu_pd(&a[u].v, - FMADD(a_re, a_re, - _mm256_mul_pd(a_im, a_im))); - _mm256_storeu_pd(&a[u + hn].v, zero); - } - } else { - for (u = 0; u < hn; u ++) { - fpr a_re, a_im; - - a_re = a[u]; - a_im = a[u + hn]; - a[u] = fpr_add(fpr_sqr(a_re), fpr_sqr(a_im)); - a[u + hn] = fpr_zero; - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - fpr a_re, a_im; - - a_re = a[u]; - a_im = a[u + hn]; - a[u] = fpr_add(fpr_sqr(a_re), fpr_sqr(a_im)); - a[u + hn] = fpr_zero; - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_mulconst)(fpr *a, fpr x, unsigned logn) -{ - size_t n, u; - - n = (size_t)1 << logn; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 4) { - __m256d x4; - - x4 = _mm256_set1_pd(x.v); - for (u = 0; u < n; u += 4) { - _mm256_storeu_pd(&a[u].v, - _mm256_mul_pd(x4, _mm256_loadu_pd(&a[u].v))); - } - } else { - for (u = 0; u < n; u ++) { - a[u] = fpr_mul(a[u], x); - } - } -#else // yyyAVX2+0 - for (u = 0; u < n; u ++) { - 
a[u] = fpr_mul(a[u], x); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_div_fft)( - fpr *restrict a, const fpr *restrict b, unsigned logn) -{ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - __m256d one; - - one = _mm256_set1_pd(1.0); - for (u = 0; u < hn; u += 4) { - __m256d a_re, a_im, b_re, b_im, c_re, c_im, t; - - a_re = _mm256_loadu_pd(&a[u].v); - a_im = _mm256_loadu_pd(&a[u + hn].v); - b_re = _mm256_loadu_pd(&b[u].v); - b_im = _mm256_loadu_pd(&b[u + hn].v); - t = _mm256_div_pd(one, - FMADD(b_re, b_re, - _mm256_mul_pd(b_im, b_im))); - b_re = _mm256_mul_pd(b_re, t); - b_im = _mm256_mul_pd(b_im, t); - c_re = FMADD( - a_re, b_re, _mm256_mul_pd(a_im, b_im)); - c_im = FMSUB( - a_im, b_re, _mm256_mul_pd(a_re, b_im)); - _mm256_storeu_pd(&a[u].v, c_re); - _mm256_storeu_pd(&a[u + hn].v, c_im); - } - } else { - for (u = 0; u < hn; u ++) { - fpr a_re, a_im, b_re, b_im; - - a_re = a[u]; - a_im = a[u + hn]; - b_re = b[u]; - b_im = b[u + hn]; - FPC_DIV(a[u], a[u + hn], a_re, a_im, b_re, b_im); - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - fpr a_re, a_im, b_re, b_im; - - a_re = a[u]; - a_im = a[u + hn]; - b_re = b[u]; - b_im = b[u + hn]; - FPC_DIV(a[u], a[u + hn], a_re, a_im, b_re, b_im); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_invnorm2_fft)(fpr *restrict d, - const fpr *restrict a, const fpr *restrict b, unsigned logn) -{ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - __m256d one; - - one = _mm256_set1_pd(1.0); - for (u = 0; u < hn; u += 4) { - __m256d a_re, a_im, b_re, b_im, dv; - - a_re = _mm256_loadu_pd(&a[u].v); - a_im = _mm256_loadu_pd(&a[u + hn].v); - b_re = _mm256_loadu_pd(&b[u].v); - b_im = _mm256_loadu_pd(&b[u + hn].v); - dv = _mm256_div_pd(one, - _mm256_add_pd( - FMADD(a_re, a_re, - _mm256_mul_pd(a_im, a_im)), - FMADD(b_re, b_re, - _mm256_mul_pd(b_im, b_im)))); - 
_mm256_storeu_pd(&d[u].v, dv); - } - } else { - for (u = 0; u < hn; u ++) { - fpr a_re, a_im; - fpr b_re, b_im; - - a_re = a[u]; - a_im = a[u + hn]; - b_re = b[u]; - b_im = b[u + hn]; - d[u] = fpr_inv(fpr_add( - fpr_add(fpr_sqr(a_re), fpr_sqr(a_im)), - fpr_add(fpr_sqr(b_re), fpr_sqr(b_im)))); - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - fpr a_re, a_im; - fpr b_re, b_im; - - a_re = a[u]; - a_im = a[u + hn]; - b_re = b[u]; - b_im = b[u + hn]; - d[u] = fpr_inv(fpr_add( - fpr_add(fpr_sqr(a_re), fpr_sqr(a_im)), - fpr_add(fpr_sqr(b_re), fpr_sqr(b_im)))); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_add_muladj_fft)(fpr *restrict d, - const fpr *restrict F, const fpr *restrict G, - const fpr *restrict f, const fpr *restrict g, unsigned logn) -{ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - for (u = 0; u < hn; u += 4) { - __m256d F_re, F_im, G_re, G_im; - __m256d f_re, f_im, g_re, g_im; - __m256d a_re, a_im, b_re, b_im; - - F_re = _mm256_loadu_pd(&F[u].v); - F_im = _mm256_loadu_pd(&F[u + hn].v); - G_re = _mm256_loadu_pd(&G[u].v); - G_im = _mm256_loadu_pd(&G[u + hn].v); - f_re = _mm256_loadu_pd(&f[u].v); - f_im = _mm256_loadu_pd(&f[u + hn].v); - g_re = _mm256_loadu_pd(&g[u].v); - g_im = _mm256_loadu_pd(&g[u + hn].v); - - a_re = FMADD(F_re, f_re, - _mm256_mul_pd(F_im, f_im)); - a_im = FMSUB(F_im, f_re, - _mm256_mul_pd(F_re, f_im)); - b_re = FMADD(G_re, g_re, - _mm256_mul_pd(G_im, g_im)); - b_im = FMSUB(G_im, g_re, - _mm256_mul_pd(G_re, g_im)); - _mm256_storeu_pd(&d[u].v, - _mm256_add_pd(a_re, b_re)); - _mm256_storeu_pd(&d[u + hn].v, - _mm256_add_pd(a_im, b_im)); - } - } else { - for (u = 0; u < hn; u ++) { - fpr F_re, F_im, G_re, G_im; - fpr f_re, f_im, g_re, g_im; - fpr a_re, a_im, b_re, b_im; - - F_re = F[u]; - F_im = F[u + hn]; - G_re = G[u]; - G_im = G[u + hn]; - f_re = f[u]; - f_im = f[u + hn]; - g_re = g[u]; - g_im = g[u + hn]; - - FPC_MUL(a_re, a_im, F_re, F_im, 
f_re, fpr_neg(f_im)); - FPC_MUL(b_re, b_im, G_re, G_im, g_re, fpr_neg(g_im)); - d[u] = fpr_add(a_re, b_re); - d[u + hn] = fpr_add(a_im, b_im); - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - fpr F_re, F_im, G_re, G_im; - fpr f_re, f_im, g_re, g_im; - fpr a_re, a_im, b_re, b_im; - - F_re = F[u]; - F_im = F[u + hn]; - G_re = G[u]; - G_im = G[u + hn]; - f_re = f[u]; - f_im = f[u + hn]; - g_re = g[u]; - g_im = g[u + hn]; - - FPC_MUL(a_re, a_im, F_re, F_im, f_re, fpr_neg(f_im)); - FPC_MUL(b_re, b_im, G_re, G_im, g_re, fpr_neg(g_im)); - d[u] = fpr_add(a_re, b_re); - d[u + hn] = fpr_add(a_im, b_im); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_mul_autoadj_fft)( - fpr *restrict a, const fpr *restrict b, unsigned logn) -{ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - for (u = 0; u < hn; u += 4) { - __m256d a_re, a_im, bv; - - a_re = _mm256_loadu_pd(&a[u].v); - a_im = _mm256_loadu_pd(&a[u + hn].v); - bv = _mm256_loadu_pd(&b[u].v); - _mm256_storeu_pd(&a[u].v, - _mm256_mul_pd(a_re, bv)); - _mm256_storeu_pd(&a[u + hn].v, - _mm256_mul_pd(a_im, bv)); - } - } else { - for (u = 0; u < hn; u ++) { - a[u] = fpr_mul(a[u], b[u]); - a[u + hn] = fpr_mul(a[u + hn], b[u]); - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - a[u] = fpr_mul(a[u], b[u]); - a[u + hn] = fpr_mul(a[u + hn], b[u]); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_div_autoadj_fft)( - fpr *restrict a, const fpr *restrict b, unsigned logn) -{ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - __m256d one; - - one = _mm256_set1_pd(1.0); - for (u = 0; u < hn; u += 4) { - __m256d ib, a_re, a_im; - - ib = _mm256_div_pd(one, _mm256_loadu_pd(&b[u].v)); - a_re = _mm256_loadu_pd(&a[u].v); - a_im = _mm256_loadu_pd(&a[u + hn].v); - _mm256_storeu_pd(&a[u].v, _mm256_mul_pd(a_re, ib)); - _mm256_storeu_pd(&a[u + hn].v, 
_mm256_mul_pd(a_im, ib)); - } - } else { - for (u = 0; u < hn; u ++) { - fpr ib; - - ib = fpr_inv(b[u]); - a[u] = fpr_mul(a[u], ib); - a[u + hn] = fpr_mul(a[u + hn], ib); - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - fpr ib; - - ib = fpr_inv(b[u]); - a[u] = fpr_mul(a[u], ib); - a[u + hn] = fpr_mul(a[u + hn], ib); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_LDL_fft)( - const fpr *restrict g00, - fpr *restrict g01, fpr *restrict g11, unsigned logn) -{ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - __m256d one; - - one = _mm256_set1_pd(1.0); - for (u = 0; u < hn; u += 4) { - __m256d g00_re, g00_im, g01_re, g01_im, g11_re, g11_im; - __m256d t, mu_re, mu_im, xi_re, xi_im; - - g00_re = _mm256_loadu_pd(&g00[u].v); - g00_im = _mm256_loadu_pd(&g00[u + hn].v); - g01_re = _mm256_loadu_pd(&g01[u].v); - g01_im = _mm256_loadu_pd(&g01[u + hn].v); - g11_re = _mm256_loadu_pd(&g11[u].v); - g11_im = _mm256_loadu_pd(&g11[u + hn].v); - - t = _mm256_div_pd(one, - FMADD(g00_re, g00_re, - _mm256_mul_pd(g00_im, g00_im))); - g00_re = _mm256_mul_pd(g00_re, t); - g00_im = _mm256_mul_pd(g00_im, t); - mu_re = FMADD(g01_re, g00_re, - _mm256_mul_pd(g01_im, g00_im)); - mu_im = FMSUB(g01_re, g00_im, - _mm256_mul_pd(g01_im, g00_re)); - xi_re = FMSUB(mu_re, g01_re, - _mm256_mul_pd(mu_im, g01_im)); - xi_im = FMADD(mu_im, g01_re, - _mm256_mul_pd(mu_re, g01_im)); - _mm256_storeu_pd(&g11[u].v, - _mm256_sub_pd(g11_re, xi_re)); - _mm256_storeu_pd(&g11[u + hn].v, - _mm256_add_pd(g11_im, xi_im)); - _mm256_storeu_pd(&g01[u].v, mu_re); - _mm256_storeu_pd(&g01[u + hn].v, mu_im); - } - } else { - for (u = 0; u < hn; u ++) { - fpr g00_re, g00_im, g01_re, g01_im, g11_re, g11_im; - fpr mu_re, mu_im; - - g00_re = g00[u]; - g00_im = g00[u + hn]; - g01_re = g01[u]; - g01_im = g01[u + hn]; - g11_re = g11[u]; - g11_im = g11[u + hn]; - FPC_DIV(mu_re, mu_im, g01_re, g01_im, g00_re, g00_im); - FPC_MUL(g01_re, g01_im, 
- mu_re, mu_im, g01_re, fpr_neg(g01_im)); - FPC_SUB(g11[u], g11[u + hn], - g11_re, g11_im, g01_re, g01_im); - g01[u] = mu_re; - g01[u + hn] = fpr_neg(mu_im); - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - fpr g00_re, g00_im, g01_re, g01_im, g11_re, g11_im; - fpr mu_re, mu_im; - - g00_re = g00[u]; - g00_im = g00[u + hn]; - g01_re = g01[u]; - g01_im = g01[u + hn]; - g11_re = g11[u]; - g11_im = g11[u + hn]; - FPC_DIV(mu_re, mu_im, g01_re, g01_im, g00_re, g00_im); - FPC_MUL(g01_re, g01_im, mu_re, mu_im, g01_re, fpr_neg(g01_im)); - FPC_SUB(g11[u], g11[u + hn], g11_re, g11_im, g01_re, g01_im); - g01[u] = mu_re; - g01[u + hn] = fpr_neg(mu_im); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_LDLmv_fft)( - fpr *restrict d11, fpr *restrict l10, - const fpr *restrict g00, const fpr *restrict g01, - const fpr *restrict g11, unsigned logn) -{ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - __m256d one; - - one = _mm256_set1_pd(1.0); - for (u = 0; u < hn; u += 4) { - __m256d g00_re, g00_im, g01_re, g01_im, g11_re, g11_im; - __m256d t, mu_re, mu_im, xi_re, xi_im; - - g00_re = _mm256_loadu_pd(&g00[u].v); - g00_im = _mm256_loadu_pd(&g00[u + hn].v); - g01_re = _mm256_loadu_pd(&g01[u].v); - g01_im = _mm256_loadu_pd(&g01[u + hn].v); - g11_re = _mm256_loadu_pd(&g11[u].v); - g11_im = _mm256_loadu_pd(&g11[u + hn].v); - - t = _mm256_div_pd(one, - FMADD(g00_re, g00_re, - _mm256_mul_pd(g00_im, g00_im))); - g00_re = _mm256_mul_pd(g00_re, t); - g00_im = _mm256_mul_pd(g00_im, t); - mu_re = FMADD(g01_re, g00_re, - _mm256_mul_pd(g01_im, g00_im)); - mu_im = FMSUB(g01_re, g00_im, - _mm256_mul_pd(g01_im, g00_re)); - xi_re = FMSUB(mu_re, g01_re, - _mm256_mul_pd(mu_im, g01_im)); - xi_im = FMADD(mu_im, g01_re, - _mm256_mul_pd(mu_re, g01_im)); - _mm256_storeu_pd(&d11[u].v, - _mm256_sub_pd(g11_re, xi_re)); - _mm256_storeu_pd(&d11[u + hn].v, - _mm256_add_pd(g11_im, xi_im)); - _mm256_storeu_pd(&l10[u].v, 
mu_re); - _mm256_storeu_pd(&l10[u + hn].v, mu_im); - } - } else { - for (u = 0; u < hn; u ++) { - fpr g00_re, g00_im, g01_re, g01_im, g11_re, g11_im; - fpr mu_re, mu_im; - - g00_re = g00[u]; - g00_im = g00[u + hn]; - g01_re = g01[u]; - g01_im = g01[u + hn]; - g11_re = g11[u]; - g11_im = g11[u + hn]; - FPC_DIV(mu_re, mu_im, g01_re, g01_im, g00_re, g00_im); - FPC_MUL(g01_re, g01_im, - mu_re, mu_im, g01_re, fpr_neg(g01_im)); - FPC_SUB(d11[u], d11[u + hn], - g11_re, g11_im, g01_re, g01_im); - l10[u] = mu_re; - l10[u + hn] = fpr_neg(mu_im); - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - fpr g00_re, g00_im, g01_re, g01_im, g11_re, g11_im; - fpr mu_re, mu_im; - - g00_re = g00[u]; - g00_im = g00[u + hn]; - g01_re = g01[u]; - g01_im = g01[u + hn]; - g11_re = g11[u]; - g11_im = g11[u + hn]; - FPC_DIV(mu_re, mu_im, g01_re, g01_im, g00_re, g00_im); - FPC_MUL(g01_re, g01_im, mu_re, mu_im, g01_re, fpr_neg(g01_im)); - FPC_SUB(d11[u], d11[u + hn], g11_re, g11_im, g01_re, g01_im); - l10[u] = mu_re; - l10[u + hn] = fpr_neg(mu_im); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_split_fft)( - fpr *restrict f0, fpr *restrict f1, - const fpr *restrict f, unsigned logn) -{ - /* - * The FFT representation we use is in bit-reversed order - * (element i contains f(w^(rev(i))), where rev() is the - * bit-reversal function over the ring degree. This changes - * indexes with regards to the Falcon specification. 
- */ - size_t n, hn, qn, u; - - n = (size_t)1 << logn; - hn = n >> 1; - qn = hn >> 1; - -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - __m256d half, sv; - - half = _mm256_set1_pd(0.5); - sv = _mm256_set_pd(-0.0, 0.0, -0.0, 0.0); - for (u = 0; u < qn; u += 2) { - __m256d ab_re, ab_im, ff0, ff1, ff2, ff3, gmt; - - ab_re = _mm256_loadu_pd(&f[(u << 1)].v); - ab_im = _mm256_loadu_pd(&f[(u << 1) + hn].v); - ff0 = _mm256_mul_pd(_mm256_hadd_pd(ab_re, ab_im), half); - ff0 = _mm256_permute4x64_pd(ff0, 0xD8); - _mm_storeu_pd(&f0[u].v, - _mm256_extractf128_pd(ff0, 0)); - _mm_storeu_pd(&f0[u + qn].v, - _mm256_extractf128_pd(ff0, 1)); - - ff1 = _mm256_mul_pd(_mm256_hsub_pd(ab_re, ab_im), half); - gmt = _mm256_loadu_pd(&fpr_gm_tab[(u + hn) << 1].v); - ff2 = _mm256_shuffle_pd(ff1, ff1, 0x5); - ff3 = _mm256_hadd_pd( - _mm256_mul_pd(ff1, gmt), - _mm256_xor_pd(_mm256_mul_pd(ff2, gmt), sv)); - ff3 = _mm256_permute4x64_pd(ff3, 0xD8); - _mm_storeu_pd(&f1[u].v, - _mm256_extractf128_pd(ff3, 0)); - _mm_storeu_pd(&f1[u + qn].v, - _mm256_extractf128_pd(ff3, 1)); - } - } else { - f0[0] = f[0]; - f1[0] = f[hn]; - - for (u = 0; u < qn; u ++) { - fpr a_re, a_im, b_re, b_im; - fpr t_re, t_im; - - a_re = f[(u << 1) + 0]; - a_im = f[(u << 1) + 0 + hn]; - b_re = f[(u << 1) + 1]; - b_im = f[(u << 1) + 1 + hn]; - - FPC_ADD(t_re, t_im, a_re, a_im, b_re, b_im); - f0[u] = fpr_half(t_re); - f0[u + qn] = fpr_half(t_im); - - FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im); - FPC_MUL(t_re, t_im, t_re, t_im, - fpr_gm_tab[((u + hn) << 1) + 0], - fpr_neg(fpr_gm_tab[((u + hn) << 1) + 1])); - f1[u] = fpr_half(t_re); - f1[u + qn] = fpr_half(t_im); - } - } -#else // yyyAVX2+0 - /* - * We process complex values by pairs. For logn = 1, there is only - * one complex value (the other one is the implicit conjugate), - * so we add the two lines below because the loop will be - * skipped. 
- */ - f0[0] = f[0]; - f1[0] = f[hn]; - - for (u = 0; u < qn; u ++) { - fpr a_re, a_im, b_re, b_im; - fpr t_re, t_im; - - a_re = f[(u << 1) + 0]; - a_im = f[(u << 1) + 0 + hn]; - b_re = f[(u << 1) + 1]; - b_im = f[(u << 1) + 1 + hn]; - - FPC_ADD(t_re, t_im, a_re, a_im, b_re, b_im); - f0[u] = fpr_half(t_re); - f0[u + qn] = fpr_half(t_im); - - FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im); - FPC_MUL(t_re, t_im, t_re, t_im, - fpr_gm_tab[((u + hn) << 1) + 0], - fpr_neg(fpr_gm_tab[((u + hn) << 1) + 1])); - f1[u] = fpr_half(t_re); - f1[u + qn] = fpr_half(t_im); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_merge_fft)( - fpr *restrict f, - const fpr *restrict f0, const fpr *restrict f1, unsigned logn) -{ - size_t n, hn, qn, u; - - n = (size_t)1 << logn; - hn = n >> 1; - qn = hn >> 1; - -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 16) { - for (u = 0; u < qn; u += 4) { - __m256d a_re, a_im, b_re, b_im, c_re, c_im; - __m256d gm1, gm2, g_re, g_im; - __m256d t_re, t_im, u_re, u_im; - __m256d tu1_re, tu2_re, tu1_im, tu2_im; - - a_re = _mm256_loadu_pd(&f0[u].v); - a_im = _mm256_loadu_pd(&f0[u + qn].v); - c_re = _mm256_loadu_pd(&f1[u].v); - c_im = _mm256_loadu_pd(&f1[u + qn].v); - - gm1 = _mm256_loadu_pd(&fpr_gm_tab[(u + hn) << 1].v); - gm2 = _mm256_loadu_pd(&fpr_gm_tab[(u + 2 + hn) << 1].v); - g_re = _mm256_unpacklo_pd(gm1, gm2); - g_im = _mm256_unpackhi_pd(gm1, gm2); - g_re = _mm256_permute4x64_pd(g_re, 0xD8); - g_im = _mm256_permute4x64_pd(g_im, 0xD8); - - b_re = FMSUB( - c_re, g_re, _mm256_mul_pd(c_im, g_im)); - b_im = FMADD( - c_re, g_im, _mm256_mul_pd(c_im, g_re)); - - t_re = _mm256_add_pd(a_re, b_re); - t_im = _mm256_add_pd(a_im, b_im); - u_re = _mm256_sub_pd(a_re, b_re); - u_im = _mm256_sub_pd(a_im, b_im); - - tu1_re = _mm256_unpacklo_pd(t_re, u_re); - tu2_re = _mm256_unpackhi_pd(t_re, u_re); - tu1_im = _mm256_unpacklo_pd(t_im, u_im); - tu2_im = _mm256_unpackhi_pd(t_im, u_im); - _mm256_storeu_pd(&f[(u << 1)].v, - _mm256_permute2f128_pd(tu1_re, 
tu2_re, 0x20)); - _mm256_storeu_pd(&f[(u << 1) + 4].v, - _mm256_permute2f128_pd(tu1_re, tu2_re, 0x31)); - _mm256_storeu_pd(&f[(u << 1) + hn].v, - _mm256_permute2f128_pd(tu1_im, tu2_im, 0x20)); - _mm256_storeu_pd(&f[(u << 1) + 4 + hn].v, - _mm256_permute2f128_pd(tu1_im, tu2_im, 0x31)); - } - } else { - f[0] = f0[0]; - f[hn] = f1[0]; - - for (u = 0; u < qn; u ++) { - fpr a_re, a_im, b_re, b_im; - fpr t_re, t_im; - - a_re = f0[u]; - a_im = f0[u + qn]; - FPC_MUL(b_re, b_im, f1[u], f1[u + qn], - fpr_gm_tab[((u + hn) << 1) + 0], - fpr_gm_tab[((u + hn) << 1) + 1]); - FPC_ADD(t_re, t_im, a_re, a_im, b_re, b_im); - f[(u << 1) + 0] = t_re; - f[(u << 1) + 0 + hn] = t_im; - FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im); - f[(u << 1) + 1] = t_re; - f[(u << 1) + 1 + hn] = t_im; - } - } -#else // yyyAVX2+0 - /* - * An extra copy to handle the special case logn = 1. - */ - f[0] = f0[0]; - f[hn] = f1[0]; - - for (u = 0; u < qn; u ++) { - fpr a_re, a_im, b_re, b_im; - fpr t_re, t_im; - - a_re = f0[u]; - a_im = f0[u + qn]; - FPC_MUL(b_re, b_im, f1[u], f1[u + qn], - fpr_gm_tab[((u + hn) << 1) + 0], - fpr_gm_tab[((u + hn) << 1) + 1]); - FPC_ADD(t_re, t_im, a_re, a_im, b_re, b_im); - f[(u << 1) + 0] = t_re; - f[(u << 1) + 0 + hn] = t_im; - FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im); - f[(u << 1) + 1] = t_re; - f[(u << 1) + 1 + hn] = t_im; - } -#endif // yyyAVX2- -} diff --git a/crypto_sign/falcon-512-tree/m4-ct/fpr.c b/crypto_sign/falcon-512-tree/m4-ct/fpr.c deleted file mode 100644 index eb23a44b..00000000 --- a/crypto_sign/falcon-512-tree/m4-ct/fpr.c +++ /dev/null @@ -1,3460 +0,0 @@ -/* - * Floating-point operations. - * - * This file implements the non-inline functions declared in - * fpr.h, as well as the constants for FFT / iFFT. 
- * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include "inner.h" - -#if FALCON_FPEMU // yyyFPEMU+1 - -/* - * Normalize a provided unsigned integer to the 2^63..2^64-1 range by - * left-shifting it if necessary. The exponent e is adjusted accordingly - * (i.e. if the value was left-shifted by n bits, then n is subtracted - * from e). If source m is 0, then it remains 0, but e is altered. - * Both m and e must be simple variables (no expressions allowed). 
- */ -#define FPR_NORM64(m, e) do { \ - uint32_t nt; \ - \ - (e) -= 63; \ - \ - nt = (uint32_t)((m) >> 32); \ - nt = (nt | -nt) >> 31; \ - (m) ^= ((m) ^ ((m) << 32)) & ((uint64_t)nt - 1); \ - (e) += (int)(nt << 5); \ - \ - nt = (uint32_t)((m) >> 48); \ - nt = (nt | -nt) >> 31; \ - (m) ^= ((m) ^ ((m) << 16)) & ((uint64_t)nt - 1); \ - (e) += (int)(nt << 4); \ - \ - nt = (uint32_t)((m) >> 56); \ - nt = (nt | -nt) >> 31; \ - (m) ^= ((m) ^ ((m) << 8)) & ((uint64_t)nt - 1); \ - (e) += (int)(nt << 3); \ - \ - nt = (uint32_t)((m) >> 60); \ - nt = (nt | -nt) >> 31; \ - (m) ^= ((m) ^ ((m) << 4)) & ((uint64_t)nt - 1); \ - (e) += (int)(nt << 2); \ - \ - nt = (uint32_t)((m) >> 62); \ - nt = (nt | -nt) >> 31; \ - (m) ^= ((m) ^ ((m) << 2)) & ((uint64_t)nt - 1); \ - (e) += (int)(nt << 1); \ - \ - nt = (uint32_t)((m) >> 63); \ - (m) ^= ((m) ^ ((m) << 1)) & ((uint64_t)nt - 1); \ - (e) += (int)(nt); \ - } while (0) - -#if FALCON_ASM_CORTEXM4 // yyyASM_CORTEXM4+1 - -__attribute__((naked)) -fpr -fpr_scaled(int64_t i __attribute__((unused)), int sc __attribute__((unused))) -{ - __asm__ ( - "push { r4, r5, r6, lr }\n\t" - "\n\t" - "@ Input i is in r0:r1, and sc in r2.\n\t" - "@ Extract the sign bit, and compute the absolute value.\n\t" - "@ -> sign bit in r3, with value 0 or -1\n\t" - "asrs r3, r1, #31\n\t" - "eors r0, r3\n\t" - "eors r1, r3\n\t" - "subs r0, r3\n\t" - "sbcs r1, r3\n\t" - "\n\t" - "@ Scale exponent to account for the encoding; if the source is\n\t" - "@ zero or if the scaled exponent is negative, it is set to 32.\n\t" - "addw r2, r2, #1022\n\t" - "orrs r4, r0, r1\n\t" - "bics r4, r4, r2, asr #31\n\t" - "rsbs r5, r4, #0\n\t" - "orrs r4, r5\n\t" - "ands r2, r2, r4, asr #31\n\t" - "adds r2, #32\n\t" - "\n\t" - "@ Normalize value to a full 64-bit width, by shifting it left.\n\t" - "@ The shift count is subtracted from the exponent (in r2).\n\t" - "@ If the mantissa is 0, the exponent is set to 0.\n\t" - "\n\t" - "@ If top word is 0, replace with low word; otherwise, add 32 
to\n\t" - "@ the exponent.\n\t" - "rsbs r4, r1, #0\n\t" - "orrs r4, r1\n\t" - "eors r5, r0, r1\n\t" - "bics r5, r5, r4, asr #31\n\t" - "eors r1, r5\n\t" - "ands r0, r0, r4, asr #31\n\t" - "lsrs r4, r4, #31\n\t" - "adds r2, r2, r4, lsl #5\n\t" - "\n\t" - "@ Count leading zeros of r1 to finish the shift.\n\t" - "clz r4, r1\n\t" - "subs r2, r4\n\t" - "rsbs r5, r4, #32\n\t" - "lsls r1, r4\n\t" - "lsrs r5, r0, r5\n\t" - "lsls r0, r4\n\t" - "orrs r1, r5\n\t" - "\n\t" - "@ Clear the top bit; we know it's a 1 (unless the whole mantissa\n\t" - "@ was zero, but then it's still OK to clear it)\n\t" - "bfc r1, #31, #1\n\t" - "\n\t" - "@ Now shift right the value by 11 bits; this puts the value in\n\t" - "@ the 2^52..2^53-1 range. We also keep a copy of the pre-shift\n\t" - "@ low bits in r5.\n\t" - "movs r5, r0\n\t" - "lsrs r0, #11\n\t" - "orrs r0, r0, r1, lsl #21\n\t" - "lsrs r1, #11\n\t" - "\n\t" - "@ Also plug the exponent at the right place. This must be done\n\t" - "@ now so that, in case the rounding creates a carry, that carry\n\t" - "@ adds to the exponent, which would be exactly what we want at\n\t" - "@ that point.\n\t" - "orrs r1, r1, r2, lsl #20\n\t" - "\n\t" - "@ Rounding: we must add 1 to the mantissa in the following cases:\n\t" - "@ - bits 11 to 9 of r5 are '011', '110' or '111'\n\t" - "@ - bits 11 to 9 of r5 are '010' and one of the\n\t" - "@ bits 0 to 8 is non-zero\n\t" - "ubfx r6, r5, #0, #9\n\t" - "addw r6, r6, #511\n\t" - "orrs r5, r6\n\t" - "\n\t" - "ubfx r5, r5, #9, #3\n\t" - "movs r6, #0xC8\n\t" - "lsrs r6, r5\n\t" - "ands r6, #1\n\t" - "adds r0, r6\n\t" - "adcs r1, #0\n\t" - "\n\t" - "@ Put back the sign.\n\t" - "orrs r1, r1, r3, lsl #31\n\t" - "\n\t" - "pop { r4, r5, r6, pc}\n\t" - ); -} - -#else // yyyASM_CORTEXM4+0 - -fpr -fpr_scaled(int64_t i, int sc) -{ - /* - * To convert from int to float, we have to do the following: - * 1. Get the absolute value of the input, and its sign - * 2. Shift right or left the value as appropriate - * 3. 
Pack the result - * - * We can assume that the source integer is not -2^63. - */ - int s, e; - uint32_t t; - uint64_t m; - - /* - * Extract sign bit. - * We have: -i = 1 + ~i - */ - s = (int)((uint64_t)i >> 63); - i ^= -(int64_t)s; - i += s; - - /* - * For now we suppose that i != 0. - * Otherwise, we set m to i and left-shift it as much as needed - * to get a 1 in the top bit. We can do that in a logarithmic - * number of conditional shifts. - */ - m = (uint64_t)i; - e = 9 + sc; - FPR_NORM64(m, e); - - /* - * Now m is in the 2^63..2^64-1 range. We must divide it by 512; - * if one of the dropped bits is a 1, this should go into the - * "sticky bit". - */ - m |= ((uint32_t)m & 0x1FF) + 0x1FF; - m >>= 9; - - /* - * Corrective action: if i = 0 then all of the above was - * incorrect, and we clamp e and m down to zero. - */ - t = (uint32_t)((uint64_t)(i | -i) >> 63); - m &= -(uint64_t)t; - e &= -(int)t; - - /* - * Assemble back everything. The FPR() function will handle cases - * where e is too low. - */ - return FPR(s, e, m); -} - -#endif // yyyASM_CORTEXM4- - -#if FALCON_ASM_CORTEXM4 // yyyASM_CORTEXM4+1 - -// yyyPQCLEAN+0 -#if 0 -/* Debug code -- To get a printout of registers from a specific point - in ARM Cortex M4 assembly code, uncomment this code and add a - "bl DEBUG" call where wished for. 
*/ - -void -print_regs(uint32_t *rr, uint32_t flags) -{ - int i; - extern int printf(const char *fmt, ...); - - printf("\nRegs:\n"); - for (i = 0; i < 7; i ++) { - int j; - - j = i + 7; - printf(" %2d = %08X %2d = %08X\n", i, rr[i], j, rr[j]); - } - printf(" flags = %08X ", flags); - if ((flags >> 31) & 1) { - printf("N"); - } - if ((flags >> 30) & 1) { - printf("Z"); - } - if ((flags >> 29) & 1) { - printf("C"); - } - if ((flags >> 28) & 1) { - printf("V"); - } - if ((flags >> 27) & 1) { - printf("Q"); - } - printf("\n"); -} - -__attribute__((naked)) -void -DEBUG(void) -{ - __asm__ ( - "push { r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, lr }\n\t" - "mov r0, sp\n\t" - "mrs r1, apsr\n\t" - "bl print_regs\n\t" - "pop { r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, pc }\n\t" - ); -} -#endif -// yyyPQCLEAN- - -__attribute__((naked)) -fpr -fpr_add(fpr x __attribute__((unused)), fpr y __attribute__((unused))) -{ - __asm__ ( - "push { r4, r5, r6, r7, r8, r10, r11, lr }\n\t" - "\n\t" - "@ Make sure that the first operand (x) has the larger absolute\n\t" - "@ value. 
This guarantees that the exponent of y is less than\n\t" - "@ or equal to the exponent of x, and, if they are equal, then\n\t" - "@ the mantissa of y will not be greater than the mantissa of x.\n\t" - "@ However, if absolute values are equal and the sign of x is 1,\n\t" - "@ then we want to also swap the values.\n\t" - "ubfx r4, r1, #0, #31 @ top word without sign bit\n\t" - "ubfx r5, r3, #0, #31 @ top word without sign bit\n\t" - "subs r7, r0, r2 @ difference in r7:r4\n\t" - "sbcs r4, r5\n\t" - "orrs r7, r4\n\t" - "rsbs r5, r7, #0\n\t" - "orrs r7, r5 @ bit 31 of r7 is 0 iff difference is zero\n\t" - "bics r6, r1, r7\n\t" - "orrs r6, r4 @ bit 31 of r6 is 1 iff the swap must be done\n\t" - "\n\t" - "@ Conditional swap\n\t" - "eors r4, r0, r2\n\t" - "eors r5, r1, r3\n\t" - "ands r4, r4, r6, asr #31\n\t" - "ands r5, r5, r6, asr #31\n\t" - "eors r0, r4\n\t" - "eors r1, r5\n\t" - "eors r2, r4\n\t" - "eors r3, r5\n\t" - "\n\t" - "@ Extract mantissa of x into r0:r1, exponent in r4, sign in r5\n\t" - "ubfx r4, r1, #20, #11 @ Exponent in r4 (without sign)\n\t" - "addw r5, r4, #2047 @ Get a carry to test r4 for zero\n\t" - "lsrs r5, #11 @ r5 is the mantissa implicit high bit\n\t" - "bfc r1, #20, #11 @ Clear exponent bits (not the sign)\n\t" - "orrs r1, r1, r5, lsl #20 @ Set mantissa high bit\n\t" - "asrs r5, r1, #31 @ Get sign bit (sign-extended)\n\t" - "bfc r1, #31, #1 @ Clear the sign bit\n\t" - "\n\t" - "@ Extract mantissa of y into r2:r3, exponent in r6, sign in r7\n\t" - "ubfx r6, r3, #20, #11 @ Exponent in r6 (without sign)\n\t" - "addw r7, r6, #2047 @ Get a carry to test r6 for zero\n\t" - "lsrs r7, #11 @ r7 is the mantissa implicit high bit\n\t" - "bfc r3, #20, #11 @ Clear exponent bits (not the sign)\n\t" - "orrs r3, r3, r7, lsl #20 @ Set mantissa high bit\n\t" - "asrs r7, r3, #31 @ Get sign bit (sign-extended)\n\t" - "bfc r3, #31, #1 @ Clear the sign bit\n\t" - "\n\t" - "@ Scale mantissas up by three bits.\n\t" - "lsls r1, #3\n\t" - "orrs r1, r1, r0, lsr #29\n\t" - 
"lsls r0, #3\n\t" - "lsls r3, #3\n\t" - "orrs r3, r3, r2, lsr #29\n\t" - "lsls r2, #3\n\t" - "\n\t" - "@ x: exponent=r4, sign=r5, mantissa=r0:r1 (scaled up 3 bits)\n\t" - "@ y: exponent=r6, sign=r7, mantissa=r2:r3 (scaled up 3 bits)\n\t" - "\n\t" - "@ At that point, the exponent of x (in r4) is larger than that\n\t" - "@ of y (in r6). The difference is the amount of shifting that\n\t" - "@ should be done on y. If that amount is larger than 59 then\n\t" - "@ we clamp y to 0. We won't need y's exponent beyond that point,\n\t" - "@ so we store that shift count in r6.\n\t" - "subs r6, r4, r6\n\t" - "subs r8, r6, #60\n\t" - "ands r2, r2, r8, asr #31\n\t" - "ands r3, r3, r8, asr #31\n\t" - "\n\t" - "@ Shift right r2:r3 by r6 bits. The shift count is in the 0..59\n\t" - "@ range. r11 will be non-zero if and only if some non-zero bits\n\t" - "@ were dropped.\n\t" - "subs r8, r6, #32\n\t" - "bics r11, r2, r8, asr #31\n\t" - "ands r2, r2, r8, asr #31\n\t" - "bics r10, r3, r8, asr #31\n\t" - "orrs r2, r2, r10\n\t" - "ands r3, r3, r8, asr #31\n\t" - "ands r6, r6, #31\n\t" - "rsbs r8, r6, #32\n\t" - "lsls r10, r2, r8\n\t" - "orrs r11, r11, r10\n\t" - "lsrs r2, r2, r6\n\t" - "lsls r10, r3, r8\n\t" - "orrs r2, r2, r10\n\t" - "lsrs r3, r3, r6\n\t" - "\n\t" - "@ If r11 is non-zero then some non-zero bit was dropped and the\n\t" - "@ low bit of r2 must be forced to 1 ('sticky bit').\n\t" - "rsbs r6, r11, #0\n\t" - "orrs r6, r6, r11\n\t" - "orrs r2, r2, r6, lsr #31\n\t" - "\n\t" - "@ x: exponent=r4, sign=r5, mantissa=r0:r1 (scaled up 3 bits)\n\t" - "@ y: sign=r7, value=r2:r3 (scaled to same exponent as x)\n\t" - "\n\t" - "@ If x and y don't have the same sign, then we should negate r2:r3\n\t" - "@ (i.e. subtract the mantissa instead of adding it). Signs of x\n\t" - "@ and y are in r5 and r7, as full-width words. 
We won't need r7\n\t" - "@ afterwards.\n\t" - "eors r7, r5 @ r7 = -1 if y must be negated, 0 otherwise\n\t" - "eors r2, r7\n\t" - "eors r3, r7\n\t" - "subs r2, r7\n\t" - "sbcs r3, r7\n\t" - "\n\t" - "@ r2:r3 has been shifted, we can add to r0:r1.\n\t" - "adds r0, r2\n\t" - "adcs r1, r3\n\t" - "\n\t" - "@ result: exponent=r4, sign=r5, mantissa=r0:r1 (scaled up 3 bits)\n\t" - "\n\t" - "@ Normalize the result with some left-shifting to full 64-bit\n\t" - "@ width. Shift count goes to r2, and exponent (r4) is adjusted.\n\t" - "clz r2, r0\n\t" - "clz r3, r1\n\t" - "sbfx r6, r3, #5, #1\n\t" - "ands r2, r6\n\t" - "adds r2, r2, r3\n\t" - "subs r4, r4, r2\n\t" - "\n\t" - "@ Shift r0:r1 to the left by r2 bits.\n\t" - "subs r7, r2, #32\n\t" - "lsls r7, r0, r7\n\t" - "lsls r1, r1, r2\n\t" - "rsbs r6, r2, #32\n\t" - "orrs r1, r1, r7\n\t" - "lsrs r6, r0, r6\n\t" - "orrs r1, r1, r6\n\t" - "lsls r0, r0, r2\n\t" - "\n\t" - "@ The exponent of x was in r4. The left-shift operation has\n\t" - "@ subtracted some value from it, 8 in case the result has the\n\t" - "@ same exponent as x. However, the high bit of the mantissa will\n\t" - "@ add 1 to the exponent, so we only add back 7 (the exponent is\n\t" - "@ added in because rounding might have produced a carry, which\n\t" - "@ should then spill into the exponent).\n\t" - "adds r4, #7\n\t" - "\n\t" - "@ If the mantissa new mantissa is non-zero, then its bit 63 is\n\t" - "@ non-zero (thanks to the normalizing shift). Otherwise, that bit\n\t" - "@ is zero, and we should then set the exponent to zero as well.\n\t" - "ands r4, r4, r1, asr #31\n\t" - "\n\t" - "@ Shrink back the value to a 52-bit mantissa. 
This requires\n\t" - "@ right-shifting by 11 bits; we keep a copy of the pre-shift\n\t" - "@ low word in r3.\n\t" - "movs r3, r0\n\t" - "lsrs r0, #11\n\t" - "orrs r0, r0, r1, lsl #21\n\t" - "lsrs r1, #11\n\t" - "\n\t" - "@ Apply rounding.\n\t" - "ubfx r6, r3, #0, #9\n\t" - "addw r6, r6, #511\n\t" - "orrs r3, r6\n\t" - "ubfx r3, r3, #9, #3\n\t" - "movs r6, #0xC8\n\t" - "lsrs r6, r3\n\t" - "ands r6, #1\n\t" - "adds r0, r6\n\t" - "adcs r1, #0\n\t" - "\n\t" - "@Plug in the exponent with an addition.\n\t" - "adds r1, r1, r4, lsl #20\n\t" - "\n\t" - "@ If the new exponent is negative or zero, then it underflowed\n\t" - "@ and we must clear the whole mantissa and exponent.\n\t" - "rsbs r4, r4, #0\n\t" - "ands r0, r0, r4, asr #31\n\t" - "ands r1, r1, r4, asr #31\n\t" - "\n\t" - "@ Put back the sign. This is the sign of x: thanks to the\n\t" - "@ conditional swap at the start, this is always correct.\n\t" - "bfi r1, r5, #31, #1\n\t" - "\n\t" - "pop { r4, r5, r6, r7, r8, r10, r11, pc }\n\t" - ); -} - -#else // yyyASM_CORTEXM4+0 - -fpr -fpr_add(fpr x, fpr y) -{ - uint64_t m, xu, yu, za; - uint32_t cs; - int ex, ey, sx, sy, cc; - - /* - * Make sure that the first operand (x) has the larger absolute - * value. This guarantees that the exponent of y is less than - * or equal to the exponent of x, and, if they are equal, then - * the mantissa of y will not be greater than the mantissa of x. - * - * After this swap, the result will have the sign x, except in - * the following edge case: abs(x) = abs(y), and x and y have - * opposite sign bits; in that case, the result shall be +0 - * even if the sign bit of x is 1. To handle this case properly, - * we do the swap is abs(x) = abs(y) AND the sign of x is 1. - */ - m = ((uint64_t)1 << 63) - 1; - za = (x & m) - (y & m); - cs = (uint32_t)(za >> 63) - | ((1U - (uint32_t)(-za >> 63)) & (uint32_t)(x >> 63)); - m = (x ^ y) & -(uint64_t)cs; - x ^= m; - y ^= m; - - /* - * Extract sign bits, exponents and mantissas. 
The mantissas are - * scaled up to 2^55..2^56-1, and the exponent is unbiased. If - * an operand is zero, its mantissa is set to 0 at this step, and - * its exponent will be -1078. - */ - ex = (int)(x >> 52); - sx = ex >> 11; - ex &= 0x7FF; - m = (uint64_t)(uint32_t)((ex + 0x7FF) >> 11) << 52; - xu = ((x & (((uint64_t)1 << 52) - 1)) | m) << 3; - ex -= 1078; - ey = (int)(y >> 52); - sy = ey >> 11; - ey &= 0x7FF; - m = (uint64_t)(uint32_t)((ey + 0x7FF) >> 11) << 52; - yu = ((y & (((uint64_t)1 << 52) - 1)) | m) << 3; - ey -= 1078; - - /* - * x has the larger exponent; hence, we only need to right-shift y. - * If the shift count is larger than 59 bits then we clamp the - * value to zero. - */ - cc = ex - ey; - yu &= -(uint64_t)((uint32_t)(cc - 60) >> 31); - cc &= 63; - - /* - * The lowest bit of yu is "sticky". - */ - m = fpr_ulsh(1, cc) - 1; - yu |= (yu & m) + m; - yu = fpr_ursh(yu, cc); - - /* - * If the operands have the same sign, then we add the mantissas; - * otherwise, we subtract the mantissas. - */ - xu += yu - ((yu << 1) & -(uint64_t)(sx ^ sy)); - - /* - * The result may be smaller, or slightly larger. We normalize - * it to the 2^63..2^64-1 range (if xu is zero, then it stays - * at zero). - */ - FPR_NORM64(xu, ex); - - /* - * Scale down the value to 2^54..s^55-1, handling the last bit - * as sticky. - */ - xu |= ((uint32_t)xu & 0x1FF) + 0x1FF; - xu >>= 9; - ex += 9; - - /* - * In general, the result has the sign of x. However, if the - * result is exactly zero, then the following situations may - * be encountered: - * x > 0, y = -x -> result should be +0 - * x < 0, y = -x -> result should be +0 - * x = +0, y = +0 -> result should be +0 - * x = -0, y = +0 -> result should be +0 - * x = +0, y = -0 -> result should be +0 - * x = -0, y = -0 -> result should be -0 - * - * But at the conditional swap step at the start of the - * function, we ensured that if abs(x) = abs(y) and the - * sign of x was 1, then x and y were swapped. 
Thus, the - * two following cases cannot actually happen: - * x < 0, y = -x - * x = -0, y = +0 - * In all other cases, the sign bit of x is conserved, which - * is what the FPR() function does. The FPR() function also - * properly clamps values to zero when the exponent is too - * low, but does not alter the sign in that case. - */ - return FPR(sx, ex, xu); -} - -#endif // yyyASM_CORTEXM4- - -#if FALCON_ASM_CORTEXM4 // yyyASM_CORTEXM4+1 - -__attribute__((naked)) -fpr -fpr_mul(fpr x __attribute__((unused)), fpr y __attribute__((unused))) -{ - __asm__ ( - "push { r4, r5, r6, r7, r8, r10, r11, lr }\n\t" - "\n\t" - "@ Extract mantissas: x.m = r4:r5, y.m = r6:r7\n\t" - "@ r4 and r6 contain only 25 bits each.\n\t" - "bics r4, r0, #0xFE000000\n\t" - "lsls r5, r1, #7\n\t" - "orrs r5, r5, r0, lsr #25\n\t" - "orrs r5, r5, #0x08000000\n\t" - "bics r5, r5, #0xF0000000\n\t" - "bics r6, r2, #0xFE000000\n\t" - "lsls r7, r3, #7\n\t" - "orrs r7, r7, r2, lsr #25\n\t" - "orrs r7, r7, #0x08000000\n\t" - "bics r7, r7, #0xF0000000\n\t" - "\n\t" - "@ Perform product. Values are in the 2^52..2^53-1 range, so\n\t" - "@ the product is at most 106-bit long. Of the low 50 bits,\n\t" - "@ we only want to know if they are all zeros or not. 
Here,\n\t" - "@ we get the top 56 bits in r10:r11, and r8 will be non-zero\n\t" - "@ if and only if at least one of the low 50 bits is non-zero.\n\t" - "umull r8, r10, r4, r6 @ x0*y0\n\t" - "lsls r10, #7\n\t" - "orrs r10, r10, r8, lsr #25\n\t" - "eors r11, r11\n\t" - "umlal r10, r11, r4, r7 @ x0*y1\n\t" - "umlal r10, r11, r5, r6 @ x1*y0\n\t" - "orrs r8, r8, r10, lsl #7\n\t" - "lsrs r10, #25\n\t" - "orrs r10, r10, r11, lsl #7\n\t" - "eors r11, r11\n\t" - "umlal r10, r11, r5, r7 @ x1*y1\n\t" - "\n\t" - "@ Now r0, r2, r4, r5, r6 and r7 are free.\n\t" - "@ If any of the low 50 bits was non-zero, then we force the\n\t" - "@ low bit of r10 to 1.\n\t" - "rsbs r4, r8, #0\n\t" - "orrs r8, r8, r4\n\t" - "orrs r10, r10, r8, lsr #31\n\t" - "\n\t" - "@ r8 is free.\n\t" - "@ r10:r11 contains the product in the 2^54..2^56-1 range. We\n\t" - "@ normalize it to 2^54..2^55-1 (into r6:r7) with a conditional\n\t" - "@ shift (low bit is sticky). r5 contains -1 if the shift was done,\n\t" - "@ 0 otherwise.\n\t" - "ands r6, r10, #1\n\t" - "lsrs r5, r11, #23\n\t" - "rsbs r5, r5, #0\n\t" - "orrs r6, r6, r10, lsr #1\n\t" - "orrs r6, r6, r11, lsl #31\n\t" - "lsrs r7, r11, #1\n\t" - "eors r10, r10, r6\n\t" - "eors r11, r11, r7\n\t" - "bics r10, r10, r5\n\t" - "bics r11, r11, r5\n\t" - "eors r6, r6, r10\n\t" - "eors r7, r7, r11\n\t" - "\n\t" - "@ Compute aggregate exponent: ex + ey - 1023 + w\n\t" - "@ (where w = 1 if the conditional shift was done, 0 otherwise)\n\t" - "@ But we subtract 1 because the injection of the mantissa high\n\t" - "@ bit will increment the exponent by 1.\n\t" - "lsls r0, r1, #1\n\t" - "lsls r2, r3, #1\n\t" - "lsrs r0, #21\n\t" - "addw r4, r0, #0x7FF @ save ex + 2047 in r4\n\t" - "lsrs r2, #21\n\t" - "addw r8, r2, #0x7FF @ save ey + 2047 in r8\n\t" - "adds r2, r0\n\t" - "subw r2, r2, #1024\n\t" - "subs r2, r5\n\t" - "\n\t" - "@ r5 is free.\n\t" - "@ Also, if either of the source exponents is 0, or the result\n\t" - "@ exponent is 0 or negative, then the result is zero 
and the\n\t" - "@ mantissa and the exponent shall be clamped to zero. Since\n\t" - "@ r2 contains the result exponent minus 1, we test on r2\n\t" - "@ being strictly negative.\n\t" - "ands r4, r8 @ if bit 11 = 0 then one of the exponents was 0\n\t" - "mvns r5, r2\n\t" - "ands r5, r5, r4, lsl #20\n\t" - "ands r2, r2, r5, asr #31\n\t" - "ands r6, r6, r5, asr #31\n\t" - "ands r7, r7, r5, asr #31\n\t" - "\n\t" - "@ Sign is the XOR of the sign of the operands. This is true in\n\t" - "@ all cases, including very small results (exponent underflow)\n\t" - "@ and zeros.\n\t" - "eors r1, r3\n\t" - "bfc r1, #0, #31\n\t" - "\n\t" - "@ Plug in the exponent.\n\t" - "bfi r1, r2, #20, #11\n\t" - "\n\t" - "@ r2 and r3 are free.\n\t" - "@ Shift back to the normal 53-bit mantissa, with rounding.\n\t" - "@ Mantissa goes into r0:r1. For r1, we must use an addition\n\t" - "@ because the rounding may have triggered a carry, that should\n\t" - "@ be added to the exponent.\n\t" - "movs r4, r6\n\t" - "lsrs r0, r6, #2\n\t" - "orrs r0, r0, r7, lsl #30\n\t" - "adds r1, r1, r7, lsr #2\n\t" - "ands r4, #0x7\n\t" - "movs r3, #0xC8\n\t" - "lsrs r3, r4\n\t" - "ands r3, #1\n\t" - "adds r0, r3\n\t" - "adcs r1, #0\n\t" - "\n\t" - "pop { r4, r5, r6, r7, r8, r10, r11, pc }\n\t" - ); -} - -#else // yyyASM_CORTEXM4+0 - -fpr -fpr_mul(fpr x, fpr y) -{ - uint64_t xu, yu, w, zu, zv; - uint32_t x0, x1, y0, y1, z0, z1, z2; - int ex, ey, d, e, s; - - /* - * Extract absolute values as scaled unsigned integers. We - * don't extract exponents yet. - */ - xu = (x & (((uint64_t)1 << 52) - 1)) | ((uint64_t)1 << 52); - yu = (y & (((uint64_t)1 << 52) - 1)) | ((uint64_t)1 << 52); - - /* - * We have two 53-bit integers to multiply; we need to split - * each into a lower half and a upper half. Moreover, we - * prefer to have lower halves to be of 25 bits each, for - * reasons explained later on. 
- */ - x0 = (uint32_t)xu & 0x01FFFFFF; - x1 = (uint32_t)(xu >> 25); - y0 = (uint32_t)yu & 0x01FFFFFF; - y1 = (uint32_t)(yu >> 25); - w = (uint64_t)x0 * (uint64_t)y0; - z0 = (uint32_t)w & 0x01FFFFFF; - z1 = (uint32_t)(w >> 25); - w = (uint64_t)x0 * (uint64_t)y1; - z1 += (uint32_t)w & 0x01FFFFFF; - z2 = (uint32_t)(w >> 25); - w = (uint64_t)x1 * (uint64_t)y0; - z1 += (uint32_t)w & 0x01FFFFFF; - z2 += (uint32_t)(w >> 25); - zu = (uint64_t)x1 * (uint64_t)y1; - z2 += (z1 >> 25); - z1 &= 0x01FFFFFF; - zu += z2; - - /* - * Since xu and yu are both in the 2^52..2^53-1 range, the - * product is in the 2^104..2^106-1 range. We first reassemble - * it and round it into the 2^54..2^56-1 range; the bottom bit - * is made "sticky". Since the low limbs z0 and z1 are 25 bits - * each, we just take the upper part (zu), and consider z0 and - * z1 only for purposes of stickiness. - * (This is the reason why we chose 25-bit limbs above.) - */ - zu |= ((z0 | z1) + 0x01FFFFFF) >> 25; - - /* - * We normalize zu to the 2^54..s^55-1 range: it could be one - * bit too large at this point. This is done with a conditional - * right-shift that takes into account the sticky bit. - */ - zv = (zu >> 1) | (zu & 1); - w = zu >> 55; - zu ^= (zu ^ zv) & -w; - - /* - * Get the aggregate scaling factor: - * - * - Each exponent is biased by 1023. - * - * - Integral mantissas are scaled by 2^52, hence an - * extra 52 bias for each exponent. - * - * - However, we right-shifted z by 50 bits, and then - * by 0 or 1 extra bit (depending on the value of w). - * - * In total, we must add the exponents, then subtract - * 2 * (1023 + 52), then add 50 + w. - */ - ex = (int)((x >> 52) & 0x7FF); - ey = (int)((y >> 52) & 0x7FF); - e = ex + ey - 2100 + (int)w; - - /* - * Sign bit is the XOR of the operand sign bits. - */ - s = (int)((x ^ y) >> 63); - - /* - * Corrective actions for zeros: if either of the operands is - * zero, then the computations above were wrong. Test for zero - * is whether ex or ey is zero. 
We just have to set the mantissa - * (zu) to zero, the FPR() function will normalize e. - */ - d = ((ex + 0x7FF) & (ey + 0x7FF)) >> 11; - zu &= -(uint64_t)d; - - /* - * FPR() packs the result and applies proper rounding. - */ - return FPR(s, e, zu); -} - -#endif // yyyASM_CORTEXM4- - -#if FALCON_ASM_CORTEXM4 // yyyASM_CORTEXM4+1 - -__attribute__((naked)) -fpr -fpr_div(fpr x __attribute__((unused)), fpr y __attribute__((unused))) -{ - __asm__ ( - "push { r4, r5, r6, r7, r8, r10, r11, lr }\n\t" - - "@ Extract mantissas of x and y, in r0:r4 and r2:r5, respectively.\n\t" - "@ We don't touch r1 and r3 as they contain the exponents and\n\t" - "@ signs, which we'll need later on.\n\t" - "ubfx r4, r1, #0, #20\n\t" - "ubfx r5, r3, #0, #20\n\t" - "orrs r4, r4, #0x00100000\n\t" - "orrs r5, r5, #0x00100000\n\t" - "\n\t" - "@ Perform bit-by-bit division. We want a 56-bit result in r8:r10\n\t" - "@ (low bit is 0). Bits come from the carry flag and are\n\t" - "@ injected with rrx, i.e. in position 31; we thus get bits in\n\t" - "@ the reverse order. 
Bits accumulate in r8; after the first 24\n\t" - "@ bits, we move the quotient bits to r10.\n\t" - "eors r8, r8\n\t" - "\n\t" - -#define DIVSTEP \ - "subs r6, r0, r2\n\t" \ - "sbcs r7, r4, r5\n\t" \ - "rrx r8, r8\n\t" \ - "ands r6, r2, r8, asr #31\n\t" \ - "ands r7, r5, r8, asr #31\n\t" \ - "subs r0, r6\n\t" \ - "sbcs r4, r7\n\t" \ - "adds r0, r0, r0\n\t" \ - "adcs r4, r4, r4\n\t" - -#define DIVSTEP4 DIVSTEP DIVSTEP DIVSTEP DIVSTEP -#define DIVSTEP8 DIVSTEP4 DIVSTEP4 - - DIVSTEP8 - DIVSTEP8 - DIVSTEP8 - - "\n\t" - "@ We have the first 24 bits of the quotient, move them to r10.\n\t" - "rbit r10, r8\n\t" - "\n\t" - - DIVSTEP8 - DIVSTEP8 - DIVSTEP8 - DIVSTEP4 DIVSTEP DIVSTEP DIVSTEP - -#undef DIVSTEP -#undef DIVSTEP4 -#undef DIVSTEP8 - - "\n\t" - "@ Lowest bit will be set if remainder is non-zero at this point\n\t" - "@ (this is the 'sticky' bit).\n\t" - "subs r0, #1\n\t" - "sbcs r4, #0\n\t" - "rrx r8, r8\n\t" - "\n\t" - "@ We now have the next (low) 32 bits of the quotient.\n\t" - "rbit r8, r8\n\t" - "\n\t" - "@ Since both operands had their top bit set, we know that the\n\t" - "@ result at this point is in 2^54..2^56-1. We scale it down\n\t" - "@ to 2^54..2^55-1 with a conditional shift. We also write the\n\t" - "@ result in r4:r5. 
If the shift is done, r6 will contain -1.\n\t" - "ands r4, r8, #1\n\t" - "lsrs r6, r10, #23\n\t" - "rsbs r6, r6, #0\n\t" - "orrs r4, r4, r8, lsr #1\n\t" - "orrs r4, r4, r10, lsl #31\n\t" - "lsrs r5, r10, #1\n\t" - "eors r8, r8, r4\n\t" - "eors r10, r10, r5\n\t" - "bics r8, r8, r6\n\t" - "bics r10, r10, r6\n\t" - "eors r4, r4, r8\n\t" - "eors r5, r5, r10\n\t" - "\n\t" - "@ Compute aggregate exponent: ex - ey + 1022 + w\n\t" - "@ (where w = 1 if the conditional shift was done, 0 otherwise)\n\t" - "@ But we subtract 1 because the injection of the mantissa high\n\t" - "@ bit will increment the exponent by 1.\n\t" - "lsls r0, r1, #1\n\t" - "lsls r2, r3, #1\n\t" - "lsrs r0, r0, #21\n\t" - "addw r7, r0, #0x7FF @ save ex + 2047 in r7\n\t" - "subs r0, r0, r2, lsr #21\n\t" - "addw r0, r0, #1021\n\t" - "subs r0, r6\n\t" - "\n\t" - "@ If the x operand was zero, then the computation was wrong and\n\t" - "@ the result is zero. Also, if the result exponent is zero or\n\t" - "@ negative, then the mantissa shall be clamped to zero. Since r0\n\t" - "@ contains the result exponent minus 1, we test on r0 being\n\t" - "@ strictly negative.\n\t" - "mvns r2, r0\n\t" - "ands r2, r2, r7, lsl #20\n\t" - "ands r0, r0, r2, asr #31\n\t" - "ands r4, r4, r2, asr #31\n\t" - "ands r5, r5, r2, asr #31\n\t" - "\n\t" - "@ Sign is the XOR of the sign of the operands. This is true in\n\t" - "@ all cases, including very small results (exponent underflow)\n\t" - "@ and zeros.\n\t" - "eors r1, r3\n\t" - "bfc r1, #0, #31\n\t" - "\n\t" - "@ Plug in the exponent.\n\t" - "bfi r1, r0, #20, #11\n\t" - "\n\t" - "@ Shift back to the normal 53-bit mantissa, with rounding.\n\t" - "@ Mantissa goes into r0:r1. 
For r1, we must use an addition\n\t" - "@ because the rounding may have triggered a carry, that should\n\t" - "@ be added to the exponent.\n\t" - "movs r6, r4\n\t" - "lsrs r0, r4, #2\n\t" - "orrs r0, r0, r5, lsl #30\n\t" - "adds r1, r1, r5, lsr #2\n\t" - "ands r6, #0x7\n\t" - "movs r3, #0xC8\n\t" - "lsrs r3, r6\n\t" - "ands r3, #1\n\t" - "adds r0, r3\n\t" - "adcs r1, #0\n\t" - "\n\t" - "pop { r4, r5, r6, r7, r8, r10, r11, pc }\n\t" - ); -} - -#else // yyyASM_CORTEXM4+0 - -fpr -fpr_div(fpr x, fpr y) -{ - uint64_t xu, yu, q, q2, w; - int i, ex, ey, e, d, s; - - /* - * Extract mantissas of x and y (unsigned). - */ - xu = (x & (((uint64_t)1 << 52) - 1)) | ((uint64_t)1 << 52); - yu = (y & (((uint64_t)1 << 52) - 1)) | ((uint64_t)1 << 52); - - /* - * Perform bit-by-bit division of xu by yu. We run it for 55 bits. - */ - q = 0; - for (i = 0; i < 55; i ++) { - /* - * If yu is less than or equal xu, then subtract it and - * push a 1 in the quotient; otherwise, leave xu unchanged - * and push a 0. - */ - uint64_t b; - - b = ((xu - yu) >> 63) - 1; - xu -= b & yu; - q |= b & 1; - xu <<= 1; - q <<= 1; - } - - /* - * We got 55 bits in the quotient, followed by an extra zero. We - * want that 56th bit to be "sticky": it should be a 1 if and - * only if the remainder (xu) is non-zero. - */ - q |= (xu | -xu) >> 63; - - /* - * Quotient is at most 2^56-1. Its top bit may be zero, but in - * that case the next-to-top bit will be a one, since the - * initial xu and yu were both in the 2^52..2^53-1 range. - * We perform a conditional shift to normalize q to the - * 2^54..2^55-1 range (with the bottom bit being sticky). - */ - q2 = (q >> 1) | (q & 1); - w = q >> 55; - q ^= (q ^ q2) & -w; - - /* - * Extract exponents to compute the scaling factor: - * - * - Each exponent is biased and we scaled them up by - * 52 bits; but these biases will cancel out. - * - * - The division loop produced a 55-bit shifted result, - * so we must scale it down by 55 bits. 
- * - * - If w = 1, we right-shifted the integer by 1 bit, - * hence we must add 1 to the scaling. - */ - ex = (int)((x >> 52) & 0x7FF); - ey = (int)((y >> 52) & 0x7FF); - e = ex - ey - 55 + (int)w; - - /* - * Sign is the XOR of the signs of the operands. - */ - s = (int)((x ^ y) >> 63); - - /* - * Corrective actions for zeros: if x = 0, then the computation - * is wrong, and we must clamp e and q to 0. We do not care - * about the case y = 0 (as per assumptions in this module, - * the caller does not perform divisions by zero). - */ - d = (ex + 0x7FF) >> 11; - s &= d; - e &= -d; - q &= -(uint64_t)d; - - /* - * FPR() packs the result and applies proper rounding. - */ - return FPR(s, e, q); -} - -#endif // yyyASM_CORTEXM4- - -#if FALCON_ASM_CORTEXM4 // yyyASM_CORTEXM4+1 - -__attribute__((naked)) -fpr -fpr_sqrt(fpr x __attribute__((unused))) -{ - __asm__ ( - "push { r4, r5, r6, r7, r8, r10, r11, lr }\n\t" - "\n\t" - "@ Extract mantissa (r0:r1) and exponent (r2). We assume that the\n\t" - "@ sign is positive. 
If the source is zero, then the mantissa is\n\t" - "@ set to 0.\n\t" - "lsrs r2, r1, #20\n\t" - "bfc r1, #20, #12\n\t" - "addw r3, r2, #0x7FF\n\t" - "subw r2, r2, #1023\n\t" - "lsrs r3, r3, #11\n\t" - "orrs r1, r1, r3, lsl #20\n\t" - "\n\t" - "@ If the exponent is odd, then multiply mantissa by 2 and subtract\n\t" - "@ 1 from the exponent.\n\t" - "ands r3, r2, #1\n\t" - "subs r2, r2, r3\n\t" - "rsbs r3, r3, #0\n\t" - "ands r4, r1, r3\n\t" - "ands r3, r0\n\t" - "adds r0, r3\n\t" - "adcs r1, r4\n\t" - "\n\t" - "@ Left-shift the mantissa by 9 bits to put it in the\n\t" - "@ 2^61..2^63-1 range (unless it is exactly 0).\n\t" - "lsls r1, r1, #9\n\t" - "orrs r1, r1, r0, lsr #23\n\t" - "lsls r0, r0, #9\n\t" - "\n\t" - "@ Compute the square root bit-by-bit.\n\t" - "@ There are 54 iterations; first 30 can work on top word only.\n\t" - "@ q = r3 (bit-reversed)\n\t" - "@ s = r5\n\t" - "eors r3, r3\n\t" - "eors r5, r5\n\t" - -#define SQRT_STEP_HI(bit) \ - "orrs r6, r5, #(1 << (" #bit "))\n\t" \ - "subs r7, r1, r6\n\t" \ - "rrx r3, r3\n\t" \ - "ands r6, r6, r3, asr #31\n\t" \ - "subs r1, r1, r6\n\t" \ - "lsrs r6, r3, #31\n\t" \ - "orrs r5, r5, r6, lsl #((" #bit ") + 1)\n\t" \ - "adds r0, r0\n\t" \ - "adcs r1, r1\n\t" - -#define SQRT_STEP_HIx5(b) \ - SQRT_STEP_HI((b)+4) \ - SQRT_STEP_HI((b)+3) \ - SQRT_STEP_HI((b)+2) \ - SQRT_STEP_HI((b)+1) \ - SQRT_STEP_HI(b) - - SQRT_STEP_HIx5(25) - SQRT_STEP_HIx5(20) - SQRT_STEP_HIx5(15) - SQRT_STEP_HIx5(10) - SQRT_STEP_HIx5(5) - SQRT_STEP_HIx5(0) - -#undef SQRT_STEP_HI -#undef SQRT_STEP_HIx5 - - "@ Top 30 bits of the result must be reversed: they were\n\t" - "@ accumulated with rrx (hence from the top bit).\n\t" - "rbit r3, r3\n\t" - "\n\t" - "@ For the next 24 iterations, we must use two-word operations.\n\t" - "@ bits of q now accumulate in r4\n\t" - "@ s is in r6:r5\n\t" - "eors r4, r4\n\t" - "eors r6, r6\n\t" - "\n\t" - "@ First iteration is special because the potential bit goes into\n\t" - "@ r5, not r6.\n\t" - "orrs r7, r6, #(1 << 
31)\n\t" - "subs r8, r0, r7\n\t" - "sbcs r10, r1, r5\n\t" - "rrx r4, r4\n\t" - "ands r7, r7, r4, asr #31\n\t" - "ands r8, r5, r4, asr #31\n\t" - "subs r0, r0, r7\n\t" - "sbcs r1, r1, r8\n\t" - "lsrs r7, r4, #31\n\t" - "orrs r5, r5, r4, lsr #31\n\t" - "adds r0, r0\n\t" - "adcs r1, r1\n\t" - -#define SQRT_STEP_LO(bit) \ - "orrs r7, r6, #(1 << (" #bit "))\n\t" \ - "subs r8, r0, r7\n\t" \ - "sbcs r10, r1, r5\n\t" \ - "rrx r4, r4\n\t" \ - "ands r7, r7, r4, asr #31\n\t" \ - "ands r8, r5, r4, asr #31\n\t" \ - "subs r0, r0, r7\n\t" \ - "sbcs r1, r1, r8\n\t" \ - "lsrs r7, r4, #31\n\t" \ - "orrs r6, r6, r7, lsl #((" #bit ") + 1)\n\t" \ - "adds r0, r0\n\t" \ - "adcs r1, r1\n\t" - -#define SQRT_STEP_LOx4(b) \ - SQRT_STEP_LO((b)+3) \ - SQRT_STEP_LO((b)+2) \ - SQRT_STEP_LO((b)+1) \ - SQRT_STEP_LO(b) - - SQRT_STEP_LO(30) - SQRT_STEP_LO(29) - SQRT_STEP_LO(28) - SQRT_STEP_LOx4(24) - SQRT_STEP_LOx4(20) - SQRT_STEP_LOx4(16) - SQRT_STEP_LOx4(12) - SQRT_STEP_LOx4(8) - -#undef SQRT_STEP_LO -#undef SQRT_STEP_LOx4 - - "@ Put low 24 bits in the right order.\n\t" - "rbit r4, r4\n\t" - "\n\t" - "@ We have a 54-bit result; compute the 55-th bit as the 'sticky'\n\t" - "@ bit: it is non-zero if and only if r0:r1 is non-zero. We put the\n\t" - "@ three low bits (including the sticky bit) in r5.\n\t" - "orrs r0, r1\n\t" - "rsbs r1, r0, #0\n\t" - "orrs r0, r1\n\t" - "lsls r5, r4, #1\n\t" - "orrs r5, r5, r0, lsr #31\n\t" - "ands r5, #0x7\n\t" - "\n\t" - "@ Compute the rounding: r6 is set to 0 or 1, and will be added\n\t" - "@ to the mantissa.\n\t" - "movs r6, #0xC8\n\t" - "lsrs r6, r5\n\t" - "ands r6, #1\n\t" - "\n\t" - "@ Put the mantissa (53 bits, in the 2^52..2^53-1 range) in r0:r1\n\t" - "@ (rounding not applied yet).\n\t" - "lsrs r0, r4, #1\n\t" - "orrs r0, r0, r3, lsl #23\n\t" - "lsrs r1, r3, #9\n\t" - "\n\t" - "@ Compute new exponent. This is half the old one (then reencoded\n\t" - "@ by adding 1023). Exception: if the mantissa is zero, then the\n\t" - "@ encoded exponent is set to 0. 
At that point, if the mantissa\n\t" - "@ is non-zero, then its high bit (bit 52, i.e. bit 20 of r1) is\n\t" - "@ non-zero. Note that the exponent cannot go out of range.\n\t" - "lsrs r2, r2, #1\n\t" - "addw r2, r2, #1023\n\t" - "lsrs r5, r1, #20\n\t" - "rsbs r5, r5, #0\n\t" - "ands r2, r5\n\t" - "\n\t" - "@ Place exponent. This overwrites the high bit of the mantissa.\n\t" - "bfi r1, r2, #20, #11\n\t" - "\n\t" - "@ Apply rounding. This may create a carry that will spill into\n\t" - "@ the exponent, which is exactly what should be done in that case\n\t" - "@ (i.e. increment the exponent).\n\t" - "adds r0, r0, r6\n\t" - "adcs r1, r1, #0\n\t" - "\n\t" - "pop { r4, r5, r6, r7, r8, r10, r11, pc }\n\t" - ); -} - -#else // yyyASM_CORTEXM4+0 - -fpr -fpr_sqrt(fpr x) -{ - uint64_t xu, q, s, r; - int ex, e; - - /* - * Extract the mantissa and the exponent. We don't care about - * the sign: by assumption, the operand is nonnegative. - * We want the "true" exponent corresponding to a mantissa - * in the 1..2 range. - */ - xu = (x & (((uint64_t)1 << 52) - 1)) | ((uint64_t)1 << 52); - ex = (int)((x >> 52) & 0x7FF); - e = ex - 1023; - - /* - * If the exponent is odd, double the mantissa and decrement - * the exponent. The exponent is then halved to account for - * the square root. - */ - xu += xu & -(uint64_t)(e & 1); - e >>= 1; - - /* - * Double the mantissa. - */ - xu <<= 1; - - /* - * We now have a mantissa in the 2^53..2^55-1 range. It - * represents a value between 1 (inclusive) and 4 (exclusive) - * in fixed point notation (with 53 fractional bits). We - * compute the square root bit by bit. - */ - q = 0; - s = 0; - r = (uint64_t)1 << 53; - for (int i = 0; i < 54; i ++) { - uint64_t t, b; - - t = s + r; - b = ((xu - t) >> 63) - 1; - s += (r << 1) & b; - xu -= t & b; - q += r & b; - xu <<= 1; - r >>= 1; - } - - /* - * Now, q is a rounded-low 54-bit value, with a leading 1, - * 52 fractional digits, and an additional guard bit. 
We add - * an extra sticky bit to account for what remains of the operand. - */ - q <<= 1; - q |= (xu | -xu) >> 63; - - /* - * Result q is in the 2^54..2^55-1 range; we bias the exponent - * by 54 bits (the value e at that point contains the "true" - * exponent, but q is now considered an integer, i.e. scaled - * up. - */ - e -= 54; - - /* - * Corrective action for an operand of value zero. - */ - q &= -(uint64_t)((ex + 0x7FF) >> 11); - - /* - * Apply rounding and back result. - */ - return FPR(0, e, q); -} - -#endif // yyyASM_CORTEXM4- - -uint64_t -fpr_expm_p63(fpr x, fpr ccs) -{ - /* - * Polynomial approximation of exp(-x) is taken from FACCT: - * https://eprint.iacr.org/2018/1234 - * Specifically, values are extracted from the implementation - * referenced from the FACCT article, and available at: - * https://github.com/raykzhao/gaussian - * Here, the coefficients have been scaled up by 2^63 and - * converted to integers. - * - * Tests over more than 24 billions of random inputs in the - * 0..log(2) range have never shown a deviation larger than - * 2^(-50) from the true mathematical value. - */ - static const uint64_t C[] = { - 0x00000004741183A3u, - 0x00000036548CFC06u, - 0x0000024FDCBF140Au, - 0x0000171D939DE045u, - 0x0000D00CF58F6F84u, - 0x000680681CF796E3u, - 0x002D82D8305B0FEAu, - 0x011111110E066FD0u, - 0x0555555555070F00u, - 0x155555555581FF00u, - 0x400000000002B400u, - 0x7FFFFFFFFFFF4800u, - 0x8000000000000000u - }; - - uint64_t z, y; - unsigned u; - uint32_t z0, z1, y0, y1; - uint64_t a, b; - - y = C[0]; - z = (uint64_t)fpr_trunc(fpr_mul(x, fpr_ptwo63)) << 1; - for (u = 1; u < (sizeof C) / sizeof(C[0]); u ++) { - /* - * Compute product z * y over 128 bits, but keep only - * the top 64 bits. 
- * - * TODO: On some architectures/compilers we could use - * some intrinsics (__umulh() on MSVC) or other compiler - * extensions (unsigned __int128 on GCC / Clang) for - * improved speed; however, most 64-bit architectures - * also have appropriate IEEE754 floating-point support, - * which is better. - */ - uint64_t c; - - z0 = (uint32_t)z; - z1 = (uint32_t)(z >> 32); - y0 = (uint32_t)y; - y1 = (uint32_t)(y >> 32); - a = ((uint64_t)z0 * (uint64_t)y1) - + (((uint64_t)z0 * (uint64_t)y0) >> 32); - b = ((uint64_t)z1 * (uint64_t)y0); - c = (a >> 32) + (b >> 32); - c += (((uint64_t)(uint32_t)a + (uint64_t)(uint32_t)b) >> 32); - c += (uint64_t)z1 * (uint64_t)y1; - y = C[u] - c; - } - - /* - * The scaling factor must be applied at the end. Since y is now - * in fixed-point notation, we have to convert the factor to the - * same format, and do an extra integer multiplication. - */ - z = (uint64_t)fpr_trunc(fpr_mul(ccs, fpr_ptwo63)) << 1; - z0 = (uint32_t)z; - z1 = (uint32_t)(z >> 32); - y0 = (uint32_t)y; - y1 = (uint32_t)(y >> 32); - a = ((uint64_t)z0 * (uint64_t)y1) - + (((uint64_t)z0 * (uint64_t)y0) >> 32); - b = ((uint64_t)z1 * (uint64_t)y0); - y = (a >> 32) + (b >> 32); - y += (((uint64_t)(uint32_t)a + (uint64_t)(uint32_t)b) >> 32); - y += (uint64_t)z1 * (uint64_t)y1; - - return y; -} - -const fpr fpr_gm_tab[] = { - 0, 0, - 9223372036854775808U, 4607182418800017408U, - 4604544271217802189U, 4604544271217802189U, - 13827916308072577997U, 4604544271217802189U, - 4606496786581982534U, 4600565431771507043U, - 13823937468626282851U, 4606496786581982534U, - 4600565431771507043U, 4606496786581982534U, - 13829868823436758342U, 4600565431771507043U, - 4607009347991985328U, 4596196889902818827U, - 13819568926757594635U, 4607009347991985328U, - 4603179351334086856U, 4605664432017547683U, - 13829036468872323491U, 4603179351334086856U, - 4605664432017547683U, 4603179351334086856U, - 13826551388188862664U, 4605664432017547683U, - 4596196889902818827U, 4607009347991985328U, - 
13830381384846761136U, 4596196889902818827U, - 4607139046673687846U, 4591727299969791020U, - 13815099336824566828U, 4607139046673687846U, - 4603889326261607894U, 4605137878724712257U, - 13828509915579488065U, 4603889326261607894U, - 4606118860100255153U, 4602163548591158843U, - 13825535585445934651U, 4606118860100255153U, - 4598900923775164166U, 4606794571824115162U, - 13830166608678890970U, 4598900923775164166U, - 4606794571824115162U, 4598900923775164166U, - 13822272960629939974U, 4606794571824115162U, - 4602163548591158843U, 4606118860100255153U, - 13829490896955030961U, 4602163548591158843U, - 4605137878724712257U, 4603889326261607894U, - 13827261363116383702U, 4605137878724712257U, - 4591727299969791020U, 4607139046673687846U, - 13830511083528463654U, 4591727299969791020U, - 4607171569234046334U, 4587232218149935124U, - 13810604255004710932U, 4607171569234046334U, - 4604224084862889120U, 4604849113969373103U, - 13828221150824148911U, 4604224084862889120U, - 4606317631232591731U, 4601373767755717824U, - 13824745804610493632U, 4606317631232591731U, - 4599740487990714333U, 4606655894547498725U, - 13830027931402274533U, 4599740487990714333U, - 4606912484326125783U, 4597922303871901467U, - 13821294340726677275U, 4606912484326125783U, - 4602805845399633902U, 4605900952042040894U, - 13829272988896816702U, 4602805845399633902U, - 4605409869824231233U, 4603540801876750389U, - 13826912838731526197U, 4605409869824231233U, - 4594454542771183930U, 4607084929468638487U, - 13830456966323414295U, 4594454542771183930U, - 4607084929468638487U, 4594454542771183930U, - 13817826579625959738U, 4607084929468638487U, - 4603540801876750389U, 4605409869824231233U, - 13828781906679007041U, 4603540801876750389U, - 4605900952042040894U, 4602805845399633902U, - 13826177882254409710U, 4605900952042040894U, - 4597922303871901467U, 4606912484326125783U, - 13830284521180901591U, 4597922303871901467U, - 4606655894547498725U, 4599740487990714333U, - 13823112524845490141U, 4606655894547498725U, - 
4601373767755717824U, 4606317631232591731U, - 13829689668087367539U, 4601373767755717824U, - 4604849113969373103U, 4604224084862889120U, - 13827596121717664928U, 4604849113969373103U, - 4587232218149935124U, 4607171569234046334U, - 13830543606088822142U, 4587232218149935124U, - 4607179706000002317U, 4582730748936808062U, - 13806102785791583870U, 4607179706000002317U, - 4604386048625945823U, 4604698657331085206U, - 13828070694185861014U, 4604386048625945823U, - 4606409688975526202U, 4600971798440897930U, - 13824343835295673738U, 4606409688975526202U, - 4600154912527631775U, 4606578871587619388U, - 13829950908442395196U, 4600154912527631775U, - 4606963563043808649U, 4597061974398750563U, - 13820434011253526371U, 4606963563043808649U, - 4602994049708411683U, 4605784983948558848U, - 13829157020803334656U, 4602994049708411683U, - 4605539368864982914U, 4603361638657888991U, - 13826733675512664799U, 4605539368864982914U, - 4595327571478659014U, 4607049811591515049U, - 13830421848446290857U, 4595327571478659014U, - 4607114680469659603U, 4593485039402578702U, - 13816857076257354510U, 4607114680469659603U, - 4603716733069447353U, 4605276012900672507U, - 13828648049755448315U, 4603716733069447353U, - 4606012266443150634U, 4602550884377336506U, - 13825922921232112314U, 4606012266443150634U, - 4598476289818621559U, 4606856142606846307U, - 13830228179461622115U, 4598476289818621559U, - 4606727809065869586U, 4599322407794599425U, - 13822694444649375233U, 4606727809065869586U, - 4601771097584682078U, 4606220668805321205U, - 13829592705660097013U, 4601771097584682078U, - 4604995550503212910U, 4604058477489546729U, - 13827430514344322537U, 4604995550503212910U, - 4589965306122607094U, 4607158013403433018U, - 13830530050258208826U, 4589965306122607094U, - 4607158013403433018U, 4589965306122607094U, - 13813337342977382902U, 4607158013403433018U, - 4604058477489546729U, 4604995550503212910U, - 13828367587357988718U, 4604058477489546729U, - 4606220668805321205U, 4601771097584682078U, - 
13825143134439457886U, 4606220668805321205U, - 4599322407794599425U, 4606727809065869586U, - 13830099845920645394U, 4599322407794599425U, - 4606856142606846307U, 4598476289818621559U, - 13821848326673397367U, 4606856142606846307U, - 4602550884377336506U, 4606012266443150634U, - 13829384303297926442U, 4602550884377336506U, - 4605276012900672507U, 4603716733069447353U, - 13827088769924223161U, 4605276012900672507U, - 4593485039402578702U, 4607114680469659603U, - 13830486717324435411U, 4593485039402578702U, - 4607049811591515049U, 4595327571478659014U, - 13818699608333434822U, 4607049811591515049U, - 4603361638657888991U, 4605539368864982914U, - 13828911405719758722U, 4603361638657888991U, - 4605784983948558848U, 4602994049708411683U, - 13826366086563187491U, 4605784983948558848U, - 4597061974398750563U, 4606963563043808649U, - 13830335599898584457U, 4597061974398750563U, - 4606578871587619388U, 4600154912527631775U, - 13823526949382407583U, 4606578871587619388U, - 4600971798440897930U, 4606409688975526202U, - 13829781725830302010U, 4600971798440897930U, - 4604698657331085206U, 4604386048625945823U, - 13827758085480721631U, 4604698657331085206U, - 4582730748936808062U, 4607179706000002317U, - 13830551742854778125U, 4582730748936808062U, - 4607181740574479067U, 4578227681973159812U, - 13801599718827935620U, 4607181740574479067U, - 4604465633578481725U, 4604621949701367983U, - 13827993986556143791U, 4604465633578481725U, - 4606453861145241227U, 4600769149537129431U, - 13824141186391905239U, 4606453861145241227U, - 4600360675823176935U, 4606538458821337243U, - 13829910495676113051U, 4600360675823176935U, - 4606987119037722413U, 4596629994023683153U, - 13820002030878458961U, 4606987119037722413U, - 4603087070374583113U, 4605725276488455441U, - 13829097313343231249U, 4603087070374583113U, - 4605602459698789090U, 4603270878689749849U, - 13826642915544525657U, 4605602459698789090U, - 4595762727260045105U, 4607030246558998647U, - 13830402283413774455U, 4595762727260045105U, - 
4607127537664763515U, 4592606767730311893U, - 13815978804585087701U, 4607127537664763515U, - 4603803453461190356U, 4605207475328619533U, - 13828579512183395341U, 4603803453461190356U, - 4606066157444814153U, 4602357870542944470U, - 13825729907397720278U, 4606066157444814153U, - 4598688984595225406U, 4606826008603986804U, - 13830198045458762612U, 4598688984595225406U, - 4606761837001494797U, 4599112075441176914U, - 13822484112295952722U, 4606761837001494797U, - 4601967947786150793U, 4606170366472647579U, - 13829542403327423387U, 4601967947786150793U, - 4605067233569943231U, 4603974338538572089U, - 13827346375393347897U, 4605067233569943231U, - 4590846768565625881U, 4607149205763218185U, - 13830521242617993993U, 4590846768565625881U, - 4607165468267934125U, 4588998070480937184U, - 13812370107335712992U, 4607165468267934125U, - 4604141730443515286U, 4604922840319727473U, - 13828294877174503281U, 4604141730443515286U, - 4606269759522929756U, 4601573027631668967U, - 13824945064486444775U, 4606269759522929756U, - 4599531889160152938U, 4606692493141721470U, - 13830064529996497278U, 4599531889160152938U, - 4606884969294623682U, 4598262871476403630U, - 13821634908331179438U, 4606884969294623682U, - 4602710690099904183U, 4605957195211051218U, - 13829329232065827026U, 4602710690099904183U, - 4605343481119364930U, 4603629178146150899U, - 13827001215000926707U, 4605343481119364930U, - 4594016801320007031U, 4607100477024622401U, - 13830472513879398209U, 4594016801320007031U, - 4607068040143112603U, 4594891488091520602U, - 13818263524946296410U, 4607068040143112603U, - 4603451617570386922U, 4605475169017376660U, - 13828847205872152468U, 4603451617570386922U, - 4605843545406134034U, 4602900303344142735U, - 13826272340198918543U, 4605843545406134034U, - 4597492765973365521U, 4606938683557690074U, - 13830310720412465882U, 4597492765973365521U, - 4606618018794815019U, 4599948172872067014U, - 13823320209726842822U, 4606618018794815019U, - 4601173347964633034U, 4606364276725003740U, - 
13829736313579779548U, 4601173347964633034U, - 4604774382555066977U, 4604305528345395596U, - 13827677565200171404U, 4604774382555066977U, - 4585465300892538317U, 4607176315382986589U, - 13830548352237762397U, 4585465300892538317U, - 4607176315382986589U, 4585465300892538317U, - 13808837337747314125U, 4607176315382986589U, - 4604305528345395596U, 4604774382555066977U, - 13828146419409842785U, 4604305528345395596U, - 4606364276725003740U, 4601173347964633034U, - 13824545384819408842U, 4606364276725003740U, - 4599948172872067014U, 4606618018794815019U, - 13829990055649590827U, 4599948172872067014U, - 4606938683557690074U, 4597492765973365521U, - 13820864802828141329U, 4606938683557690074U, - 4602900303344142735U, 4605843545406134034U, - 13829215582260909842U, 4602900303344142735U, - 4605475169017376660U, 4603451617570386922U, - 13826823654425162730U, 4605475169017376660U, - 4594891488091520602U, 4607068040143112603U, - 13830440076997888411U, 4594891488091520602U, - 4607100477024622401U, 4594016801320007031U, - 13817388838174782839U, 4607100477024622401U, - 4603629178146150899U, 4605343481119364930U, - 13828715517974140738U, 4603629178146150899U, - 4605957195211051218U, 4602710690099904183U, - 13826082726954679991U, 4605957195211051218U, - 4598262871476403630U, 4606884969294623682U, - 13830257006149399490U, 4598262871476403630U, - 4606692493141721470U, 4599531889160152938U, - 13822903926014928746U, 4606692493141721470U, - 4601573027631668967U, 4606269759522929756U, - 13829641796377705564U, 4601573027631668967U, - 4604922840319727473U, 4604141730443515286U, - 13827513767298291094U, 4604922840319727473U, - 4588998070480937184U, 4607165468267934125U, - 13830537505122709933U, 4588998070480937184U, - 4607149205763218185U, 4590846768565625881U, - 13814218805420401689U, 4607149205763218185U, - 4603974338538572089U, 4605067233569943231U, - 13828439270424719039U, 4603974338538572089U, - 4606170366472647579U, 4601967947786150793U, - 13825339984640926601U, 4606170366472647579U, - 
4599112075441176914U, 4606761837001494797U, - 13830133873856270605U, 4599112075441176914U, - 4606826008603986804U, 4598688984595225406U, - 13822061021450001214U, 4606826008603986804U, - 4602357870542944470U, 4606066157444814153U, - 13829438194299589961U, 4602357870542944470U, - 4605207475328619533U, 4603803453461190356U, - 13827175490315966164U, 4605207475328619533U, - 4592606767730311893U, 4607127537664763515U, - 13830499574519539323U, 4592606767730311893U, - 4607030246558998647U, 4595762727260045105U, - 13819134764114820913U, 4607030246558998647U, - 4603270878689749849U, 4605602459698789090U, - 13828974496553564898U, 4603270878689749849U, - 4605725276488455441U, 4603087070374583113U, - 13826459107229358921U, 4605725276488455441U, - 4596629994023683153U, 4606987119037722413U, - 13830359155892498221U, 4596629994023683153U, - 4606538458821337243U, 4600360675823176935U, - 13823732712677952743U, 4606538458821337243U, - 4600769149537129431U, 4606453861145241227U, - 13829825898000017035U, 4600769149537129431U, - 4604621949701367983U, 4604465633578481725U, - 13827837670433257533U, 4604621949701367983U, - 4578227681973159812U, 4607181740574479067U, - 13830553777429254875U, 4578227681973159812U, - 4607182249242036882U, 4573724215515480177U, - 13797096252370255985U, 4607182249242036882U, - 4604505071555817232U, 4604583231088591477U, - 13827955267943367285U, 4604505071555817232U, - 4606475480113671417U, 4600667422348321968U, - 13824039459203097776U, 4606475480113671417U, - 4600463181646572228U, 4606517779747998088U, - 13829889816602773896U, 4600463181646572228U, - 4606998399608725124U, 4596413578358834022U, - 13819785615213609830U, 4606998399608725124U, - 4603133304188877240U, 4605694995810664660U, - 13829067032665440468U, 4603133304188877240U, - 4605633586259814045U, 4603225210076562971U, - 13826597246931338779U, 4605633586259814045U, - 4595979936813835462U, 4607019963775302583U, - 13830392000630078391U, 4595979936813835462U, - 4607133460805585796U, 4592167175087283203U, - 
13815539211942059011U, 4607133460805585796U, - 4603846496621587377U, 4605172808754305228U, - 13828544845609081036U, 4603846496621587377U, - 4606092657816072624U, 4602260871257280788U, - 13825632908112056596U, 4606092657816072624U, - 4598795050632330097U, 4606810452769876110U, - 13830182489624651918U, 4598795050632330097U, - 4606778366364612594U, 4599006600037663623U, - 13822378636892439431U, 4606778366364612594U, - 4602065906208722008U, 4606144763310860551U, - 13829516800165636359U, 4602065906208722008U, - 4605102686554936490U, 4603931940768740167U, - 13827303977623515975U, 4605102686554936490U, - 4591287158938884897U, 4607144295058764886U, - 13830516331913540694U, 4591287158938884897U, - 4607168688050493276U, 4588115294056142819U, - 13811487330910918627U, 4607168688050493276U, - 4604183020748362039U, 4604886103475043762U, - 13828258140329819570U, 4604183020748362039U, - 4606293848208650998U, 4601473544562720001U, - 13824845581417495809U, 4606293848208650998U, - 4599636300858866724U, 4606674353838411301U, - 13830046390693187109U, 4599636300858866724U, - 4606898891031025132U, 4598136582470364665U, - 13821508619325140473U, 4606898891031025132U, - 4602758354025980442U, 4605929219593405673U, - 13829301256448181481U, 4602758354025980442U, - 4605376811039722786U, 4603585091850767959U, - 13826957128705543767U, 4605376811039722786U, - 4594235767444503503U, 4607092871118901179U, - 13830464907973676987U, 4594235767444503503U, - 4607076652372832968U, 4594673119063280916U, - 13818045155918056724U, 4607076652372832968U, - 4603496309891590679U, 4605442656228245717U, - 13828814693083021525U, 4603496309891590679U, - 4605872393621214213U, 4602853162432841185U, - 13826225199287616993U, 4605872393621214213U, - 4597707695679609371U, 4606925748668145757U, - 13830297785522921565U, 4597707695679609371U, - 4606637115963965612U, 4599844446633109139U, - 13823216483487884947U, 4606637115963965612U, - 4601273700967202825U, 4606341107699334546U, - 13829713144554110354U, 4601273700967202825U, - 
4604811873195349477U, 4604264921241055824U, - 13827636958095831632U, 4604811873195349477U, - 4586348876009622851U, 4607174111710118367U, - 13830546148564894175U, 4586348876009622851U, - 4607178180169683960U, 4584498631466405633U, - 13807870668321181441U, 4607178180169683960U, - 4604345904647073908U, 4604736643460027021U, - 13828108680314802829U, 4604345904647073908U, - 4606387137437298591U, 4601072712526242277U, - 13824444749381018085U, 4606387137437298591U, - 4600051662802353687U, 4606598603759044570U, - 13829970640613820378U, 4600051662802353687U, - 4606951288507767453U, 4597277522845151878U, - 13820649559699927686U, 4606951288507767453U, - 4602947266358709886U, 4605814408482919348U, - 13829186445337695156U, 4602947266358709886U, - 4605507406967535927U, 4603406726595779752U, - 13826778763450555560U, 4605507406967535927U, - 4595109641634432498U, 4607059093103722971U, - 13830431129958498779U, 4595109641634432498U, - 4607107746899444102U, 4593797652641645341U, - 13817169689496421149U, 4607107746899444102U, - 4603673059103075106U, 4605309881318010327U, - 13828681918172786135U, 4603673059103075106U, - 4605984877841711338U, 4602646891659203088U, - 13826018928513978896U, 4605984877841711338U, - 4598369669086960528U, 4606870719641066940U, - 13830242756495842748U, 4598369669086960528U, - 4606710311774494716U, 4599427256825614420U, - 13822799293680390228U, 4606710311774494716U, - 4601672213217083403U, 4606245366082353408U, - 13829617402937129216U, 4601672213217083403U, - 4604959323120302796U, 4604100215502905499U, - 13827472252357681307U, 4604959323120302796U, - 4589524267239410099U, 4607161910007591876U, - 13830533946862367684U, 4589524267239410099U, - 4607153778602162496U, 4590406145430462614U, - 13813778182285238422U, 4607153778602162496U, - 4604016517974851588U, 4605031521104517324U, - 13828403557959293132U, 4604016517974851588U, - 4606195668621671667U, 4601869677011524443U, - 13825241713866300251U, 4606195668621671667U, - 4599217346014614711U, 4606744984357082948U, - 
13830117021211858756U, 4599217346014614711U, - 4606841238740778884U, 4598582729657176439U, - 13821954766511952247U, 4606841238740778884U, - 4602454542796181607U, 4606039359984203741U, - 13829411396838979549U, 4602454542796181607U, - 4605241877142478242U, 4603760198400967492U, - 13827132235255743300U, 4605241877142478242U, - 4593046061348462537U, 4607121277474223905U, - 13830493314328999713U, 4593046061348462537U, - 4607040195955932526U, 4595545269419264690U, - 13818917306274040498U, 4607040195955932526U, - 4603316355454250015U, 4605571053506370248U, - 13828943090361146056U, 4603316355454250015U, - 4605755272910869620U, 4603040651631881451U, - 13826412688486657259U, 4605755272910869620U, - 4596846128749438754U, 4606975506703684317U, - 13830347543558460125U, 4596846128749438754U, - 4606558823023444576U, 4600257918160607478U, - 13823629955015383286U, 4606558823023444576U, - 4600870609507958271U, 4606431930490633905U, - 13829803967345409713U, 4600870609507958271U, - 4604660425598397818U, 4604425958770613225U, - 13827797995625389033U, 4604660425598397818U, - 4580962600092897021U, 4607180892816495009U, - 13830552929671270817U, 4580962600092897021U, - 4607180892816495009U, 4580962600092897021U, - 13804334636947672829U, 4607180892816495009U, - 4604425958770613225U, 4604660425598397818U, - 13828032462453173626U, 4604425958770613225U, - 4606431930490633905U, 4600870609507958271U, - 13824242646362734079U, 4606431930490633905U, - 4600257918160607478U, 4606558823023444576U, - 13829930859878220384U, 4600257918160607478U, - 4606975506703684317U, 4596846128749438754U, - 13820218165604214562U, 4606975506703684317U, - 4603040651631881451U, 4605755272910869620U, - 13829127309765645428U, 4603040651631881451U, - 4605571053506370248U, 4603316355454250015U, - 13826688392309025823U, 4605571053506370248U, - 4595545269419264690U, 4607040195955932526U, - 13830412232810708334U, 4595545269419264690U, - 4607121277474223905U, 4593046061348462537U, - 13816418098203238345U, 4607121277474223905U, - 
4603760198400967492U, 4605241877142478242U, - 13828613913997254050U, 4603760198400967492U, - 4606039359984203741U, 4602454542796181607U, - 13825826579650957415U, 4606039359984203741U, - 4598582729657176439U, 4606841238740778884U, - 13830213275595554692U, 4598582729657176439U, - 4606744984357082948U, 4599217346014614711U, - 13822589382869390519U, 4606744984357082948U, - 4601869677011524443U, 4606195668621671667U, - 13829567705476447475U, 4601869677011524443U, - 4605031521104517324U, 4604016517974851588U, - 13827388554829627396U, 4605031521104517324U, - 4590406145430462614U, 4607153778602162496U, - 13830525815456938304U, 4590406145430462614U, - 4607161910007591876U, 4589524267239410099U, - 13812896304094185907U, 4607161910007591876U, - 4604100215502905499U, 4604959323120302796U, - 13828331359975078604U, 4604100215502905499U, - 4606245366082353408U, 4601672213217083403U, - 13825044250071859211U, 4606245366082353408U, - 4599427256825614420U, 4606710311774494716U, - 13830082348629270524U, 4599427256825614420U, - 4606870719641066940U, 4598369669086960528U, - 13821741705941736336U, 4606870719641066940U, - 4602646891659203088U, 4605984877841711338U, - 13829356914696487146U, 4602646891659203088U, - 4605309881318010327U, 4603673059103075106U, - 13827045095957850914U, 4605309881318010327U, - 4593797652641645341U, 4607107746899444102U, - 13830479783754219910U, 4593797652641645341U, - 4607059093103722971U, 4595109641634432498U, - 13818481678489208306U, 4607059093103722971U, - 4603406726595779752U, 4605507406967535927U, - 13828879443822311735U, 4603406726595779752U, - 4605814408482919348U, 4602947266358709886U, - 13826319303213485694U, 4605814408482919348U, - 4597277522845151878U, 4606951288507767453U, - 13830323325362543261U, 4597277522845151878U, - 4606598603759044570U, 4600051662802353687U, - 13823423699657129495U, 4606598603759044570U, - 4601072712526242277U, 4606387137437298591U, - 13829759174292074399U, 4601072712526242277U, - 4604736643460027021U, 4604345904647073908U, - 
13827717941501849716U, 4604736643460027021U, - 4584498631466405633U, 4607178180169683960U, - 13830550217024459768U, 4584498631466405633U, - 4607174111710118367U, 4586348876009622851U, - 13809720912864398659U, 4607174111710118367U, - 4604264921241055824U, 4604811873195349477U, - 13828183910050125285U, 4604264921241055824U, - 4606341107699334546U, 4601273700967202825U, - 13824645737821978633U, 4606341107699334546U, - 4599844446633109139U, 4606637115963965612U, - 13830009152818741420U, 4599844446633109139U, - 4606925748668145757U, 4597707695679609371U, - 13821079732534385179U, 4606925748668145757U, - 4602853162432841185U, 4605872393621214213U, - 13829244430475990021U, 4602853162432841185U, - 4605442656228245717U, 4603496309891590679U, - 13826868346746366487U, 4605442656228245717U, - 4594673119063280916U, 4607076652372832968U, - 13830448689227608776U, 4594673119063280916U, - 4607092871118901179U, 4594235767444503503U, - 13817607804299279311U, 4607092871118901179U, - 4603585091850767959U, 4605376811039722786U, - 13828748847894498594U, 4603585091850767959U, - 4605929219593405673U, 4602758354025980442U, - 13826130390880756250U, 4605929219593405673U, - 4598136582470364665U, 4606898891031025132U, - 13830270927885800940U, 4598136582470364665U, - 4606674353838411301U, 4599636300858866724U, - 13823008337713642532U, 4606674353838411301U, - 4601473544562720001U, 4606293848208650998U, - 13829665885063426806U, 4601473544562720001U, - 4604886103475043762U, 4604183020748362039U, - 13827555057603137847U, 4604886103475043762U, - 4588115294056142819U, 4607168688050493276U, - 13830540724905269084U, 4588115294056142819U, - 4607144295058764886U, 4591287158938884897U, - 13814659195793660705U, 4607144295058764886U, - 4603931940768740167U, 4605102686554936490U, - 13828474723409712298U, 4603931940768740167U, - 4606144763310860551U, 4602065906208722008U, - 13825437943063497816U, 4606144763310860551U, - 4599006600037663623U, 4606778366364612594U, - 13830150403219388402U, 4599006600037663623U, - 
4606810452769876110U, 4598795050632330097U, - 13822167087487105905U, 4606810452769876110U, - 4602260871257280788U, 4606092657816072624U, - 13829464694670848432U, 4602260871257280788U, - 4605172808754305228U, 4603846496621587377U, - 13827218533476363185U, 4605172808754305228U, - 4592167175087283203U, 4607133460805585796U, - 13830505497660361604U, 4592167175087283203U, - 4607019963775302583U, 4595979936813835462U, - 13819351973668611270U, 4607019963775302583U, - 4603225210076562971U, 4605633586259814045U, - 13829005623114589853U, 4603225210076562971U, - 4605694995810664660U, 4603133304188877240U, - 13826505341043653048U, 4605694995810664660U, - 4596413578358834022U, 4606998399608725124U, - 13830370436463500932U, 4596413578358834022U, - 4606517779747998088U, 4600463181646572228U, - 13823835218501348036U, 4606517779747998088U, - 4600667422348321968U, 4606475480113671417U, - 13829847516968447225U, 4600667422348321968U, - 4604583231088591477U, 4604505071555817232U, - 13827877108410593040U, 4604583231088591477U, - 4573724215515480177U, 4607182249242036882U, - 13830554286096812690U, 4573724215515480177U, - 4607182376410422530U, 4569220649180767418U, - 13792592686035543226U, 4607182376410422530U, - 4604524701268679793U, 4604563781218984604U, - 13827935818073760412U, 4604524701268679793U, - 4606486172460753999U, 4600616459743653188U, - 13823988496598428996U, 4606486172460753999U, - 4600514338912178239U, 4606507322377452870U, - 13829879359232228678U, 4600514338912178239U, - 4607003915349878877U, 4596305267720071930U, - 13819677304574847738U, 4607003915349878877U, - 4603156351203636159U, 4605679749231851918U, - 13829051786086627726U, 4603156351203636159U, - 4605649044311923410U, 4603202304363743346U, - 13826574341218519154U, 4605649044311923410U, - 4596088445927168004U, 4607014697483910382U, - 13830386734338686190U, 4596088445927168004U, - 4607136295912168606U, 4591947271803021404U, - 13815319308657797212U, 4607136295912168606U, - 4603867938232615808U, 4605155376589456981U, - 
13828527413444232789U, 4603867938232615808U, - 4606105796280968177U, 4602212250118051877U, - 13825584286972827685U, 4606105796280968177U, - 4598848011564831930U, 4606802552898869248U, - 13830174589753645056U, 4598848011564831930U, - 4606786509620734768U, 4598953786765296928U, - 13822325823620072736U, 4606786509620734768U, - 4602114767134999006U, 4606131849150971908U, - 13829503886005747716U, 4602114767134999006U, - 4605120315324767624U, 4603910660507251362U, - 13827282697362027170U, 4605120315324767624U, - 4591507261658050721U, 4607141713064252300U, - 13830513749919028108U, 4591507261658050721U, - 4607170170974224083U, 4587673791460508439U, - 13811045828315284247U, 4607170170974224083U, - 4604203581176243359U, 4604867640218014515U, - 13828239677072790323U, 4604203581176243359U, - 4606305777984577632U, 4601423692641949331U, - 13824795729496725139U, 4606305777984577632U, - 4599688422741010356U, 4606665164148251002U, - 13830037201003026810U, 4599688422741010356U, - 4606905728766014348U, 4598029484874872834U, - 13821401521729648642U, 4606905728766014348U, - 4602782121393764535U, 4605915122243179241U, - 13829287159097955049U, 4602782121393764535U, - 4605393374401988274U, 4603562972219549215U, - 13826935009074325023U, 4605393374401988274U, - 4594345179472540681U, 4607088942243446236U, - 13830460979098222044U, 4594345179472540681U, - 4607080832832247697U, 4594563856311064231U, - 13817935893165840039U, 4607080832832247697U, - 4603518581031047189U, 4605426297151190466U, - 13828798334005966274U, 4603518581031047189U, - 4605886709123365959U, 4602829525820289164U, - 13826201562675064972U, 4605886709123365959U, - 4597815040470278984U, 4606919157647773535U, - 13830291194502549343U, 4597815040470278984U, - 4606646545123403481U, 4599792496117920694U, - 13823164532972696502U, 4606646545123403481U, - 4601323770373937522U, 4606329407841126011U, - 13829701444695901819U, 4601323770373937522U, - 4604830524903495634U, 4604244531615310815U, - 13827616568470086623U, 4604830524903495634U, - 
4586790578280679046U, 4607172882816799076U, - 13830544919671574884U, 4586790578280679046U, - 4607178985458280057U, 4583614727651146525U, - 13806986764505922333U, 4607178985458280057U, - 4604366005771528720U, 4604717681185626434U, - 13828089718040402242U, 4604366005771528720U, - 4606398451906509788U, 4601022290077223616U, - 13824394326931999424U, 4606398451906509788U, - 4600103317933788342U, 4606588777269136769U, - 13829960814123912577U, 4600103317933788342U, - 4606957467106717424U, 4597169786279785693U, - 13820541823134561501U, 4606957467106717424U, - 4602970680601913687U, 4605799732098147061U, - 13829171768952922869U, 4602970680601913687U, - 4605523422498301790U, 4603384207141321914U, - 13826756243996097722U, 4605523422498301790U, - 4595218635031890910U, 4607054494135176056U, - 13830426530989951864U, 4595218635031890910U, - 4607111255739239816U, 4593688012422887515U, - 13817060049277663323U, 4607111255739239816U, - 4603694922063032361U, 4605292980606880364U, - 13828665017461656172U, 4603694922063032361U, - 4605998608960791335U, 4602598930031891166U, - 13825970966886666974U, 4605998608960791335U, - 4598423001813699022U, 4606863472012527185U, - 13830235508867302993U, 4598423001813699022U, - 4606719100629313491U, 4599374859150636784U, - 13822746896005412592U, 4606719100629313491U, - 4601721693286060937U, 4606233055365547081U, - 13829605092220322889U, 4601721693286060937U, - 4604977468824438271U, 4604079374282302598U, - 13827451411137078406U, 4604977468824438271U, - 4589744810590291021U, 4607160003989618959U, - 13830532040844394767U, 4589744810590291021U, - 4607155938267770208U, 4590185751760970393U, - 13813557788615746201U, 4607155938267770208U, - 4604037525321326463U, 4605013567986435066U, - 13828385604841210874U, 4604037525321326463U, - 4606208206518262803U, 4601820425647934753U, - 13825192462502710561U, 4606208206518262803U, - 4599269903251194481U, 4606736437002195879U, - 13830108473856971687U, 4599269903251194481U, - 4606848731493011465U, 4598529532600161144U, - 
13821901569454936952U, 4606848731493011465U, - 4602502755147763107U, 4606025850160239809U, - 13829397887015015617U, 4602502755147763107U, - 4605258978359093269U, 4603738491917026584U, - 13827110528771802392U, 4605258978359093269U, - 4593265590854265407U, 4607118021058468598U, - 13830490057913244406U, 4593265590854265407U, - 4607045045516813836U, 4595436449949385485U, - 13818808486804161293U, 4607045045516813836U, - 4603339021357904144U, 4605555245917486022U, - 13828927282772261830U, 4603339021357904144U, - 4605770164172969910U, 4603017373458244943U, - 13826389410313020751U, 4605770164172969910U, - 4596954088216812973U, 4606969576261663845U, - 13830341613116439653U, 4596954088216812973U, - 4606568886807728474U, 4600206446098256018U, - 13823578482953031826U, 4606568886807728474U, - 4600921238092511730U, 4606420848538580260U, - 13829792885393356068U, 4600921238092511730U, - 4604679572075463103U, 4604406033021674239U, - 13827778069876450047U, 4604679572075463103U, - 4581846703643734566U, 4607180341788068727U, - 13830552378642844535U, 4581846703643734566U, - 4607181359080094673U, 4579996072175835083U, - 13803368109030610891U, 4607181359080094673U, - 4604445825685214043U, 4604641218080103285U, - 13828013254934879093U, 4604445825685214043U, - 4606442934727379583U, 4600819913163773071U, - 13824191950018548879U, 4606442934727379583U, - 4600309328230211502U, 4606548680329491866U, - 13829920717184267674U, 4600309328230211502U, - 4606981354314050484U, 4596738097012783531U, - 13820110133867559339U, 4606981354314050484U, - 4603063884010218172U, 4605740310302420207U, - 13829112347157196015U, 4603063884010218172U, - 4605586791482848547U, 4603293641160266722U, - 13826665678015042530U, 4605586791482848547U, - 4595654028864046335U, 4607035262954517034U, - 13830407299809292842U, 4595654028864046335U, - 4607124449686274900U, 4592826452951465409U, - 13816198489806241217U, 4607124449686274900U, - 4603781852316960384U, 4605224709411790590U, - 13828596746266566398U, 4603781852316960384U, - 
4606052795787882823U, 4602406247776385022U, - 13825778284631160830U, 4606052795787882823U, - 4598635880488956483U, 4606833664420673202U, - 13830205701275449010U, 4598635880488956483U, - 4606753451050079834U, 4599164736579548843U, - 13822536773434324651U, 4606753451050079834U, - 4601918851211878557U, 4606183055233559255U, - 13829555092088335063U, 4601918851211878557U, - 4605049409688478101U, 4603995455647851249U, - 13827367492502627057U, 4605049409688478101U, - 4590626485056654602U, 4607151534426937478U, - 13830523571281713286U, 4590626485056654602U, - 4607163731439411601U, 4589303678145802340U, - 13812675715000578148U, 4607163731439411601U, - 4604121000955189926U, 4604941113561600762U, - 13828313150416376570U, 4604121000955189926U, - 4606257600839867033U, 4601622657843474729U, - 13824994694698250537U, 4606257600839867033U, - 4599479600326345459U, 4606701442584137310U, - 13830073479438913118U, 4599479600326345459U, - 4606877885424248132U, 4598316292140394014U, - 13821688328995169822U, 4606877885424248132U, - 4602686793990243041U, 4605971073215153165U, - 13829343110069928973U, 4602686793990243041U, - 4605326714874986465U, 4603651144395358093U, - 13827023181250133901U, 4605326714874986465U, - 4593907249284540294U, 4607104153983298999U, - 13830476190838074807U, 4593907249284540294U, - 4607063608453868552U, 4595000592312171144U, - 13818372629166946952U, 4607063608453868552U, - 4603429196809300824U, 4605491322423429598U, - 13828863359278205406U, 4603429196809300824U, - 4605829012964735987U, 4602923807199184054U, - 13826295844053959862U, 4605829012964735987U, - 4597385183080791534U, 4606945027305114062U, - 13830317064159889870U, 4597385183080791534U, - 4606608350964852124U, 4599999947619525579U, - 13823371984474301387U, 4606608350964852124U, - 4601123065313358619U, 4606375745674388705U, - 13829747782529164513U, 4601123065313358619U, - 4604755543975806820U, 4604325745441780828U, - 13827697782296556636U, 4604755543975806820U, - 4585023436363055487U, 4607177290141793710U, - 
13830549326996569518U, 4585023436363055487U, - 4607175255902437396U, 4585907115494236537U, - 13809279152349012345U, 4607175255902437396U, - 4604285253548209224U, 4604793159020491611U, - 13828165195875267419U, 4604285253548209224U, - 4606352730697093817U, 4601223560006786057U, - 13824595596861561865U, 4606352730697093817U, - 4599896339047301634U, 4606627607157935956U, - 13829999644012711764U, 4599896339047301634U, - 4606932257325205256U, 4597600270510262682U, - 13820972307365038490U, 4606932257325205256U, - 4602876755014813164U, 4605858005670328613U, - 13829230042525104421U, 4602876755014813164U, - 4605458946901419122U, 4603473988668005304U, - 13826846025522781112U, 4605458946901419122U, - 4594782329999411347U, 4607072388129742377U, - 13830444424984518185U, 4594782329999411347U, - 4607096716058023245U, 4594126307716900071U, - 13817498344571675879U, 4607096716058023245U, - 4603607160562208225U, 4605360179893335444U, - 13828732216748111252U, 4603607160562208225U, - 4605943243960030558U, 4602734543519989142U, - 13826106580374764950U, 4605943243960030558U, - 4598209407597805010U, 4606891971185517504U, - 13830264008040293312U, 4598209407597805010U, - 4606683463531482757U, 4599584122834874440U, - 13822956159689650248U, 4606683463531482757U, - 4601523323048804569U, 4606281842017099424U, - 13829653878871875232U, 4601523323048804569U, - 4604904503566677638U, 4604162403772767740U, - 13827534440627543548U, 4604904503566677638U, - 4588556721781247689U, 4607167120476811757U, - 13830539157331587565U, 4588556721781247689U, - 4607146792632922887U, 4591066993883984169U, - 13814439030738759977U, 4607146792632922887U, - 4603953166845776383U, 4605084992581147553U, - 13828457029435923361U, 4603953166845776383U, - 4606157602458368090U, 4602016966272225497U, - 13825389003127001305U, 4606157602458368090U, - 4599059363095165615U, 4606770142132396069U, - 13830142178987171877U, 4599059363095165615U, - 4606818271362779153U, 4598742041476147134U, - 13822114078330922942U, 4606818271362779153U, - 
4602309411551204896U, 4606079444829232727U, - 13829451481684008535U, 4602309411551204896U, - 4605190175055178825U, 4603825001630339212U, - 13827197038485115020U, 4605190175055178825U, - 4592387007752762956U, 4607130541380624519U, - 13830502578235400327U, 4592387007752762956U, - 4607025146816593591U, 4595871363584150300U, - 13819243400438926108U, 4607025146816593591U, - 4603248068256948438U, 4605618058006716661U, - 13828990094861492469U, 4603248068256948438U, - 4605710171610479304U, 4603110210506737381U, - 13826482247361513189U, 4605710171610479304U, - 4596521820799644122U, 4606992800820440327U, - 13830364837675216135U, 4596521820799644122U, - 4606528158595189433U, 4600411960456200676U, - 13823783997310976484U, 4606528158595189433U, - 4600718319105833937U, 4606464709641375231U, - 13829836746496151039U, 4600718319105833937U, - 4604602620643553229U, 4604485382263976838U, - 13827857419118752646U, 4604602620643553229U, - 4576459225186735875U, 4607182037296057423U, - 13830554074150833231U, 4576459225186735875U, - 4607182037296057423U, 4576459225186735875U, - 13799831262041511683U, 4607182037296057423U, - 4604485382263976838U, 4604602620643553229U, - 13827974657498329037U, 4604485382263976838U, - 4606464709641375231U, 4600718319105833937U, - 13824090355960609745U, 4606464709641375231U, - 4600411960456200676U, 4606528158595189433U, - 13829900195449965241U, 4600411960456200676U, - 4606992800820440327U, 4596521820799644122U, - 13819893857654419930U, 4606992800820440327U, - 4603110210506737381U, 4605710171610479304U, - 13829082208465255112U, 4603110210506737381U, - 4605618058006716661U, 4603248068256948438U, - 13826620105111724246U, 4605618058006716661U, - 4595871363584150300U, 4607025146816593591U, - 13830397183671369399U, 4595871363584150300U, - 4607130541380624519U, 4592387007752762956U, - 13815759044607538764U, 4607130541380624519U, - 4603825001630339212U, 4605190175055178825U, - 13828562211909954633U, 4603825001630339212U, - 4606079444829232727U, 4602309411551204896U, - 
13825681448405980704U, 4606079444829232727U, - 4598742041476147134U, 4606818271362779153U, - 13830190308217554961U, 4598742041476147134U, - 4606770142132396069U, 4599059363095165615U, - 13822431399949941423U, 4606770142132396069U, - 4602016966272225497U, 4606157602458368090U, - 13829529639313143898U, 4602016966272225497U, - 4605084992581147553U, 4603953166845776383U, - 13827325203700552191U, 4605084992581147553U, - 4591066993883984169U, 4607146792632922887U, - 13830518829487698695U, 4591066993883984169U, - 4607167120476811757U, 4588556721781247689U, - 13811928758636023497U, 4607167120476811757U, - 4604162403772767740U, 4604904503566677638U, - 13828276540421453446U, 4604162403772767740U, - 4606281842017099424U, 4601523323048804569U, - 13824895359903580377U, 4606281842017099424U, - 4599584122834874440U, 4606683463531482757U, - 13830055500386258565U, 4599584122834874440U, - 4606891971185517504U, 4598209407597805010U, - 13821581444452580818U, 4606891971185517504U, - 4602734543519989142U, 4605943243960030558U, - 13829315280814806366U, 4602734543519989142U, - 4605360179893335444U, 4603607160562208225U, - 13826979197416984033U, 4605360179893335444U, - 4594126307716900071U, 4607096716058023245U, - 13830468752912799053U, 4594126307716900071U, - 4607072388129742377U, 4594782329999411347U, - 13818154366854187155U, 4607072388129742377U, - 4603473988668005304U, 4605458946901419122U, - 13828830983756194930U, 4603473988668005304U, - 4605858005670328613U, 4602876755014813164U, - 13826248791869588972U, 4605858005670328613U, - 4597600270510262682U, 4606932257325205256U, - 13830304294179981064U, 4597600270510262682U, - 4606627607157935956U, 4599896339047301634U, - 13823268375902077442U, 4606627607157935956U, - 4601223560006786057U, 4606352730697093817U, - 13829724767551869625U, 4601223560006786057U, - 4604793159020491611U, 4604285253548209224U, - 13827657290402985032U, 4604793159020491611U, - 4585907115494236537U, 4607175255902437396U, - 13830547292757213204U, 4585907115494236537U, - 
4607177290141793710U, 4585023436363055487U, - 13808395473217831295U, 4607177290141793710U, - 4604325745441780828U, 4604755543975806820U, - 13828127580830582628U, 4604325745441780828U, - 4606375745674388705U, 4601123065313358619U, - 13824495102168134427U, 4606375745674388705U, - 4599999947619525579U, 4606608350964852124U, - 13829980387819627932U, 4599999947619525579U, - 4606945027305114062U, 4597385183080791534U, - 13820757219935567342U, 4606945027305114062U, - 4602923807199184054U, 4605829012964735987U, - 13829201049819511795U, 4602923807199184054U, - 4605491322423429598U, 4603429196809300824U, - 13826801233664076632U, 4605491322423429598U, - 4595000592312171144U, 4607063608453868552U, - 13830435645308644360U, 4595000592312171144U, - 4607104153983298999U, 4593907249284540294U, - 13817279286139316102U, 4607104153983298999U, - 4603651144395358093U, 4605326714874986465U, - 13828698751729762273U, 4603651144395358093U, - 4605971073215153165U, 4602686793990243041U, - 13826058830845018849U, 4605971073215153165U, - 4598316292140394014U, 4606877885424248132U, - 13830249922279023940U, 4598316292140394014U, - 4606701442584137310U, 4599479600326345459U, - 13822851637181121267U, 4606701442584137310U, - 4601622657843474729U, 4606257600839867033U, - 13829629637694642841U, 4601622657843474729U, - 4604941113561600762U, 4604121000955189926U, - 13827493037809965734U, 4604941113561600762U, - 4589303678145802340U, 4607163731439411601U, - 13830535768294187409U, 4589303678145802340U, - 4607151534426937478U, 4590626485056654602U, - 13813998521911430410U, 4607151534426937478U, - 4603995455647851249U, 4605049409688478101U, - 13828421446543253909U, 4603995455647851249U, - 4606183055233559255U, 4601918851211878557U, - 13825290888066654365U, 4606183055233559255U, - 4599164736579548843U, 4606753451050079834U, - 13830125487904855642U, 4599164736579548843U, - 4606833664420673202U, 4598635880488956483U, - 13822007917343732291U, 4606833664420673202U, - 4602406247776385022U, 4606052795787882823U, - 
13829424832642658631U, 4602406247776385022U, - 4605224709411790590U, 4603781852316960384U, - 13827153889171736192U, 4605224709411790590U, - 4592826452951465409U, 4607124449686274900U, - 13830496486541050708U, 4592826452951465409U, - 4607035262954517034U, 4595654028864046335U, - 13819026065718822143U, 4607035262954517034U, - 4603293641160266722U, 4605586791482848547U, - 13828958828337624355U, 4603293641160266722U, - 4605740310302420207U, 4603063884010218172U, - 13826435920864993980U, 4605740310302420207U, - 4596738097012783531U, 4606981354314050484U, - 13830353391168826292U, 4596738097012783531U, - 4606548680329491866U, 4600309328230211502U, - 13823681365084987310U, 4606548680329491866U, - 4600819913163773071U, 4606442934727379583U, - 13829814971582155391U, 4600819913163773071U, - 4604641218080103285U, 4604445825685214043U, - 13827817862539989851U, 4604641218080103285U, - 4579996072175835083U, 4607181359080094673U, - 13830553395934870481U, 4579996072175835083U, - 4607180341788068727U, 4581846703643734566U, - 13805218740498510374U, 4607180341788068727U, - 4604406033021674239U, 4604679572075463103U, - 13828051608930238911U, 4604406033021674239U, - 4606420848538580260U, 4600921238092511730U, - 13824293274947287538U, 4606420848538580260U, - 4600206446098256018U, 4606568886807728474U, - 13829940923662504282U, 4600206446098256018U, - 4606969576261663845U, 4596954088216812973U, - 13820326125071588781U, 4606969576261663845U, - 4603017373458244943U, 4605770164172969910U, - 13829142201027745718U, 4603017373458244943U, - 4605555245917486022U, 4603339021357904144U, - 13826711058212679952U, 4605555245917486022U, - 4595436449949385485U, 4607045045516813836U, - 13830417082371589644U, 4595436449949385485U, - 4607118021058468598U, 4593265590854265407U, - 13816637627709041215U, 4607118021058468598U, - 4603738491917026584U, 4605258978359093269U, - 13828631015213869077U, 4603738491917026584U, - 4606025850160239809U, 4602502755147763107U, - 13825874792002538915U, 4606025850160239809U, - 
4598529532600161144U, 4606848731493011465U, - 13830220768347787273U, 4598529532600161144U, - 4606736437002195879U, 4599269903251194481U, - 13822641940105970289U, 4606736437002195879U, - 4601820425647934753U, 4606208206518262803U, - 13829580243373038611U, 4601820425647934753U, - 4605013567986435066U, 4604037525321326463U, - 13827409562176102271U, 4605013567986435066U, - 4590185751760970393U, 4607155938267770208U, - 13830527975122546016U, 4590185751760970393U, - 4607160003989618959U, 4589744810590291021U, - 13813116847445066829U, 4607160003989618959U, - 4604079374282302598U, 4604977468824438271U, - 13828349505679214079U, 4604079374282302598U, - 4606233055365547081U, 4601721693286060937U, - 13825093730140836745U, 4606233055365547081U, - 4599374859150636784U, 4606719100629313491U, - 13830091137484089299U, 4599374859150636784U, - 4606863472012527185U, 4598423001813699022U, - 13821795038668474830U, 4606863472012527185U, - 4602598930031891166U, 4605998608960791335U, - 13829370645815567143U, 4602598930031891166U, - 4605292980606880364U, 4603694922063032361U, - 13827066958917808169U, 4605292980606880364U, - 4593688012422887515U, 4607111255739239816U, - 13830483292594015624U, 4593688012422887515U, - 4607054494135176056U, 4595218635031890910U, - 13818590671886666718U, 4607054494135176056U, - 4603384207141321914U, 4605523422498301790U, - 13828895459353077598U, 4603384207141321914U, - 4605799732098147061U, 4602970680601913687U, - 13826342717456689495U, 4605799732098147061U, - 4597169786279785693U, 4606957467106717424U, - 13830329503961493232U, 4597169786279785693U, - 4606588777269136769U, 4600103317933788342U, - 13823475354788564150U, 4606588777269136769U, - 4601022290077223616U, 4606398451906509788U, - 13829770488761285596U, 4601022290077223616U, - 4604717681185626434U, 4604366005771528720U, - 13827738042626304528U, 4604717681185626434U, - 4583614727651146525U, 4607178985458280057U, - 13830551022313055865U, 4583614727651146525U, - 4607172882816799076U, 4586790578280679046U, - 
13810162615135454854U, 4607172882816799076U, - 4604244531615310815U, 4604830524903495634U, - 13828202561758271442U, 4604244531615310815U, - 4606329407841126011U, 4601323770373937522U, - 13824695807228713330U, 4606329407841126011U, - 4599792496117920694U, 4606646545123403481U, - 13830018581978179289U, 4599792496117920694U, - 4606919157647773535U, 4597815040470278984U, - 13821187077325054792U, 4606919157647773535U, - 4602829525820289164U, 4605886709123365959U, - 13829258745978141767U, 4602829525820289164U, - 4605426297151190466U, 4603518581031047189U, - 13826890617885822997U, 4605426297151190466U, - 4594563856311064231U, 4607080832832247697U, - 13830452869687023505U, 4594563856311064231U, - 4607088942243446236U, 4594345179472540681U, - 13817717216327316489U, 4607088942243446236U, - 4603562972219549215U, 4605393374401988274U, - 13828765411256764082U, 4603562972219549215U, - 4605915122243179241U, 4602782121393764535U, - 13826154158248540343U, 4605915122243179241U, - 4598029484874872834U, 4606905728766014348U, - 13830277765620790156U, 4598029484874872834U, - 4606665164148251002U, 4599688422741010356U, - 13823060459595786164U, 4606665164148251002U, - 4601423692641949331U, 4606305777984577632U, - 13829677814839353440U, 4601423692641949331U, - 4604867640218014515U, 4604203581176243359U, - 13827575618031019167U, 4604867640218014515U, - 4587673791460508439U, 4607170170974224083U, - 13830542207828999891U, 4587673791460508439U, - 4607141713064252300U, 4591507261658050721U, - 13814879298512826529U, 4607141713064252300U, - 4603910660507251362U, 4605120315324767624U, - 13828492352179543432U, 4603910660507251362U, - 4606131849150971908U, 4602114767134999006U, - 13825486803989774814U, 4606131849150971908U, - 4598953786765296928U, 4606786509620734768U, - 13830158546475510576U, 4598953786765296928U, - 4606802552898869248U, 4598848011564831930U, - 13822220048419607738U, 4606802552898869248U, - 4602212250118051877U, 4606105796280968177U, - 13829477833135743985U, 4602212250118051877U, - 
4605155376589456981U, 4603867938232615808U, - 13827239975087391616U, 4605155376589456981U, - 4591947271803021404U, 4607136295912168606U, - 13830508332766944414U, 4591947271803021404U, - 4607014697483910382U, 4596088445927168004U, - 13819460482781943812U, 4607014697483910382U, - 4603202304363743346U, 4605649044311923410U, - 13829021081166699218U, 4603202304363743346U, - 4605679749231851918U, 4603156351203636159U, - 13826528388058411967U, 4605679749231851918U, - 4596305267720071930U, 4607003915349878877U, - 13830375952204654685U, 4596305267720071930U, - 4606507322377452870U, 4600514338912178239U, - 13823886375766954047U, 4606507322377452870U, - 4600616459743653188U, 4606486172460753999U, - 13829858209315529807U, 4600616459743653188U, - 4604563781218984604U, 4604524701268679793U, - 13827896738123455601U, 4604563781218984604U, - 4569220649180767418U, 4607182376410422530U, - 13830554413265198338U, 4569220649180767418U -}; - -const fpr fpr_p2_tab[] = { - 4611686018427387904U, - 4607182418800017408U, - 4602678819172646912U, - 4598175219545276416U, - 4593671619917905920U, - 4589168020290535424U, - 4584664420663164928U, - 4580160821035794432U, - 4575657221408423936U, - 4571153621781053440U, - 4566650022153682944U -}; - -#elif FALCON_FPNATIVE // yyyFPEMU+0 yyyFPNATIVE+1 - -const fpr fpr_gm_tab[] = { - {0}, {0}, /* unused */ - {-0.000000000000000000000000000}, { 1.000000000000000000000000000}, - { 0.707106781186547524400844362}, { 0.707106781186547524400844362}, - {-0.707106781186547524400844362}, { 0.707106781186547524400844362}, - { 0.923879532511286756128183189}, { 0.382683432365089771728459984}, - {-0.382683432365089771728459984}, { 0.923879532511286756128183189}, - { 0.382683432365089771728459984}, { 0.923879532511286756128183189}, - {-0.923879532511286756128183189}, { 0.382683432365089771728459984}, - { 0.980785280403230449126182236}, { 0.195090322016128267848284868}, - {-0.195090322016128267848284868}, { 0.980785280403230449126182236}, - { 
0.555570233019602224742830814}, { 0.831469612302545237078788378}, - {-0.831469612302545237078788378}, { 0.555570233019602224742830814}, - { 0.831469612302545237078788378}, { 0.555570233019602224742830814}, - {-0.555570233019602224742830814}, { 0.831469612302545237078788378}, - { 0.195090322016128267848284868}, { 0.980785280403230449126182236}, - {-0.980785280403230449126182236}, { 0.195090322016128267848284868}, - { 0.995184726672196886244836953}, { 0.098017140329560601994195564}, - {-0.098017140329560601994195564}, { 0.995184726672196886244836953}, - { 0.634393284163645498215171613}, { 0.773010453362736960810906610}, - {-0.773010453362736960810906610}, { 0.634393284163645498215171613}, - { 0.881921264348355029712756864}, { 0.471396736825997648556387626}, - {-0.471396736825997648556387626}, { 0.881921264348355029712756864}, - { 0.290284677254462367636192376}, { 0.956940335732208864935797887}, - {-0.956940335732208864935797887}, { 0.290284677254462367636192376}, - { 0.956940335732208864935797887}, { 0.290284677254462367636192376}, - {-0.290284677254462367636192376}, { 0.956940335732208864935797887}, - { 0.471396736825997648556387626}, { 0.881921264348355029712756864}, - {-0.881921264348355029712756864}, { 0.471396736825997648556387626}, - { 0.773010453362736960810906610}, { 0.634393284163645498215171613}, - {-0.634393284163645498215171613}, { 0.773010453362736960810906610}, - { 0.098017140329560601994195564}, { 0.995184726672196886244836953}, - {-0.995184726672196886244836953}, { 0.098017140329560601994195564}, - { 0.998795456205172392714771605}, { 0.049067674327418014254954977}, - {-0.049067674327418014254954977}, { 0.998795456205172392714771605}, - { 0.671558954847018400625376850}, { 0.740951125354959091175616897}, - {-0.740951125354959091175616897}, { 0.671558954847018400625376850}, - { 0.903989293123443331586200297}, { 0.427555093430282094320966857}, - {-0.427555093430282094320966857}, { 0.903989293123443331586200297}, - { 0.336889853392220050689253213}, { 
0.941544065183020778412509403}, - {-0.941544065183020778412509403}, { 0.336889853392220050689253213}, - { 0.970031253194543992603984207}, { 0.242980179903263889948274162}, - {-0.242980179903263889948274162}, { 0.970031253194543992603984207}, - { 0.514102744193221726593693839}, { 0.857728610000272069902269984}, - {-0.857728610000272069902269984}, { 0.514102744193221726593693839}, - { 0.803207531480644909806676513}, { 0.595699304492433343467036529}, - {-0.595699304492433343467036529}, { 0.803207531480644909806676513}, - { 0.146730474455361751658850130}, { 0.989176509964780973451673738}, - {-0.989176509964780973451673738}, { 0.146730474455361751658850130}, - { 0.989176509964780973451673738}, { 0.146730474455361751658850130}, - {-0.146730474455361751658850130}, { 0.989176509964780973451673738}, - { 0.595699304492433343467036529}, { 0.803207531480644909806676513}, - {-0.803207531480644909806676513}, { 0.595699304492433343467036529}, - { 0.857728610000272069902269984}, { 0.514102744193221726593693839}, - {-0.514102744193221726593693839}, { 0.857728610000272069902269984}, - { 0.242980179903263889948274162}, { 0.970031253194543992603984207}, - {-0.970031253194543992603984207}, { 0.242980179903263889948274162}, - { 0.941544065183020778412509403}, { 0.336889853392220050689253213}, - {-0.336889853392220050689253213}, { 0.941544065183020778412509403}, - { 0.427555093430282094320966857}, { 0.903989293123443331586200297}, - {-0.903989293123443331586200297}, { 0.427555093430282094320966857}, - { 0.740951125354959091175616897}, { 0.671558954847018400625376850}, - {-0.671558954847018400625376850}, { 0.740951125354959091175616897}, - { 0.049067674327418014254954977}, { 0.998795456205172392714771605}, - {-0.998795456205172392714771605}, { 0.049067674327418014254954977}, - { 0.999698818696204220115765650}, { 0.024541228522912288031734529}, - {-0.024541228522912288031734529}, { 0.999698818696204220115765650}, - { 0.689540544737066924616730630}, { 0.724247082951466920941069243}, - 
{-0.724247082951466920941069243}, { 0.689540544737066924616730630}, - { 0.914209755703530654635014829}, { 0.405241314004989870908481306}, - {-0.405241314004989870908481306}, { 0.914209755703530654635014829}, - { 0.359895036534988148775104572}, { 0.932992798834738887711660256}, - {-0.932992798834738887711660256}, { 0.359895036534988148775104572}, - { 0.975702130038528544460395766}, { 0.219101240156869797227737547}, - {-0.219101240156869797227737547}, { 0.975702130038528544460395766}, - { 0.534997619887097210663076905}, { 0.844853565249707073259571205}, - {-0.844853565249707073259571205}, { 0.534997619887097210663076905}, - { 0.817584813151583696504920884}, { 0.575808191417845300745972454}, - {-0.575808191417845300745972454}, { 0.817584813151583696504920884}, - { 0.170961888760301226363642357}, { 0.985277642388941244774018433}, - {-0.985277642388941244774018433}, { 0.170961888760301226363642357}, - { 0.992479534598709998156767252}, { 0.122410675199216198498704474}, - {-0.122410675199216198498704474}, { 0.992479534598709998156767252}, - { 0.615231590580626845484913563}, { 0.788346427626606262009164705}, - {-0.788346427626606262009164705}, { 0.615231590580626845484913563}, - { 0.870086991108711418652292404}, { 0.492898192229784036873026689}, - {-0.492898192229784036873026689}, { 0.870086991108711418652292404}, - { 0.266712757474898386325286515}, { 0.963776065795439866686464356}, - {-0.963776065795439866686464356}, { 0.266712757474898386325286515}, - { 0.949528180593036667195936074}, { 0.313681740398891476656478846}, - {-0.313681740398891476656478846}, { 0.949528180593036667195936074}, - { 0.449611329654606600046294579}, { 0.893224301195515320342416447}, - {-0.893224301195515320342416447}, { 0.449611329654606600046294579}, - { 0.757208846506484547575464054}, { 0.653172842953776764084203014}, - {-0.653172842953776764084203014}, { 0.757208846506484547575464054}, - { 0.073564563599667423529465622}, { 0.997290456678690216135597140}, - {-0.997290456678690216135597140}, { 
0.073564563599667423529465622}, - { 0.997290456678690216135597140}, { 0.073564563599667423529465622}, - {-0.073564563599667423529465622}, { 0.997290456678690216135597140}, - { 0.653172842953776764084203014}, { 0.757208846506484547575464054}, - {-0.757208846506484547575464054}, { 0.653172842953776764084203014}, - { 0.893224301195515320342416447}, { 0.449611329654606600046294579}, - {-0.449611329654606600046294579}, { 0.893224301195515320342416447}, - { 0.313681740398891476656478846}, { 0.949528180593036667195936074}, - {-0.949528180593036667195936074}, { 0.313681740398891476656478846}, - { 0.963776065795439866686464356}, { 0.266712757474898386325286515}, - {-0.266712757474898386325286515}, { 0.963776065795439866686464356}, - { 0.492898192229784036873026689}, { 0.870086991108711418652292404}, - {-0.870086991108711418652292404}, { 0.492898192229784036873026689}, - { 0.788346427626606262009164705}, { 0.615231590580626845484913563}, - {-0.615231590580626845484913563}, { 0.788346427626606262009164705}, - { 0.122410675199216198498704474}, { 0.992479534598709998156767252}, - {-0.992479534598709998156767252}, { 0.122410675199216198498704474}, - { 0.985277642388941244774018433}, { 0.170961888760301226363642357}, - {-0.170961888760301226363642357}, { 0.985277642388941244774018433}, - { 0.575808191417845300745972454}, { 0.817584813151583696504920884}, - {-0.817584813151583696504920884}, { 0.575808191417845300745972454}, - { 0.844853565249707073259571205}, { 0.534997619887097210663076905}, - {-0.534997619887097210663076905}, { 0.844853565249707073259571205}, - { 0.219101240156869797227737547}, { 0.975702130038528544460395766}, - {-0.975702130038528544460395766}, { 0.219101240156869797227737547}, - { 0.932992798834738887711660256}, { 0.359895036534988148775104572}, - {-0.359895036534988148775104572}, { 0.932992798834738887711660256}, - { 0.405241314004989870908481306}, { 0.914209755703530654635014829}, - {-0.914209755703530654635014829}, { 0.405241314004989870908481306}, - { 
0.724247082951466920941069243}, { 0.689540544737066924616730630}, - {-0.689540544737066924616730630}, { 0.724247082951466920941069243}, - { 0.024541228522912288031734529}, { 0.999698818696204220115765650}, - {-0.999698818696204220115765650}, { 0.024541228522912288031734529}, - { 0.999924701839144540921646491}, { 0.012271538285719926079408262}, - {-0.012271538285719926079408262}, { 0.999924701839144540921646491}, - { 0.698376249408972853554813503}, { 0.715730825283818654125532623}, - {-0.715730825283818654125532623}, { 0.698376249408972853554813503}, - { 0.919113851690057743908477789}, { 0.393992040061048108596188661}, - {-0.393992040061048108596188661}, { 0.919113851690057743908477789}, - { 0.371317193951837543411934967}, { 0.928506080473215565937167396}, - {-0.928506080473215565937167396}, { 0.371317193951837543411934967}, - { 0.978317370719627633106240097}, { 0.207111376192218549708116020}, - {-0.207111376192218549708116020}, { 0.978317370719627633106240097}, - { 0.545324988422046422313987347}, { 0.838224705554838043186996856}, - {-0.838224705554838043186996856}, { 0.545324988422046422313987347}, - { 0.824589302785025264474803737}, { 0.565731810783613197389765011}, - {-0.565731810783613197389765011}, { 0.824589302785025264474803737}, - { 0.183039887955140958516532578}, { 0.983105487431216327180301155}, - {-0.983105487431216327180301155}, { 0.183039887955140958516532578}, - { 0.993906970002356041546922813}, { 0.110222207293883058807899140}, - {-0.110222207293883058807899140}, { 0.993906970002356041546922813}, - { 0.624859488142386377084072816}, { 0.780737228572094478301588484}, - {-0.780737228572094478301588484}, { 0.624859488142386377084072816}, - { 0.876070094195406607095844268}, { 0.482183772079122748517344481}, - {-0.482183772079122748517344481}, { 0.876070094195406607095844268}, - { 0.278519689385053105207848526}, { 0.960430519415565811199035138}, - {-0.960430519415565811199035138}, { 0.278519689385053105207848526}, - { 0.953306040354193836916740383}, { 
0.302005949319228067003463232}, - {-0.302005949319228067003463232}, { 0.953306040354193836916740383}, - { 0.460538710958240023633181487}, { 0.887639620402853947760181617}, - {-0.887639620402853947760181617}, { 0.460538710958240023633181487}, - { 0.765167265622458925888815999}, { 0.643831542889791465068086063}, - {-0.643831542889791465068086063}, { 0.765167265622458925888815999}, - { 0.085797312344439890461556332}, { 0.996312612182778012627226190}, - {-0.996312612182778012627226190}, { 0.085797312344439890461556332}, - { 0.998118112900149207125155861}, { 0.061320736302208577782614593}, - {-0.061320736302208577782614593}, { 0.998118112900149207125155861}, - { 0.662415777590171761113069817}, { 0.749136394523459325469203257}, - {-0.749136394523459325469203257}, { 0.662415777590171761113069817}, - { 0.898674465693953843041976744}, { 0.438616238538527637647025738}, - {-0.438616238538527637647025738}, { 0.898674465693953843041976744}, - { 0.325310292162262934135954708}, { 0.945607325380521325730945387}, - {-0.945607325380521325730945387}, { 0.325310292162262934135954708}, - { 0.966976471044852109087220226}, { 0.254865659604514571553980779}, - {-0.254865659604514571553980779}, { 0.966976471044852109087220226}, - { 0.503538383725717558691867071}, { 0.863972856121586737918147054}, - {-0.863972856121586737918147054}, { 0.503538383725717558691867071}, - { 0.795836904608883536262791915}, { 0.605511041404325513920626941}, - {-0.605511041404325513920626941}, { 0.795836904608883536262791915}, - { 0.134580708507126186316358409}, { 0.990902635427780025108237011}, - {-0.990902635427780025108237011}, { 0.134580708507126186316358409}, - { 0.987301418157858382399815802}, { 0.158858143333861441684385360}, - {-0.158858143333861441684385360}, { 0.987301418157858382399815802}, - { 0.585797857456438860328080838}, { 0.810457198252594791726703434}, - {-0.810457198252594791726703434}, { 0.585797857456438860328080838}, - { 0.851355193105265142261290312}, { 0.524589682678468906215098464}, - 
{-0.524589682678468906215098464}, { 0.851355193105265142261290312}, - { 0.231058108280671119643236018}, { 0.972939952205560145467720114}, - {-0.972939952205560145467720114}, { 0.231058108280671119643236018}, - { 0.937339011912574923201899593}, { 0.348418680249434568419308588}, - {-0.348418680249434568419308588}, { 0.937339011912574923201899593}, - { 0.416429560097637182562598911}, { 0.909167983090522376563884788}, - {-0.909167983090522376563884788}, { 0.416429560097637182562598911}, - { 0.732654271672412834615546649}, { 0.680600997795453050594430464}, - {-0.680600997795453050594430464}, { 0.732654271672412834615546649}, - { 0.036807222941358832324332691}, { 0.999322384588349500896221011}, - {-0.999322384588349500896221011}, { 0.036807222941358832324332691}, - { 0.999322384588349500896221011}, { 0.036807222941358832324332691}, - {-0.036807222941358832324332691}, { 0.999322384588349500896221011}, - { 0.680600997795453050594430464}, { 0.732654271672412834615546649}, - {-0.732654271672412834615546649}, { 0.680600997795453050594430464}, - { 0.909167983090522376563884788}, { 0.416429560097637182562598911}, - {-0.416429560097637182562598911}, { 0.909167983090522376563884788}, - { 0.348418680249434568419308588}, { 0.937339011912574923201899593}, - {-0.937339011912574923201899593}, { 0.348418680249434568419308588}, - { 0.972939952205560145467720114}, { 0.231058108280671119643236018}, - {-0.231058108280671119643236018}, { 0.972939952205560145467720114}, - { 0.524589682678468906215098464}, { 0.851355193105265142261290312}, - {-0.851355193105265142261290312}, { 0.524589682678468906215098464}, - { 0.810457198252594791726703434}, { 0.585797857456438860328080838}, - {-0.585797857456438860328080838}, { 0.810457198252594791726703434}, - { 0.158858143333861441684385360}, { 0.987301418157858382399815802}, - {-0.987301418157858382399815802}, { 0.158858143333861441684385360}, - { 0.990902635427780025108237011}, { 0.134580708507126186316358409}, - {-0.134580708507126186316358409}, { 
0.990902635427780025108237011}, - { 0.605511041404325513920626941}, { 0.795836904608883536262791915}, - {-0.795836904608883536262791915}, { 0.605511041404325513920626941}, - { 0.863972856121586737918147054}, { 0.503538383725717558691867071}, - {-0.503538383725717558691867071}, { 0.863972856121586737918147054}, - { 0.254865659604514571553980779}, { 0.966976471044852109087220226}, - {-0.966976471044852109087220226}, { 0.254865659604514571553980779}, - { 0.945607325380521325730945387}, { 0.325310292162262934135954708}, - {-0.325310292162262934135954708}, { 0.945607325380521325730945387}, - { 0.438616238538527637647025738}, { 0.898674465693953843041976744}, - {-0.898674465693953843041976744}, { 0.438616238538527637647025738}, - { 0.749136394523459325469203257}, { 0.662415777590171761113069817}, - {-0.662415777590171761113069817}, { 0.749136394523459325469203257}, - { 0.061320736302208577782614593}, { 0.998118112900149207125155861}, - {-0.998118112900149207125155861}, { 0.061320736302208577782614593}, - { 0.996312612182778012627226190}, { 0.085797312344439890461556332}, - {-0.085797312344439890461556332}, { 0.996312612182778012627226190}, - { 0.643831542889791465068086063}, { 0.765167265622458925888815999}, - {-0.765167265622458925888815999}, { 0.643831542889791465068086063}, - { 0.887639620402853947760181617}, { 0.460538710958240023633181487}, - {-0.460538710958240023633181487}, { 0.887639620402853947760181617}, - { 0.302005949319228067003463232}, { 0.953306040354193836916740383}, - {-0.953306040354193836916740383}, { 0.302005949319228067003463232}, - { 0.960430519415565811199035138}, { 0.278519689385053105207848526}, - {-0.278519689385053105207848526}, { 0.960430519415565811199035138}, - { 0.482183772079122748517344481}, { 0.876070094195406607095844268}, - {-0.876070094195406607095844268}, { 0.482183772079122748517344481}, - { 0.780737228572094478301588484}, { 0.624859488142386377084072816}, - {-0.624859488142386377084072816}, { 0.780737228572094478301588484}, - { 
0.110222207293883058807899140}, { 0.993906970002356041546922813}, - {-0.993906970002356041546922813}, { 0.110222207293883058807899140}, - { 0.983105487431216327180301155}, { 0.183039887955140958516532578}, - {-0.183039887955140958516532578}, { 0.983105487431216327180301155}, - { 0.565731810783613197389765011}, { 0.824589302785025264474803737}, - {-0.824589302785025264474803737}, { 0.565731810783613197389765011}, - { 0.838224705554838043186996856}, { 0.545324988422046422313987347}, - {-0.545324988422046422313987347}, { 0.838224705554838043186996856}, - { 0.207111376192218549708116020}, { 0.978317370719627633106240097}, - {-0.978317370719627633106240097}, { 0.207111376192218549708116020}, - { 0.928506080473215565937167396}, { 0.371317193951837543411934967}, - {-0.371317193951837543411934967}, { 0.928506080473215565937167396}, - { 0.393992040061048108596188661}, { 0.919113851690057743908477789}, - {-0.919113851690057743908477789}, { 0.393992040061048108596188661}, - { 0.715730825283818654125532623}, { 0.698376249408972853554813503}, - {-0.698376249408972853554813503}, { 0.715730825283818654125532623}, - { 0.012271538285719926079408262}, { 0.999924701839144540921646491}, - {-0.999924701839144540921646491}, { 0.012271538285719926079408262}, - { 0.999981175282601142656990438}, { 0.006135884649154475359640235}, - {-0.006135884649154475359640235}, { 0.999981175282601142656990438}, - { 0.702754744457225302452914421}, { 0.711432195745216441522130290}, - {-0.711432195745216441522130290}, { 0.702754744457225302452914421}, - { 0.921514039342041943465396332}, { 0.388345046698826291624993541}, - {-0.388345046698826291624993541}, { 0.921514039342041943465396332}, - { 0.377007410216418256726567823}, { 0.926210242138311341974793388}, - {-0.926210242138311341974793388}, { 0.377007410216418256726567823}, - { 0.979569765685440534439326110}, { 0.201104634842091911558443546}, - {-0.201104634842091911558443546}, { 0.979569765685440534439326110}, - { 0.550457972936604802977289893}, { 
0.834862874986380056304401383}, - {-0.834862874986380056304401383}, { 0.550457972936604802977289893}, - { 0.828045045257755752067527592}, { 0.560661576197336023839710223}, - {-0.560661576197336023839710223}, { 0.828045045257755752067527592}, - { 0.189068664149806212754997837}, { 0.981963869109555264072848154}, - {-0.981963869109555264072848154}, { 0.189068664149806212754997837}, - { 0.994564570734255452119106243}, { 0.104121633872054579120943880}, - {-0.104121633872054579120943880}, { 0.994564570734255452119106243}, - { 0.629638238914927025372981341}, { 0.776888465673232450040827983}, - {-0.776888465673232450040827983}, { 0.629638238914927025372981341}, - { 0.879012226428633477831323711}, { 0.476799230063322133342158117}, - {-0.476799230063322133342158117}, { 0.879012226428633477831323711}, - { 0.284407537211271843618310615}, { 0.958703474895871555374645792}, - {-0.958703474895871555374645792}, { 0.284407537211271843618310615}, - { 0.955141168305770721498157712}, { 0.296150888243623824121786128}, - {-0.296150888243623824121786128}, { 0.955141168305770721498157712}, - { 0.465976495767966177902756065}, { 0.884797098430937780104007041}, - {-0.884797098430937780104007041}, { 0.465976495767966177902756065}, - { 0.769103337645579639346626069}, { 0.639124444863775743801488193}, - {-0.639124444863775743801488193}, { 0.769103337645579639346626069}, - { 0.091908956497132728624990979}, { 0.995767414467659793982495643}, - {-0.995767414467659793982495643}, { 0.091908956497132728624990979}, - { 0.998475580573294752208559038}, { 0.055195244349689939809447526}, - {-0.055195244349689939809447526}, { 0.998475580573294752208559038}, - { 0.666999922303637506650154222}, { 0.745057785441465962407907310}, - {-0.745057785441465962407907310}, { 0.666999922303637506650154222}, - { 0.901348847046022014570746093}, { 0.433093818853151968484222638}, - {-0.433093818853151968484222638}, { 0.901348847046022014570746093}, - { 0.331106305759876401737190737}, { 0.943593458161960361495301445}, - 
{-0.943593458161960361495301445}, { 0.331106305759876401737190737}, - { 0.968522094274417316221088329}, { 0.248927605745720168110682816}, - {-0.248927605745720168110682816}, { 0.968522094274417316221088329}, - { 0.508830142543107036931749324}, { 0.860866938637767279344583877}, - {-0.860866938637767279344583877}, { 0.508830142543107036931749324}, - { 0.799537269107905033500246232}, { 0.600616479383868926653875896}, - {-0.600616479383868926653875896}, { 0.799537269107905033500246232}, - { 0.140658239332849230714788846}, { 0.990058210262297105505906464}, - {-0.990058210262297105505906464}, { 0.140658239332849230714788846}, - { 0.988257567730749491404792538}, { 0.152797185258443427720336613}, - {-0.152797185258443427720336613}, { 0.988257567730749491404792538}, - { 0.590759701858874228423887908}, { 0.806847553543799272206514313}, - {-0.806847553543799272206514313}, { 0.590759701858874228423887908}, - { 0.854557988365400520767862276}, { 0.519355990165589587361829932}, - {-0.519355990165589587361829932}, { 0.854557988365400520767862276}, - { 0.237023605994367206867735915}, { 0.971503890986251775537099622}, - {-0.971503890986251775537099622}, { 0.237023605994367206867735915}, - { 0.939459223602189911962669246}, { 0.342660717311994397592781983}, - {-0.342660717311994397592781983}, { 0.939459223602189911962669246}, - { 0.422000270799799685941287941}, { 0.906595704514915365332960588}, - {-0.906595704514915365332960588}, { 0.422000270799799685941287941}, - { 0.736816568877369875090132520}, { 0.676092703575315960360419228}, - {-0.676092703575315960360419228}, { 0.736816568877369875090132520}, - { 0.042938256934940823077124540}, { 0.999077727752645382888781997}, - {-0.999077727752645382888781997}, { 0.042938256934940823077124540}, - { 0.999529417501093163079703322}, { 0.030674803176636625934021028}, - {-0.030674803176636625934021028}, { 0.999529417501093163079703322}, - { 0.685083667772700381362052545}, { 0.728464390448225196492035438}, - {-0.728464390448225196492035438}, { 
0.685083667772700381362052545}, - { 0.911706032005429851404397325}, { 0.410843171057903942183466675}, - {-0.410843171057903942183466675}, { 0.911706032005429851404397325}, - { 0.354163525420490382357395796}, { 0.935183509938947577642207480}, - {-0.935183509938947577642207480}, { 0.354163525420490382357395796}, - { 0.974339382785575860518721668}, { 0.225083911359792835991642120}, - {-0.225083911359792835991642120}, { 0.974339382785575860518721668}, - { 0.529803624686294668216054671}, { 0.848120344803297251279133563}, - {-0.848120344803297251279133563}, { 0.529803624686294668216054671}, - { 0.814036329705948361654516690}, { 0.580813958095764545075595272}, - {-0.580813958095764545075595272}, { 0.814036329705948361654516690}, - { 0.164913120489969921418189113}, { 0.986308097244598647863297524}, - {-0.986308097244598647863297524}, { 0.164913120489969921418189113}, - { 0.991709753669099522860049931}, { 0.128498110793793172624415589}, - {-0.128498110793793172624415589}, { 0.991709753669099522860049931}, - { 0.610382806276309452716352152}, { 0.792106577300212351782342879}, - {-0.792106577300212351782342879}, { 0.610382806276309452716352152}, - { 0.867046245515692651480195629}, { 0.498227666972781852410983869}, - {-0.498227666972781852410983869}, { 0.867046245515692651480195629}, - { 0.260794117915275518280186509}, { 0.965394441697689374550843858}, - {-0.965394441697689374550843858}, { 0.260794117915275518280186509}, - { 0.947585591017741134653387321}, { 0.319502030816015677901518272}, - {-0.319502030816015677901518272}, { 0.947585591017741134653387321}, - { 0.444122144570429231642069418}, { 0.895966249756185155914560282}, - {-0.895966249756185155914560282}, { 0.444122144570429231642069418}, - { 0.753186799043612482483430486}, { 0.657806693297078656931182264}, - {-0.657806693297078656931182264}, { 0.753186799043612482483430486}, - { 0.067443919563664057897972422}, { 0.997723066644191609848546728}, - {-0.997723066644191609848546728}, { 0.067443919563664057897972422}, - { 
0.996820299291165714972629398}, { 0.079682437971430121147120656}, - {-0.079682437971430121147120656}, { 0.996820299291165714972629398}, - { 0.648514401022112445084560551}, { 0.761202385484261814029709836}, - {-0.761202385484261814029709836}, { 0.648514401022112445084560551}, - { 0.890448723244757889952150560}, { 0.455083587126343823535869268}, - {-0.455083587126343823535869268}, { 0.890448723244757889952150560}, - { 0.307849640041534893682063646}, { 0.951435020969008369549175569}, - {-0.951435020969008369549175569}, { 0.307849640041534893682063646}, - { 0.962121404269041595429604316}, { 0.272621355449948984493347477}, - {-0.272621355449948984493347477}, { 0.962121404269041595429604316}, - { 0.487550160148435954641485027}, { 0.873094978418290098636085973}, - {-0.873094978418290098636085973}, { 0.487550160148435954641485027}, - { 0.784556597155575233023892575}, { 0.620057211763289178646268191}, - {-0.620057211763289178646268191}, { 0.784556597155575233023892575}, - { 0.116318630911904767252544319}, { 0.993211949234794533104601012}, - {-0.993211949234794533104601012}, { 0.116318630911904767252544319}, - { 0.984210092386929073193874387}, { 0.177004220412148756196839844}, - {-0.177004220412148756196839844}, { 0.984210092386929073193874387}, - { 0.570780745886967280232652864}, { 0.821102514991104679060430820}, - {-0.821102514991104679060430820}, { 0.570780745886967280232652864}, - { 0.841554977436898409603499520}, { 0.540171472729892881297845480}, - {-0.540171472729892881297845480}, { 0.841554977436898409603499520}, - { 0.213110319916091373967757518}, { 0.977028142657754351485866211}, - {-0.977028142657754351485866211}, { 0.213110319916091373967757518}, - { 0.930766961078983731944872340}, { 0.365612997804773870011745909}, - {-0.365612997804773870011745909}, { 0.930766961078983731944872340}, - { 0.399624199845646828544117031}, { 0.916679059921042663116457013}, - {-0.916679059921042663116457013}, { 0.399624199845646828544117031}, - { 0.720002507961381629076682999}, { 
0.693971460889654009003734389}, - {-0.693971460889654009003734389}, { 0.720002507961381629076682999}, - { 0.018406729905804820927366313}, { 0.999830581795823422015722275}, - {-0.999830581795823422015722275}, { 0.018406729905804820927366313}, - { 0.999830581795823422015722275}, { 0.018406729905804820927366313}, - {-0.018406729905804820927366313}, { 0.999830581795823422015722275}, - { 0.693971460889654009003734389}, { 0.720002507961381629076682999}, - {-0.720002507961381629076682999}, { 0.693971460889654009003734389}, - { 0.916679059921042663116457013}, { 0.399624199845646828544117031}, - {-0.399624199845646828544117031}, { 0.916679059921042663116457013}, - { 0.365612997804773870011745909}, { 0.930766961078983731944872340}, - {-0.930766961078983731944872340}, { 0.365612997804773870011745909}, - { 0.977028142657754351485866211}, { 0.213110319916091373967757518}, - {-0.213110319916091373967757518}, { 0.977028142657754351485866211}, - { 0.540171472729892881297845480}, { 0.841554977436898409603499520}, - {-0.841554977436898409603499520}, { 0.540171472729892881297845480}, - { 0.821102514991104679060430820}, { 0.570780745886967280232652864}, - {-0.570780745886967280232652864}, { 0.821102514991104679060430820}, - { 0.177004220412148756196839844}, { 0.984210092386929073193874387}, - {-0.984210092386929073193874387}, { 0.177004220412148756196839844}, - { 0.993211949234794533104601012}, { 0.116318630911904767252544319}, - {-0.116318630911904767252544319}, { 0.993211949234794533104601012}, - { 0.620057211763289178646268191}, { 0.784556597155575233023892575}, - {-0.784556597155575233023892575}, { 0.620057211763289178646268191}, - { 0.873094978418290098636085973}, { 0.487550160148435954641485027}, - {-0.487550160148435954641485027}, { 0.873094978418290098636085973}, - { 0.272621355449948984493347477}, { 0.962121404269041595429604316}, - {-0.962121404269041595429604316}, { 0.272621355449948984493347477}, - { 0.951435020969008369549175569}, { 0.307849640041534893682063646}, - 
{-0.307849640041534893682063646}, { 0.951435020969008369549175569}, - { 0.455083587126343823535869268}, { 0.890448723244757889952150560}, - {-0.890448723244757889952150560}, { 0.455083587126343823535869268}, - { 0.761202385484261814029709836}, { 0.648514401022112445084560551}, - {-0.648514401022112445084560551}, { 0.761202385484261814029709836}, - { 0.079682437971430121147120656}, { 0.996820299291165714972629398}, - {-0.996820299291165714972629398}, { 0.079682437971430121147120656}, - { 0.997723066644191609848546728}, { 0.067443919563664057897972422}, - {-0.067443919563664057897972422}, { 0.997723066644191609848546728}, - { 0.657806693297078656931182264}, { 0.753186799043612482483430486}, - {-0.753186799043612482483430486}, { 0.657806693297078656931182264}, - { 0.895966249756185155914560282}, { 0.444122144570429231642069418}, - {-0.444122144570429231642069418}, { 0.895966249756185155914560282}, - { 0.319502030816015677901518272}, { 0.947585591017741134653387321}, - {-0.947585591017741134653387321}, { 0.319502030816015677901518272}, - { 0.965394441697689374550843858}, { 0.260794117915275518280186509}, - {-0.260794117915275518280186509}, { 0.965394441697689374550843858}, - { 0.498227666972781852410983869}, { 0.867046245515692651480195629}, - {-0.867046245515692651480195629}, { 0.498227666972781852410983869}, - { 0.792106577300212351782342879}, { 0.610382806276309452716352152}, - {-0.610382806276309452716352152}, { 0.792106577300212351782342879}, - { 0.128498110793793172624415589}, { 0.991709753669099522860049931}, - {-0.991709753669099522860049931}, { 0.128498110793793172624415589}, - { 0.986308097244598647863297524}, { 0.164913120489969921418189113}, - {-0.164913120489969921418189113}, { 0.986308097244598647863297524}, - { 0.580813958095764545075595272}, { 0.814036329705948361654516690}, - {-0.814036329705948361654516690}, { 0.580813958095764545075595272}, - { 0.848120344803297251279133563}, { 0.529803624686294668216054671}, - {-0.529803624686294668216054671}, { 
0.848120344803297251279133563}, - { 0.225083911359792835991642120}, { 0.974339382785575860518721668}, - {-0.974339382785575860518721668}, { 0.225083911359792835991642120}, - { 0.935183509938947577642207480}, { 0.354163525420490382357395796}, - {-0.354163525420490382357395796}, { 0.935183509938947577642207480}, - { 0.410843171057903942183466675}, { 0.911706032005429851404397325}, - {-0.911706032005429851404397325}, { 0.410843171057903942183466675}, - { 0.728464390448225196492035438}, { 0.685083667772700381362052545}, - {-0.685083667772700381362052545}, { 0.728464390448225196492035438}, - { 0.030674803176636625934021028}, { 0.999529417501093163079703322}, - {-0.999529417501093163079703322}, { 0.030674803176636625934021028}, - { 0.999077727752645382888781997}, { 0.042938256934940823077124540}, - {-0.042938256934940823077124540}, { 0.999077727752645382888781997}, - { 0.676092703575315960360419228}, { 0.736816568877369875090132520}, - {-0.736816568877369875090132520}, { 0.676092703575315960360419228}, - { 0.906595704514915365332960588}, { 0.422000270799799685941287941}, - {-0.422000270799799685941287941}, { 0.906595704514915365332960588}, - { 0.342660717311994397592781983}, { 0.939459223602189911962669246}, - {-0.939459223602189911962669246}, { 0.342660717311994397592781983}, - { 0.971503890986251775537099622}, { 0.237023605994367206867735915}, - {-0.237023605994367206867735915}, { 0.971503890986251775537099622}, - { 0.519355990165589587361829932}, { 0.854557988365400520767862276}, - {-0.854557988365400520767862276}, { 0.519355990165589587361829932}, - { 0.806847553543799272206514313}, { 0.590759701858874228423887908}, - {-0.590759701858874228423887908}, { 0.806847553543799272206514313}, - { 0.152797185258443427720336613}, { 0.988257567730749491404792538}, - {-0.988257567730749491404792538}, { 0.152797185258443427720336613}, - { 0.990058210262297105505906464}, { 0.140658239332849230714788846}, - {-0.140658239332849230714788846}, { 0.990058210262297105505906464}, - { 
0.600616479383868926653875896}, { 0.799537269107905033500246232}, - {-0.799537269107905033500246232}, { 0.600616479383868926653875896}, - { 0.860866938637767279344583877}, { 0.508830142543107036931749324}, - {-0.508830142543107036931749324}, { 0.860866938637767279344583877}, - { 0.248927605745720168110682816}, { 0.968522094274417316221088329}, - {-0.968522094274417316221088329}, { 0.248927605745720168110682816}, - { 0.943593458161960361495301445}, { 0.331106305759876401737190737}, - {-0.331106305759876401737190737}, { 0.943593458161960361495301445}, - { 0.433093818853151968484222638}, { 0.901348847046022014570746093}, - {-0.901348847046022014570746093}, { 0.433093818853151968484222638}, - { 0.745057785441465962407907310}, { 0.666999922303637506650154222}, - {-0.666999922303637506650154222}, { 0.745057785441465962407907310}, - { 0.055195244349689939809447526}, { 0.998475580573294752208559038}, - {-0.998475580573294752208559038}, { 0.055195244349689939809447526}, - { 0.995767414467659793982495643}, { 0.091908956497132728624990979}, - {-0.091908956497132728624990979}, { 0.995767414467659793982495643}, - { 0.639124444863775743801488193}, { 0.769103337645579639346626069}, - {-0.769103337645579639346626069}, { 0.639124444863775743801488193}, - { 0.884797098430937780104007041}, { 0.465976495767966177902756065}, - {-0.465976495767966177902756065}, { 0.884797098430937780104007041}, - { 0.296150888243623824121786128}, { 0.955141168305770721498157712}, - {-0.955141168305770721498157712}, { 0.296150888243623824121786128}, - { 0.958703474895871555374645792}, { 0.284407537211271843618310615}, - {-0.284407537211271843618310615}, { 0.958703474895871555374645792}, - { 0.476799230063322133342158117}, { 0.879012226428633477831323711}, - {-0.879012226428633477831323711}, { 0.476799230063322133342158117}, - { 0.776888465673232450040827983}, { 0.629638238914927025372981341}, - {-0.629638238914927025372981341}, { 0.776888465673232450040827983}, - { 0.104121633872054579120943880}, { 
0.994564570734255452119106243}, - {-0.994564570734255452119106243}, { 0.104121633872054579120943880}, - { 0.981963869109555264072848154}, { 0.189068664149806212754997837}, - {-0.189068664149806212754997837}, { 0.981963869109555264072848154}, - { 0.560661576197336023839710223}, { 0.828045045257755752067527592}, - {-0.828045045257755752067527592}, { 0.560661576197336023839710223}, - { 0.834862874986380056304401383}, { 0.550457972936604802977289893}, - {-0.550457972936604802977289893}, { 0.834862874986380056304401383}, - { 0.201104634842091911558443546}, { 0.979569765685440534439326110}, - {-0.979569765685440534439326110}, { 0.201104634842091911558443546}, - { 0.926210242138311341974793388}, { 0.377007410216418256726567823}, - {-0.377007410216418256726567823}, { 0.926210242138311341974793388}, - { 0.388345046698826291624993541}, { 0.921514039342041943465396332}, - {-0.921514039342041943465396332}, { 0.388345046698826291624993541}, - { 0.711432195745216441522130290}, { 0.702754744457225302452914421}, - {-0.702754744457225302452914421}, { 0.711432195745216441522130290}, - { 0.006135884649154475359640235}, { 0.999981175282601142656990438}, - {-0.999981175282601142656990438}, { 0.006135884649154475359640235}, - { 0.999995293809576171511580126}, { 0.003067956762965976270145365}, - {-0.003067956762965976270145365}, { 0.999995293809576171511580126}, - { 0.704934080375904908852523758}, { 0.709272826438865651316533772}, - {-0.709272826438865651316533772}, { 0.704934080375904908852523758}, - { 0.922701128333878570437264227}, { 0.385516053843918864075607949}, - {-0.385516053843918864075607949}, { 0.922701128333878570437264227}, - { 0.379847208924051170576281147}, { 0.925049240782677590302371869}, - {-0.925049240782677590302371869}, { 0.379847208924051170576281147}, - { 0.980182135968117392690210009}, { 0.198098410717953586179324918}, - {-0.198098410717953586179324918}, { 0.980182135968117392690210009}, - { 0.553016705580027531764226988}, { 0.833170164701913186439915922}, - 
{-0.833170164701913186439915922}, { 0.553016705580027531764226988}, - { 0.829761233794523042469023765}, { 0.558118531220556115693702964}, - {-0.558118531220556115693702964}, { 0.829761233794523042469023765}, - { 0.192080397049892441679288205}, { 0.981379193313754574318224190}, - {-0.981379193313754574318224190}, { 0.192080397049892441679288205}, - { 0.994879330794805620591166107}, { 0.101069862754827824987887585}, - {-0.101069862754827824987887585}, { 0.994879330794805620591166107}, - { 0.632018735939809021909403706}, { 0.774953106594873878359129282}, - {-0.774953106594873878359129282}, { 0.632018735939809021909403706}, - { 0.880470889052160770806542929}, { 0.474100214650550014398580015}, - {-0.474100214650550014398580015}, { 0.880470889052160770806542929}, - { 0.287347459544729526477331841}, { 0.957826413027532890321037029}, - {-0.957826413027532890321037029}, { 0.287347459544729526477331841}, - { 0.956045251349996443270479823}, { 0.293219162694258650606608599}, - {-0.293219162694258650606608599}, { 0.956045251349996443270479823}, - { 0.468688822035827933697617870}, { 0.883363338665731594736308015}, - {-0.883363338665731594736308015}, { 0.468688822035827933697617870}, - { 0.771060524261813773200605759}, { 0.636761861236284230413943435}, - {-0.636761861236284230413943435}, { 0.771060524261813773200605759}, - { 0.094963495329638998938034312}, { 0.995480755491926941769171600}, - {-0.995480755491926941769171600}, { 0.094963495329638998938034312}, - { 0.998640218180265222418199049}, { 0.052131704680283321236358216}, - {-0.052131704680283321236358216}, { 0.998640218180265222418199049}, - { 0.669282588346636065720696366}, { 0.743007952135121693517362293}, - {-0.743007952135121693517362293}, { 0.669282588346636065720696366}, - { 0.902673318237258806751502391}, { 0.430326481340082633908199031}, - {-0.430326481340082633908199031}, { 0.902673318237258806751502391}, - { 0.333999651442009404650865481}, { 0.942573197601446879280758735}, - {-0.942573197601446879280758735}, { 
0.333999651442009404650865481}, - { 0.969281235356548486048290738}, { 0.245955050335794611599924709}, - {-0.245955050335794611599924709}, { 0.969281235356548486048290738}, - { 0.511468850437970399504391001}, { 0.859301818357008404783582139}, - {-0.859301818357008404783582139}, { 0.511468850437970399504391001}, - { 0.801376171723140219430247777}, { 0.598160706996342311724958652}, - {-0.598160706996342311724958652}, { 0.801376171723140219430247777}, - { 0.143695033150294454819773349}, { 0.989622017463200834623694454}, - {-0.989622017463200834623694454}, { 0.143695033150294454819773349}, - { 0.988721691960323767604516485}, { 0.149764534677321517229695737}, - {-0.149764534677321517229695737}, { 0.988721691960323767604516485}, - { 0.593232295039799808047809426}, { 0.805031331142963597922659282}, - {-0.805031331142963597922659282}, { 0.593232295039799808047809426}, - { 0.856147328375194481019630732}, { 0.516731799017649881508753876}, - {-0.516731799017649881508753876}, { 0.856147328375194481019630732}, - { 0.240003022448741486568922365}, { 0.970772140728950302138169611}, - {-0.970772140728950302138169611}, { 0.240003022448741486568922365}, - { 0.940506070593268323787291309}, { 0.339776884406826857828825803}, - {-0.339776884406826857828825803}, { 0.940506070593268323787291309}, - { 0.424779681209108833357226189}, { 0.905296759318118774354048329}, - {-0.905296759318118774354048329}, { 0.424779681209108833357226189}, - { 0.738887324460615147933116508}, { 0.673829000378756060917568372}, - {-0.673829000378756060917568372}, { 0.738887324460615147933116508}, - { 0.046003182130914628814301788}, { 0.998941293186856850633930266}, - {-0.998941293186856850633930266}, { 0.046003182130914628814301788}, - { 0.999618822495178597116830637}, { 0.027608145778965741612354872}, - {-0.027608145778965741612354872}, { 0.999618822495178597116830637}, - { 0.687315340891759108199186948}, { 0.726359155084345976817494315}, - {-0.726359155084345976817494315}, { 0.687315340891759108199186948}, - { 
0.912962190428398164628018233}, { 0.408044162864978680820747499}, - {-0.408044162864978680820747499}, { 0.912962190428398164628018233}, - { 0.357030961233430032614954036}, { 0.934092550404258914729877883}, - {-0.934092550404258914729877883}, { 0.357030961233430032614954036}, - { 0.975025345066994146844913468}, { 0.222093620973203534094094721}, - {-0.222093620973203534094094721}, { 0.975025345066994146844913468}, - { 0.532403127877197971442805218}, { 0.846490938774052078300544488}, - {-0.846490938774052078300544488}, { 0.532403127877197971442805218}, - { 0.815814410806733789010772660}, { 0.578313796411655563342245019}, - {-0.578313796411655563342245019}, { 0.815814410806733789010772660}, - { 0.167938294974731178054745536}, { 0.985797509167567424700995000}, - {-0.985797509167567424700995000}, { 0.167938294974731178054745536}, - { 0.992099313142191757112085445}, { 0.125454983411546238542336453}, - {-0.125454983411546238542336453}, { 0.992099313142191757112085445}, - { 0.612810082429409703935211936}, { 0.790230221437310055030217152}, - {-0.790230221437310055030217152}, { 0.612810082429409703935211936}, - { 0.868570705971340895340449876}, { 0.495565261825772531150266670}, - {-0.495565261825772531150266670}, { 0.868570705971340895340449876}, - { 0.263754678974831383611349322}, { 0.964589793289812723836432159}, - {-0.964589793289812723836432159}, { 0.263754678974831383611349322}, - { 0.948561349915730288158494826}, { 0.316593375556165867243047035}, - {-0.316593375556165867243047035}, { 0.948561349915730288158494826}, - { 0.446868840162374195353044389}, { 0.894599485631382678433072126}, - {-0.894599485631382678433072126}, { 0.446868840162374195353044389}, - { 0.755201376896536527598710756}, { 0.655492852999615385312679701}, - {-0.655492852999615385312679701}, { 0.755201376896536527598710756}, - { 0.070504573389613863027351471}, { 0.997511456140303459699448390}, - {-0.997511456140303459699448390}, { 0.070504573389613863027351471}, - { 0.997060070339482978987989949}, { 
0.076623861392031492278332463}, - {-0.076623861392031492278332463}, { 0.997060070339482978987989949}, - { 0.650846684996380915068975573}, { 0.759209188978388033485525443}, - {-0.759209188978388033485525443}, { 0.650846684996380915068975573}, - { 0.891840709392342727796478697}, { 0.452349587233770874133026703}, - {-0.452349587233770874133026703}, { 0.891840709392342727796478697}, - { 0.310767152749611495835997250}, { 0.950486073949481721759926101}, - {-0.950486073949481721759926101}, { 0.310767152749611495835997250}, - { 0.962953266873683886347921481}, { 0.269668325572915106525464462}, - {-0.269668325572915106525464462}, { 0.962953266873683886347921481}, - { 0.490226483288291154229598449}, { 0.871595086655951034842481435}, - {-0.871595086655951034842481435}, { 0.490226483288291154229598449}, - { 0.786455213599085757522319464}, { 0.617647307937803932403979402}, - {-0.617647307937803932403979402}, { 0.786455213599085757522319464}, - { 0.119365214810991364593637790}, { 0.992850414459865090793563344}, - {-0.992850414459865090793563344}, { 0.119365214810991364593637790}, - { 0.984748501801904218556553176}, { 0.173983873387463827950700807}, - {-0.173983873387463827950700807}, { 0.984748501801904218556553176}, - { 0.573297166698042212820171239}, { 0.819347520076796960824689637}, - {-0.819347520076796960824689637}, { 0.573297166698042212820171239}, - { 0.843208239641845437161743865}, { 0.537587076295645482502214932}, - {-0.537587076295645482502214932}, { 0.843208239641845437161743865}, - { 0.216106797076219509948385131}, { 0.976369731330021149312732194}, - {-0.976369731330021149312732194}, { 0.216106797076219509948385131}, - { 0.931884265581668106718557199}, { 0.362755724367397216204854462}, - {-0.362755724367397216204854462}, { 0.931884265581668106718557199}, - { 0.402434650859418441082533934}, { 0.915448716088267819566431292}, - {-0.915448716088267819566431292}, { 0.402434650859418441082533934}, - { 0.722128193929215321243607198}, { 0.691759258364157774906734132}, - 
{-0.691759258364157774906734132}, { 0.722128193929215321243607198}, - { 0.021474080275469507418374898}, { 0.999769405351215321657617036}, - {-0.999769405351215321657617036}, { 0.021474080275469507418374898}, - { 0.999882347454212525633049627}, { 0.015339206284988101044151868}, - {-0.015339206284988101044151868}, { 0.999882347454212525633049627}, - { 0.696177131491462944788582591}, { 0.717870045055731736211325329}, - {-0.717870045055731736211325329}, { 0.696177131491462944788582591}, - { 0.917900775621390457642276297}, { 0.396809987416710328595290911}, - {-0.396809987416710328595290911}, { 0.917900775621390457642276297}, - { 0.368466829953372331712746222}, { 0.929640895843181265457918066}, - {-0.929640895843181265457918066}, { 0.368466829953372331712746222}, - { 0.977677357824509979943404762}, { 0.210111836880469621717489972}, - {-0.210111836880469621717489972}, { 0.977677357824509979943404762}, - { 0.542750784864515906586768661}, { 0.839893794195999504583383987}, - {-0.839893794195999504583383987}, { 0.542750784864515906586768661}, - { 0.822849781375826332046780034}, { 0.568258952670131549790548489}, - {-0.568258952670131549790548489}, { 0.822849781375826332046780034}, - { 0.180022901405699522679906590}, { 0.983662419211730274396237776}, - {-0.983662419211730274396237776}, { 0.180022901405699522679906590}, - { 0.993564135520595333782021697}, { 0.113270952177564349018228733}, - {-0.113270952177564349018228733}, { 0.993564135520595333782021697}, - { 0.622461279374149972519166721}, { 0.782650596166575738458949301}, - {-0.782650596166575738458949301}, { 0.622461279374149972519166721}, - { 0.874586652278176112634431897}, { 0.484869248000791101822951699}, - {-0.484869248000791101822951699}, { 0.874586652278176112634431897}, - { 0.275571819310958163076425168}, { 0.961280485811320641748659653}, - {-0.961280485811320641748659653}, { 0.275571819310958163076425168}, - { 0.952375012719765858529893608}, { 0.304929229735402406490728633}, - {-0.304929229735402406490728633}, { 
0.952375012719765858529893608}, - { 0.457813303598877221904961155}, { 0.889048355854664562540777729}, - {-0.889048355854664562540777729}, { 0.457813303598877221904961155}, - { 0.763188417263381271704838297}, { 0.646176012983316364832802220}, - {-0.646176012983316364832802220}, { 0.763188417263381271704838297}, - { 0.082740264549375693111987083}, { 0.996571145790554847093566910}, - {-0.996571145790554847093566910}, { 0.082740264549375693111987083}, - { 0.997925286198596012623025462}, { 0.064382630929857460819324537}, - {-0.064382630929857460819324537}, { 0.997925286198596012623025462}, - { 0.660114342067420478559490747}, { 0.751165131909686411205819422}, - {-0.751165131909686411205819422}, { 0.660114342067420478559490747}, - { 0.897324580705418281231391836}, { 0.441371268731716692879988968}, - {-0.441371268731716692879988968}, { 0.897324580705418281231391836}, - { 0.322407678801069848384807478}, { 0.946600913083283570044599823}, - {-0.946600913083283570044599823}, { 0.322407678801069848384807478}, - { 0.966190003445412555433832961}, { 0.257831102162159005614471295}, - {-0.257831102162159005614471295}, { 0.966190003445412555433832961}, - { 0.500885382611240786241285004}, { 0.865513624090569082825488358}, - {-0.865513624090569082825488358}, { 0.500885382611240786241285004}, - { 0.793975477554337164895083757}, { 0.607949784967773667243642671}, - {-0.607949784967773667243642671}, { 0.793975477554337164895083757}, - { 0.131540028702883111103387493}, { 0.991310859846115418957349799}, - {-0.991310859846115418957349799}, { 0.131540028702883111103387493}, - { 0.986809401814185476970235952}, { 0.161886393780111837641387995}, - {-0.161886393780111837641387995}, { 0.986809401814185476970235952}, - { 0.583308652937698294392830961}, { 0.812250586585203913049744181}, - {-0.812250586585203913049744181}, { 0.583308652937698294392830961}, - { 0.849741768000852489471268395}, { 0.527199134781901348464274575}, - {-0.527199134781901348464274575}, { 0.849741768000852489471268395}, - { 
0.228072083170885739254457379}, { 0.973644249650811925318383912}, - {-0.973644249650811925318383912}, { 0.228072083170885739254457379}, - { 0.936265667170278246576310996}, { 0.351292756085567125601307623}, - {-0.351292756085567125601307623}, { 0.936265667170278246576310996}, - { 0.413638312238434547471944324}, { 0.910441292258067196934095369}, - {-0.910441292258067196934095369}, { 0.413638312238434547471944324}, - { 0.730562769227827561177758850}, { 0.682845546385248068164596123}, - {-0.682845546385248068164596123}, { 0.730562769227827561177758850}, - { 0.033741171851377584833716112}, { 0.999430604555461772019008327}, - {-0.999430604555461772019008327}, { 0.033741171851377584833716112}, - { 0.999204758618363895492950001}, { 0.039872927587739811128578738}, - {-0.039872927587739811128578738}, { 0.999204758618363895492950001}, - { 0.678350043129861486873655042}, { 0.734738878095963464563223604}, - {-0.734738878095963464563223604}, { 0.678350043129861486873655042}, - { 0.907886116487666212038681480}, { 0.419216888363223956433010020}, - {-0.419216888363223956433010020}, { 0.907886116487666212038681480}, - { 0.345541324963989065539191723}, { 0.938403534063108112192420774}, - {-0.938403534063108112192420774}, { 0.345541324963989065539191723}, - { 0.972226497078936305708321144}, { 0.234041958583543423191242045}, - {-0.234041958583543423191242045}, { 0.972226497078936305708321144}, - { 0.521975292937154342694258318}, { 0.852960604930363657746588082}, - {-0.852960604930363657746588082}, { 0.521975292937154342694258318}, - { 0.808656181588174991946968128}, { 0.588281548222645304786439813}, - {-0.588281548222645304786439813}, { 0.808656181588174991946968128}, - { 0.155828397654265235743101486}, { 0.987784141644572154230969032}, - {-0.987784141644572154230969032}, { 0.155828397654265235743101486}, - { 0.990485084256457037998682243}, { 0.137620121586486044948441663}, - {-0.137620121586486044948441663}, { 0.990485084256457037998682243}, - { 0.603066598540348201693430617}, { 
0.797690840943391108362662755}, - {-0.797690840943391108362662755}, { 0.603066598540348201693430617}, - { 0.862423956111040538690933878}, { 0.506186645345155291048942344}, - {-0.506186645345155291048942344}, { 0.862423956111040538690933878}, - { 0.251897818154216950498106628}, { 0.967753837093475465243391912}, - {-0.967753837093475465243391912}, { 0.251897818154216950498106628}, - { 0.944604837261480265659265493}, { 0.328209843579092526107916817}, - {-0.328209843579092526107916817}, { 0.944604837261480265659265493}, - { 0.435857079922255491032544080}, { 0.900015892016160228714535267}, - {-0.900015892016160228714535267}, { 0.435857079922255491032544080}, - { 0.747100605980180144323078847}, { 0.664710978203344868130324985}, - {-0.664710978203344868130324985}, { 0.747100605980180144323078847}, - { 0.058258264500435759613979782}, { 0.998301544933892840738782163}, - {-0.998301544933892840738782163}, { 0.058258264500435759613979782}, - { 0.996044700901251989887944810}, { 0.088853552582524596561586535}, - {-0.088853552582524596561586535}, { 0.996044700901251989887944810}, - { 0.641481012808583151988739898}, { 0.767138911935820381181694573}, - {-0.767138911935820381181694573}, { 0.641481012808583151988739898}, - { 0.886222530148880631647990821}, { 0.463259783551860197390719637}, - {-0.463259783551860197390719637}, { 0.886222530148880631647990821}, - { 0.299079826308040476750336973}, { 0.954228095109105629780430732}, - {-0.954228095109105629780430732}, { 0.299079826308040476750336973}, - { 0.959571513081984528335528181}, { 0.281464937925757984095231007}, - {-0.281464937925757984095231007}, { 0.959571513081984528335528181}, - { 0.479493757660153026679839798}, { 0.877545290207261291668470750}, - {-0.877545290207261291668470750}, { 0.479493757660153026679839798}, - { 0.778816512381475953374724325}, { 0.627251815495144113509622565}, - {-0.627251815495144113509622565}, { 0.778816512381475953374724325}, - { 0.107172424956808849175529148}, { 0.994240449453187946358413442}, - 
{-0.994240449453187946358413442}, { 0.107172424956808849175529148}, - { 0.982539302287441255907040396}, { 0.186055151663446648105438304}, - {-0.186055151663446648105438304}, { 0.982539302287441255907040396}, - { 0.563199344013834115007363772}, { 0.826321062845663480311195452}, - {-0.826321062845663480311195452}, { 0.563199344013834115007363772}, - { 0.836547727223511984524285790}, { 0.547894059173100165608820571}, - {-0.547894059173100165608820571}, { 0.836547727223511984524285790}, - { 0.204108966092816874181696950}, { 0.978948175319062194715480124}, - {-0.978948175319062194715480124}, { 0.204108966092816874181696950}, - { 0.927362525650401087274536959}, { 0.374164062971457997104393020}, - {-0.374164062971457997104393020}, { 0.927362525650401087274536959}, - { 0.391170384302253888687512949}, { 0.920318276709110566440076541}, - {-0.920318276709110566440076541}, { 0.391170384302253888687512949}, - { 0.713584868780793592903125099}, { 0.700568793943248366792866380}, - {-0.700568793943248366792866380}, { 0.713584868780793592903125099}, - { 0.009203754782059819315102378}, { 0.999957644551963866333120920}, - {-0.999957644551963866333120920}, { 0.009203754782059819315102378}, - { 0.999957644551963866333120920}, { 0.009203754782059819315102378}, - {-0.009203754782059819315102378}, { 0.999957644551963866333120920}, - { 0.700568793943248366792866380}, { 0.713584868780793592903125099}, - {-0.713584868780793592903125099}, { 0.700568793943248366792866380}, - { 0.920318276709110566440076541}, { 0.391170384302253888687512949}, - {-0.391170384302253888687512949}, { 0.920318276709110566440076541}, - { 0.374164062971457997104393020}, { 0.927362525650401087274536959}, - {-0.927362525650401087274536959}, { 0.374164062971457997104393020}, - { 0.978948175319062194715480124}, { 0.204108966092816874181696950}, - {-0.204108966092816874181696950}, { 0.978948175319062194715480124}, - { 0.547894059173100165608820571}, { 0.836547727223511984524285790}, - {-0.836547727223511984524285790}, { 
0.547894059173100165608820571}, - { 0.826321062845663480311195452}, { 0.563199344013834115007363772}, - {-0.563199344013834115007363772}, { 0.826321062845663480311195452}, - { 0.186055151663446648105438304}, { 0.982539302287441255907040396}, - {-0.982539302287441255907040396}, { 0.186055151663446648105438304}, - { 0.994240449453187946358413442}, { 0.107172424956808849175529148}, - {-0.107172424956808849175529148}, { 0.994240449453187946358413442}, - { 0.627251815495144113509622565}, { 0.778816512381475953374724325}, - {-0.778816512381475953374724325}, { 0.627251815495144113509622565}, - { 0.877545290207261291668470750}, { 0.479493757660153026679839798}, - {-0.479493757660153026679839798}, { 0.877545290207261291668470750}, - { 0.281464937925757984095231007}, { 0.959571513081984528335528181}, - {-0.959571513081984528335528181}, { 0.281464937925757984095231007}, - { 0.954228095109105629780430732}, { 0.299079826308040476750336973}, - {-0.299079826308040476750336973}, { 0.954228095109105629780430732}, - { 0.463259783551860197390719637}, { 0.886222530148880631647990821}, - {-0.886222530148880631647990821}, { 0.463259783551860197390719637}, - { 0.767138911935820381181694573}, { 0.641481012808583151988739898}, - {-0.641481012808583151988739898}, { 0.767138911935820381181694573}, - { 0.088853552582524596561586535}, { 0.996044700901251989887944810}, - {-0.996044700901251989887944810}, { 0.088853552582524596561586535}, - { 0.998301544933892840738782163}, { 0.058258264500435759613979782}, - {-0.058258264500435759613979782}, { 0.998301544933892840738782163}, - { 0.664710978203344868130324985}, { 0.747100605980180144323078847}, - {-0.747100605980180144323078847}, { 0.664710978203344868130324985}, - { 0.900015892016160228714535267}, { 0.435857079922255491032544080}, - {-0.435857079922255491032544080}, { 0.900015892016160228714535267}, - { 0.328209843579092526107916817}, { 0.944604837261480265659265493}, - {-0.944604837261480265659265493}, { 0.328209843579092526107916817}, - { 
0.967753837093475465243391912}, { 0.251897818154216950498106628}, - {-0.251897818154216950498106628}, { 0.967753837093475465243391912}, - { 0.506186645345155291048942344}, { 0.862423956111040538690933878}, - {-0.862423956111040538690933878}, { 0.506186645345155291048942344}, - { 0.797690840943391108362662755}, { 0.603066598540348201693430617}, - {-0.603066598540348201693430617}, { 0.797690840943391108362662755}, - { 0.137620121586486044948441663}, { 0.990485084256457037998682243}, - {-0.990485084256457037998682243}, { 0.137620121586486044948441663}, - { 0.987784141644572154230969032}, { 0.155828397654265235743101486}, - {-0.155828397654265235743101486}, { 0.987784141644572154230969032}, - { 0.588281548222645304786439813}, { 0.808656181588174991946968128}, - {-0.808656181588174991946968128}, { 0.588281548222645304786439813}, - { 0.852960604930363657746588082}, { 0.521975292937154342694258318}, - {-0.521975292937154342694258318}, { 0.852960604930363657746588082}, - { 0.234041958583543423191242045}, { 0.972226497078936305708321144}, - {-0.972226497078936305708321144}, { 0.234041958583543423191242045}, - { 0.938403534063108112192420774}, { 0.345541324963989065539191723}, - {-0.345541324963989065539191723}, { 0.938403534063108112192420774}, - { 0.419216888363223956433010020}, { 0.907886116487666212038681480}, - {-0.907886116487666212038681480}, { 0.419216888363223956433010020}, - { 0.734738878095963464563223604}, { 0.678350043129861486873655042}, - {-0.678350043129861486873655042}, { 0.734738878095963464563223604}, - { 0.039872927587739811128578738}, { 0.999204758618363895492950001}, - {-0.999204758618363895492950001}, { 0.039872927587739811128578738}, - { 0.999430604555461772019008327}, { 0.033741171851377584833716112}, - {-0.033741171851377584833716112}, { 0.999430604555461772019008327}, - { 0.682845546385248068164596123}, { 0.730562769227827561177758850}, - {-0.730562769227827561177758850}, { 0.682845546385248068164596123}, - { 0.910441292258067196934095369}, { 
0.413638312238434547471944324}, - {-0.413638312238434547471944324}, { 0.910441292258067196934095369}, - { 0.351292756085567125601307623}, { 0.936265667170278246576310996}, - {-0.936265667170278246576310996}, { 0.351292756085567125601307623}, - { 0.973644249650811925318383912}, { 0.228072083170885739254457379}, - {-0.228072083170885739254457379}, { 0.973644249650811925318383912}, - { 0.527199134781901348464274575}, { 0.849741768000852489471268395}, - {-0.849741768000852489471268395}, { 0.527199134781901348464274575}, - { 0.812250586585203913049744181}, { 0.583308652937698294392830961}, - {-0.583308652937698294392830961}, { 0.812250586585203913049744181}, - { 0.161886393780111837641387995}, { 0.986809401814185476970235952}, - {-0.986809401814185476970235952}, { 0.161886393780111837641387995}, - { 0.991310859846115418957349799}, { 0.131540028702883111103387493}, - {-0.131540028702883111103387493}, { 0.991310859846115418957349799}, - { 0.607949784967773667243642671}, { 0.793975477554337164895083757}, - {-0.793975477554337164895083757}, { 0.607949784967773667243642671}, - { 0.865513624090569082825488358}, { 0.500885382611240786241285004}, - {-0.500885382611240786241285004}, { 0.865513624090569082825488358}, - { 0.257831102162159005614471295}, { 0.966190003445412555433832961}, - {-0.966190003445412555433832961}, { 0.257831102162159005614471295}, - { 0.946600913083283570044599823}, { 0.322407678801069848384807478}, - {-0.322407678801069848384807478}, { 0.946600913083283570044599823}, - { 0.441371268731716692879988968}, { 0.897324580705418281231391836}, - {-0.897324580705418281231391836}, { 0.441371268731716692879988968}, - { 0.751165131909686411205819422}, { 0.660114342067420478559490747}, - {-0.660114342067420478559490747}, { 0.751165131909686411205819422}, - { 0.064382630929857460819324537}, { 0.997925286198596012623025462}, - {-0.997925286198596012623025462}, { 0.064382630929857460819324537}, - { 0.996571145790554847093566910}, { 0.082740264549375693111987083}, - 
{-0.082740264549375693111987083}, { 0.996571145790554847093566910}, - { 0.646176012983316364832802220}, { 0.763188417263381271704838297}, - {-0.763188417263381271704838297}, { 0.646176012983316364832802220}, - { 0.889048355854664562540777729}, { 0.457813303598877221904961155}, - {-0.457813303598877221904961155}, { 0.889048355854664562540777729}, - { 0.304929229735402406490728633}, { 0.952375012719765858529893608}, - {-0.952375012719765858529893608}, { 0.304929229735402406490728633}, - { 0.961280485811320641748659653}, { 0.275571819310958163076425168}, - {-0.275571819310958163076425168}, { 0.961280485811320641748659653}, - { 0.484869248000791101822951699}, { 0.874586652278176112634431897}, - {-0.874586652278176112634431897}, { 0.484869248000791101822951699}, - { 0.782650596166575738458949301}, { 0.622461279374149972519166721}, - {-0.622461279374149972519166721}, { 0.782650596166575738458949301}, - { 0.113270952177564349018228733}, { 0.993564135520595333782021697}, - {-0.993564135520595333782021697}, { 0.113270952177564349018228733}, - { 0.983662419211730274396237776}, { 0.180022901405699522679906590}, - {-0.180022901405699522679906590}, { 0.983662419211730274396237776}, - { 0.568258952670131549790548489}, { 0.822849781375826332046780034}, - {-0.822849781375826332046780034}, { 0.568258952670131549790548489}, - { 0.839893794195999504583383987}, { 0.542750784864515906586768661}, - {-0.542750784864515906586768661}, { 0.839893794195999504583383987}, - { 0.210111836880469621717489972}, { 0.977677357824509979943404762}, - {-0.977677357824509979943404762}, { 0.210111836880469621717489972}, - { 0.929640895843181265457918066}, { 0.368466829953372331712746222}, - {-0.368466829953372331712746222}, { 0.929640895843181265457918066}, - { 0.396809987416710328595290911}, { 0.917900775621390457642276297}, - {-0.917900775621390457642276297}, { 0.396809987416710328595290911}, - { 0.717870045055731736211325329}, { 0.696177131491462944788582591}, - {-0.696177131491462944788582591}, { 
0.717870045055731736211325329}, - { 0.015339206284988101044151868}, { 0.999882347454212525633049627}, - {-0.999882347454212525633049627}, { 0.015339206284988101044151868}, - { 0.999769405351215321657617036}, { 0.021474080275469507418374898}, - {-0.021474080275469507418374898}, { 0.999769405351215321657617036}, - { 0.691759258364157774906734132}, { 0.722128193929215321243607198}, - {-0.722128193929215321243607198}, { 0.691759258364157774906734132}, - { 0.915448716088267819566431292}, { 0.402434650859418441082533934}, - {-0.402434650859418441082533934}, { 0.915448716088267819566431292}, - { 0.362755724367397216204854462}, { 0.931884265581668106718557199}, - {-0.931884265581668106718557199}, { 0.362755724367397216204854462}, - { 0.976369731330021149312732194}, { 0.216106797076219509948385131}, - {-0.216106797076219509948385131}, { 0.976369731330021149312732194}, - { 0.537587076295645482502214932}, { 0.843208239641845437161743865}, - {-0.843208239641845437161743865}, { 0.537587076295645482502214932}, - { 0.819347520076796960824689637}, { 0.573297166698042212820171239}, - {-0.573297166698042212820171239}, { 0.819347520076796960824689637}, - { 0.173983873387463827950700807}, { 0.984748501801904218556553176}, - {-0.984748501801904218556553176}, { 0.173983873387463827950700807}, - { 0.992850414459865090793563344}, { 0.119365214810991364593637790}, - {-0.119365214810991364593637790}, { 0.992850414459865090793563344}, - { 0.617647307937803932403979402}, { 0.786455213599085757522319464}, - {-0.786455213599085757522319464}, { 0.617647307937803932403979402}, - { 0.871595086655951034842481435}, { 0.490226483288291154229598449}, - {-0.490226483288291154229598449}, { 0.871595086655951034842481435}, - { 0.269668325572915106525464462}, { 0.962953266873683886347921481}, - {-0.962953266873683886347921481}, { 0.269668325572915106525464462}, - { 0.950486073949481721759926101}, { 0.310767152749611495835997250}, - {-0.310767152749611495835997250}, { 0.950486073949481721759926101}, - { 
0.452349587233770874133026703}, { 0.891840709392342727796478697}, - {-0.891840709392342727796478697}, { 0.452349587233770874133026703}, - { 0.759209188978388033485525443}, { 0.650846684996380915068975573}, - {-0.650846684996380915068975573}, { 0.759209188978388033485525443}, - { 0.076623861392031492278332463}, { 0.997060070339482978987989949}, - {-0.997060070339482978987989949}, { 0.076623861392031492278332463}, - { 0.997511456140303459699448390}, { 0.070504573389613863027351471}, - {-0.070504573389613863027351471}, { 0.997511456140303459699448390}, - { 0.655492852999615385312679701}, { 0.755201376896536527598710756}, - {-0.755201376896536527598710756}, { 0.655492852999615385312679701}, - { 0.894599485631382678433072126}, { 0.446868840162374195353044389}, - {-0.446868840162374195353044389}, { 0.894599485631382678433072126}, - { 0.316593375556165867243047035}, { 0.948561349915730288158494826}, - {-0.948561349915730288158494826}, { 0.316593375556165867243047035}, - { 0.964589793289812723836432159}, { 0.263754678974831383611349322}, - {-0.263754678974831383611349322}, { 0.964589793289812723836432159}, - { 0.495565261825772531150266670}, { 0.868570705971340895340449876}, - {-0.868570705971340895340449876}, { 0.495565261825772531150266670}, - { 0.790230221437310055030217152}, { 0.612810082429409703935211936}, - {-0.612810082429409703935211936}, { 0.790230221437310055030217152}, - { 0.125454983411546238542336453}, { 0.992099313142191757112085445}, - {-0.992099313142191757112085445}, { 0.125454983411546238542336453}, - { 0.985797509167567424700995000}, { 0.167938294974731178054745536}, - {-0.167938294974731178054745536}, { 0.985797509167567424700995000}, - { 0.578313796411655563342245019}, { 0.815814410806733789010772660}, - {-0.815814410806733789010772660}, { 0.578313796411655563342245019}, - { 0.846490938774052078300544488}, { 0.532403127877197971442805218}, - {-0.532403127877197971442805218}, { 0.846490938774052078300544488}, - { 0.222093620973203534094094721}, { 
0.975025345066994146844913468}, - {-0.975025345066994146844913468}, { 0.222093620973203534094094721}, - { 0.934092550404258914729877883}, { 0.357030961233430032614954036}, - {-0.357030961233430032614954036}, { 0.934092550404258914729877883}, - { 0.408044162864978680820747499}, { 0.912962190428398164628018233}, - {-0.912962190428398164628018233}, { 0.408044162864978680820747499}, - { 0.726359155084345976817494315}, { 0.687315340891759108199186948}, - {-0.687315340891759108199186948}, { 0.726359155084345976817494315}, - { 0.027608145778965741612354872}, { 0.999618822495178597116830637}, - {-0.999618822495178597116830637}, { 0.027608145778965741612354872}, - { 0.998941293186856850633930266}, { 0.046003182130914628814301788}, - {-0.046003182130914628814301788}, { 0.998941293186856850633930266}, - { 0.673829000378756060917568372}, { 0.738887324460615147933116508}, - {-0.738887324460615147933116508}, { 0.673829000378756060917568372}, - { 0.905296759318118774354048329}, { 0.424779681209108833357226189}, - {-0.424779681209108833357226189}, { 0.905296759318118774354048329}, - { 0.339776884406826857828825803}, { 0.940506070593268323787291309}, - {-0.940506070593268323787291309}, { 0.339776884406826857828825803}, - { 0.970772140728950302138169611}, { 0.240003022448741486568922365}, - {-0.240003022448741486568922365}, { 0.970772140728950302138169611}, - { 0.516731799017649881508753876}, { 0.856147328375194481019630732}, - {-0.856147328375194481019630732}, { 0.516731799017649881508753876}, - { 0.805031331142963597922659282}, { 0.593232295039799808047809426}, - {-0.593232295039799808047809426}, { 0.805031331142963597922659282}, - { 0.149764534677321517229695737}, { 0.988721691960323767604516485}, - {-0.988721691960323767604516485}, { 0.149764534677321517229695737}, - { 0.989622017463200834623694454}, { 0.143695033150294454819773349}, - {-0.143695033150294454819773349}, { 0.989622017463200834623694454}, - { 0.598160706996342311724958652}, { 0.801376171723140219430247777}, - 
{-0.801376171723140219430247777}, { 0.598160706996342311724958652}, - { 0.859301818357008404783582139}, { 0.511468850437970399504391001}, - {-0.511468850437970399504391001}, { 0.859301818357008404783582139}, - { 0.245955050335794611599924709}, { 0.969281235356548486048290738}, - {-0.969281235356548486048290738}, { 0.245955050335794611599924709}, - { 0.942573197601446879280758735}, { 0.333999651442009404650865481}, - {-0.333999651442009404650865481}, { 0.942573197601446879280758735}, - { 0.430326481340082633908199031}, { 0.902673318237258806751502391}, - {-0.902673318237258806751502391}, { 0.430326481340082633908199031}, - { 0.743007952135121693517362293}, { 0.669282588346636065720696366}, - {-0.669282588346636065720696366}, { 0.743007952135121693517362293}, - { 0.052131704680283321236358216}, { 0.998640218180265222418199049}, - {-0.998640218180265222418199049}, { 0.052131704680283321236358216}, - { 0.995480755491926941769171600}, { 0.094963495329638998938034312}, - {-0.094963495329638998938034312}, { 0.995480755491926941769171600}, - { 0.636761861236284230413943435}, { 0.771060524261813773200605759}, - {-0.771060524261813773200605759}, { 0.636761861236284230413943435}, - { 0.883363338665731594736308015}, { 0.468688822035827933697617870}, - {-0.468688822035827933697617870}, { 0.883363338665731594736308015}, - { 0.293219162694258650606608599}, { 0.956045251349996443270479823}, - {-0.956045251349996443270479823}, { 0.293219162694258650606608599}, - { 0.957826413027532890321037029}, { 0.287347459544729526477331841}, - {-0.287347459544729526477331841}, { 0.957826413027532890321037029}, - { 0.474100214650550014398580015}, { 0.880470889052160770806542929}, - {-0.880470889052160770806542929}, { 0.474100214650550014398580015}, - { 0.774953106594873878359129282}, { 0.632018735939809021909403706}, - {-0.632018735939809021909403706}, { 0.774953106594873878359129282}, - { 0.101069862754827824987887585}, { 0.994879330794805620591166107}, - {-0.994879330794805620591166107}, { 
0.101069862754827824987887585}, - { 0.981379193313754574318224190}, { 0.192080397049892441679288205}, - {-0.192080397049892441679288205}, { 0.981379193313754574318224190}, - { 0.558118531220556115693702964}, { 0.829761233794523042469023765}, - {-0.829761233794523042469023765}, { 0.558118531220556115693702964}, - { 0.833170164701913186439915922}, { 0.553016705580027531764226988}, - {-0.553016705580027531764226988}, { 0.833170164701913186439915922}, - { 0.198098410717953586179324918}, { 0.980182135968117392690210009}, - {-0.980182135968117392690210009}, { 0.198098410717953586179324918}, - { 0.925049240782677590302371869}, { 0.379847208924051170576281147}, - {-0.379847208924051170576281147}, { 0.925049240782677590302371869}, - { 0.385516053843918864075607949}, { 0.922701128333878570437264227}, - {-0.922701128333878570437264227}, { 0.385516053843918864075607949}, - { 0.709272826438865651316533772}, { 0.704934080375904908852523758}, - {-0.704934080375904908852523758}, { 0.709272826438865651316533772}, - { 0.003067956762965976270145365}, { 0.999995293809576171511580126}, - {-0.999995293809576171511580126}, { 0.003067956762965976270145365} -}; - -const fpr fpr_p2_tab[] = { - { 2.00000000000 }, - { 1.00000000000 }, - { 0.50000000000 }, - { 0.25000000000 }, - { 0.12500000000 }, - { 0.06250000000 }, - { 0.03125000000 }, - { 0.01562500000 }, - { 0.00781250000 }, - { 0.00390625000 }, - { 0.00195312500 } -}; - -#else // yyyFPNATIVE+0 yyyFPEMU+0 - -#error No FP implementation selected - -#endif // yyyFPNATIVE- yyyFPEMU- diff --git a/crypto_sign/falcon-512-tree/m4-ct/fpr.h b/crypto_sign/falcon-512-tree/m4-ct/fpr.h deleted file mode 100644 index 8176212d..00000000 --- a/crypto_sign/falcon-512-tree/m4-ct/fpr.h +++ /dev/null @@ -1,893 +0,0 @@ -/* - * Floating-point operations. 
- * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#if FALCON_FPEMU // yyyFPEMU+1 yyyFPNATIVE+0 - -/* ====================================================================== */ -/* - * Custom floating-point implementation with integer arithmetics. We - * use IEEE-754 "binary64" format, with some simplifications: - * - * - Top bit is s = 1 for negative, 0 for positive. - * - * - Exponent e uses the next 11 bits (bits 52 to 62, inclusive). - * - * - Mantissa m uses the 52 low bits. - * - * Encoded value is, in general: (-1)^s * 2^(e-1023) * (1 + m*2^(-52)) - * i.e. 
the mantissa really is a 53-bit number (less than 2.0, but not - * less than 1.0), but the top bit (equal to 1 by definition) is omitted - * in the encoding. - * - * In IEEE-754, there are some special values: - * - * - If e = 2047, then the value is either an infinite (m = 0) or - * a NaN (m != 0). - * - * - If e = 0, then the value is either a zero (m = 0) or a subnormal, - * aka "denormalized number" (m != 0). - * - * Of these, we only need the zeros. The caller is responsible for not - * providing operands that would lead to infinites, NaNs or subnormals. - * If inputs are such that values go out of range, then indeterminate - * values are returned (it would still be deterministic, but no specific - * value may be relied upon). - * - * At the C level, the three parts are stored in a 64-bit unsigned - * word. - * - * One may note that a property of the IEEE-754 format is that order - * is preserved for positive values: if two positive floating-point - * values x and y are such that x < y, then their respective encodings - * as _signed_ 64-bit integers i64(x) and i64(y) will be such that - * i64(x) < i64(y). For negative values, order is reversed: if x < 0, - * y < 0, and x < y, then ia64(x) > ia64(y). - * - * IMPORTANT ASSUMPTIONS: - * ====================== - * - * For proper computations, and constant-time behaviour, we assume the - * following: - * - * - 32x32->64 multiplication (unsigned) has an execution time that - * is independent of its operands. This is true of most modern - * x86 and ARM cores. Notable exceptions are the ARM Cortex M0, M0+ - * and M3 (in the M0 and M0+, this is done in software, so it depends - * on that routine), and the PowerPC cores from the G3/G4 lines. - * For more info, see: https://www.bearssl.org/ctmul.html - * - * - Left-shifts and right-shifts of 32-bit values have an execution - * time which does not depend on the shifted value nor on the - * shift count. 
An historical exception is the Pentium IV, but most - * modern CPU have barrel shifters. Some small microcontrollers - * might have varying-time shifts (not the ARM Cortex M*, though). - * - * - Right-shift of a signed negative value performs a sign extension. - * As per the C standard, this operation returns an - * implementation-defined result (this is NOT an "undefined - * behaviour"). On most/all systems, an arithmetic shift is - * performed, because this is what makes most sense. - */ - -/* - * Normally we should declare the 'fpr' type to be a struct or union - * around the internal 64-bit value; however, we want to use the - * direct 64-bit integer type to enable a lighter call convention on - * ARM platforms. This means that direct (invalid) use of operators - * such as '*' or '+' will not be caught by the compiler. We rely on - * the "normal" (non-emulated) code to detect such instances. - */ -typedef uint64_t fpr; - -/* - * For computations, we split values into an integral mantissa in the - * 2^54..2^55 range, and an (adjusted) exponent. The lowest bit is - * "sticky" (it is set to 1 if any of the bits below it is 1); when - * re-encoding, the low two bits are dropped, but may induce an - * increment in the value for proper rounding. - */ - -/* - * Right-shift a 64-bit unsigned value by a possibly secret shift count. - * We assumed that the underlying architecture had a barrel shifter for - * 32-bit shifts, but for 64-bit shifts on a 32-bit system, this will - * typically invoke a software routine that is not necessarily - * constant-time; hence the function below. - * - * Shift count n MUST be in the 0..63 range. - */ -static inline uint64_t -fpr_ursh(uint64_t x, int n) -{ - x ^= (x ^ (x >> 32)) & -(uint64_t)(n >> 5); - return x >> (n & 31); -} - -/* - * Right-shift a 64-bit signed value by a possibly secret shift count - * (see fpr_ursh() for the rationale). - * - * Shift count n MUST be in the 0..63 range. 
- */ -static inline int64_t -fpr_irsh(int64_t x, int n) -{ - x ^= (x ^ (x >> 32)) & -(int64_t)(n >> 5); - return x >> (n & 31); -} - -/* - * Left-shift a 64-bit unsigned value by a possibly secret shift count - * (see fpr_ursh() for the rationale). - * - * Shift count n MUST be in the 0..63 range. - */ -static inline uint64_t -fpr_ulsh(uint64_t x, int n) -{ - x ^= (x ^ (x << 32)) & -(uint64_t)(n >> 5); - return x << (n & 31); -} - -/* - * Expectations: - * s = 0 or 1 - * exponent e is "arbitrary" and unbiased - * 2^54 <= m < 2^55 - * Numerical value is (-1)^2 * m * 2^e - * - * Exponents which are too low lead to value zero. If the exponent is - * too large, the returned value is indeterminate. - * - * If m = 0, then a zero is returned (using the provided sign). - * If e < -1076, then a zero is returned (regardless of the value of m). - * If e >= -1076 and e != 0, m must be within the expected range - * (2^54 to 2^55-1). - */ -static inline fpr -FPR(int s, int e, uint64_t m) -{ - fpr x; - uint32_t t; - unsigned f; - - /* - * If e >= -1076, then the value is "normal"; otherwise, it - * should be a subnormal, which we clamp down to zero. - */ - e += 1076; - t = (uint32_t)e >> 31; - m &= (uint64_t)t - 1; - - /* - * If m = 0 then we want a zero; make e = 0 too, but conserve - * the sign. - */ - t = (uint32_t)(m >> 54); - e &= -(int)t; - - /* - * The 52 mantissa bits come from m. Value m has its top bit set - * (unless it is a zero); we leave it "as is": the top bit will - * increment the exponent by 1, except when m = 0, which is - * exactly what we want. - */ - x = (((uint64_t)s << 63) | (m >> 2)) + ((uint64_t)(uint32_t)e << 52); - - /* - * Rounding: if the low three bits of m are 011, 110 or 111, - * then the value should be incremented to get the next - * representable value. This implements the usual - * round-to-nearest rule (with preference to even values in case - * of a tie). 
Note that the increment may make a carry spill - * into the exponent field, which is again exactly what we want - * in that case. - */ - f = (unsigned)m & 7U; - x += (0xC8U >> f) & 1; - return x; -} - -#define fpr_scaled Zf(fpr_scaled) -fpr fpr_scaled(int64_t i, int sc); - -static inline fpr -fpr_of(int64_t i) -{ - return fpr_scaled(i, 0); -} - -static const fpr fpr_q = 4667981563525332992; -static const fpr fpr_inverse_of_q = 4545632735260551042; -static const fpr fpr_inv_2sqrsigma0 = 4594603506513722306; -static const fpr fpr_inv_sigma = 4573359825155195350; -static const fpr fpr_sigma_min_9 = 4608495221497168882; -static const fpr fpr_sigma_min_10 = 4608586345619182117; -static const fpr fpr_log2 = 4604418534313441775; -static const fpr fpr_inv_log2 = 4609176140021203710; -static const fpr fpr_bnorm_max = 4670353323383631276; -static const fpr fpr_zero = 0; -static const fpr fpr_one = 4607182418800017408; -static const fpr fpr_two = 4611686018427387904; -static const fpr fpr_onehalf = 4602678819172646912; -static const fpr fpr_invsqrt2 = 4604544271217802189; -static const fpr fpr_invsqrt8 = 4600040671590431693; -static const fpr fpr_ptwo31 = 4746794007248502784; -static const fpr fpr_ptwo31m1 = 4746794007244308480; -static const fpr fpr_mtwo31m1 = 13970166044099084288U; -static const fpr fpr_ptwo63m1 = 4890909195324358656; -static const fpr fpr_mtwo63m1 = 14114281232179134464U; -static const fpr fpr_ptwo63 = 4890909195324358656; - -static inline int64_t -fpr_rint(fpr x) -{ - uint64_t m, d; - int e; - uint32_t s, dd, f; - - /* - * We assume that the value fits in -(2^63-1)..+(2^63-1). We can - * thus extract the mantissa as a 63-bit integer, then right-shift - * it as needed. - */ - m = ((x << 10) | ((uint64_t)1 << 62)) & (((uint64_t)1 << 63) - 1); - e = 1085 - ((int)(x >> 52) & 0x7FF); - - /* - * If a shift of more than 63 bits is needed, then simply set m - * to zero. This also covers the case of an input operand equal - * to zero. 
- */ - m &= -(uint64_t)((uint32_t)(e - 64) >> 31); - e &= 63; - - /* - * Right-shift m as needed. Shift count is e. Proper rounding - * mandates that: - * - If the highest dropped bit is zero, then round low. - * - If the highest dropped bit is one, and at least one of the - * other dropped bits is one, then round up. - * - If the highest dropped bit is one, and all other dropped - * bits are zero, then round up if the lowest kept bit is 1, - * or low otherwise (i.e. ties are broken by "rounding to even"). - * - * We thus first extract a word consisting of all the dropped bit - * AND the lowest kept bit; then we shrink it down to three bits, - * the lowest being "sticky". - */ - d = fpr_ulsh(m, 63 - e); - dd = (uint32_t)d | ((uint32_t)(d >> 32) & 0x1FFFFFFF); - f = (uint32_t)(d >> 61) | ((dd | -dd) >> 31); - m = fpr_ursh(m, e) + (uint64_t)((0xC8U >> f) & 1U); - - /* - * Apply the sign bit. - */ - s = (uint32_t)(x >> 63); - return ((int64_t)m ^ -(int64_t)s) + (int64_t)s; -} - -static inline int64_t -fpr_floor(fpr x) -{ - uint64_t t; - int64_t xi; - int e, cc; - - /* - * We extract the integer as a _signed_ 64-bit integer with - * a scaling factor. Since we assume that the value fits - * in the -(2^63-1)..+(2^63-1) range, we can left-shift the - * absolute value to make it in the 2^62..2^63-1 range: we - * will only need a right-shift afterwards. - */ - e = (int)(x >> 52) & 0x7FF; - t = x >> 63; - xi = (int64_t)(((x << 10) | ((uint64_t)1 << 62)) - & (((uint64_t)1 << 63) - 1)); - xi = (xi ^ -(int64_t)t) + (int64_t)t; - cc = 1085 - e; - - /* - * We perform an arithmetic right-shift on the value. This - * applies floor() semantics on both positive and negative values - * (rounding toward minus infinity). - */ - xi = fpr_irsh(xi, cc & 63); - - /* - * If the true shift count was 64 or more, then we should instead - * replace xi with 0 (if nonnegative) or -1 (if negative). 
Edge - * case: -0 will be floored to -1, not 0 (whether this is correct - * is debatable; in any case, the other functions normalize zero - * to +0). - * - * For an input of zero, the non-shifted xi was incorrect (we used - * a top implicit bit of value 1, not 0), but this does not matter - * since this operation will clamp it down. - */ - xi ^= (xi ^ -(int64_t)t) & -(int64_t)((uint32_t)(63 - cc) >> 31); - return xi; -} - -static inline int64_t -fpr_trunc(fpr x) -{ - uint64_t t, xu; - int e, cc; - - /* - * Extract the absolute value. Since we assume that the value - * fits in the -(2^63-1)..+(2^63-1) range, we can left-shift - * the absolute value into the 2^62..2^63-1 range, and then - * do a right shift afterwards. - */ - e = (int)(x >> 52) & 0x7FF; - xu = ((x << 10) | ((uint64_t)1 << 62)) & (((uint64_t)1 << 63) - 1); - cc = 1085 - e; - xu = fpr_ursh(xu, cc & 63); - - /* - * If the exponent is too low (cc > 63), then the shift was wrong - * and we must clamp the value to 0. This also covers the case - * of an input equal to zero. - */ - xu &= -(uint64_t)((uint32_t)(cc - 64) >> 31); - - /* - * Apply back the sign, if the source value is negative. - */ - t = x >> 63; - xu = (xu ^ -t) + t; - return *(int64_t *)&xu; -} - -#define fpr_add Zf(fpr_add) -fpr fpr_add(fpr x, fpr y); - -static inline fpr -fpr_sub(fpr x, fpr y) -{ - y ^= (uint64_t)1 << 63; - return fpr_add(x, y); -} - -static inline fpr -fpr_neg(fpr x) -{ - x ^= (uint64_t)1 << 63; - return x; -} - -static inline fpr -fpr_half(fpr x) -{ - /* - * To divide a value by 2, we just have to subtract 1 from its - * exponent, but we have to take care of zero. - */ - uint32_t t; - - x -= (uint64_t)1 << 52; - t = (((uint32_t)(x >> 52) & 0x7FF) + 1) >> 11; - x &= (uint64_t)t - 1; - return x; -} - -static inline fpr -fpr_double(fpr x) -{ - /* - * To double a value, we just increment by one the exponent. We - * don't care about infinites or NaNs; however, 0 is a - * special case. 
- */ - x += (uint64_t)((((unsigned)(x >> 52) & 0x7FFU) + 0x7FFU) >> 11) << 52; - return x; -} - -#define fpr_mul Zf(fpr_mul) -fpr fpr_mul(fpr x, fpr y); - -static inline fpr -fpr_sqr(fpr x) -{ - return fpr_mul(x, x); -} - -#define fpr_div Zf(fpr_div) -fpr fpr_div(fpr x, fpr y); - -static inline fpr -fpr_inv(fpr x) -{ - return fpr_div(4607182418800017408u, x); -} - -#define fpr_sqrt Zf(fpr_sqrt) -fpr fpr_sqrt(fpr x); - -static inline int -fpr_lt(fpr x, fpr y) -{ - /* - * If x >= 0 or y >= 0, a signed comparison yields the proper - * result: - * - For positive values, the order is preserved. - * - The sign bit is at the same place as in integers, so - * sign is preserved. - * - * If both x and y are negative, then the order is reversed. - * We cannot simply invert the comparison result in that case - * because it would not handle the edge case x = y properly. - */ - int cc0, cc1; - - cc0 = *(int64_t *)&x < *(int64_t *)&y; - cc1 = *(int64_t *)&x > *(int64_t *)&y; - return cc0 ^ ((cc0 ^ cc1) & (int)((x & y) >> 63)); -} - -/* - * Compute exp(x) for x such that |x| <= ln 2. We want a precision of 50 - * bits or so. - */ -#define fpr_expm_p63 Zf(fpr_expm_p63) -uint64_t fpr_expm_p63(fpr x, fpr ccs); - -#define fpr_gm_tab Zf(fpr_gm_tab) -extern const fpr fpr_gm_tab[]; - -#define fpr_p2_tab Zf(fpr_p2_tab) -extern const fpr fpr_p2_tab[]; - -/* ====================================================================== */ - -#elif FALCON_FPNATIVE // yyyFPEMU+0 yyyFPNATIVE+1 - -/* ====================================================================== */ - -#include - -/* - * We wrap the native 'double' type into a structure so that the C compiler - * complains if we inadvertently use raw arithmetic operators on the 'fpr' - * type instead of using the inline functions below. This should have no - * extra runtime cost, since all the functions below are 'inline'. 
- */ -typedef struct { double v; } fpr; - -static inline fpr -FPR(double v) -{ - fpr x; - - x.v = v; - return x; -} - -static inline fpr -fpr_of(int64_t i) -{ - return FPR((double)i); -} - -static const fpr fpr_q = { 12289.0 }; -static const fpr fpr_inverse_of_q = { 1.0 / 12289.0 }; -static const fpr fpr_inv_2sqrsigma0 = { .150865048875372721532312163019 }; -static const fpr fpr_inv_sigma = { .005819826392951607426919370871 }; -static const fpr fpr_sigma_min_9 = { 1.291500756233514568549480827642 }; -static const fpr fpr_sigma_min_10 = { 1.311734375905083682667395805765 }; -static const fpr fpr_log2 = { 0.69314718055994530941723212146 }; -static const fpr fpr_inv_log2 = { 1.4426950408889634073599246810 }; -static const fpr fpr_bnorm_max = { 16822.4121 }; -static const fpr fpr_zero = { 0.0 }; -static const fpr fpr_one = { 1.0 }; -static const fpr fpr_two = { 2.0 }; -static const fpr fpr_onehalf = { 0.5 }; -static const fpr fpr_invsqrt2 = { 0.707106781186547524400844362105 }; -static const fpr fpr_invsqrt8 = { 0.353553390593273762200422181052 }; -static const fpr fpr_ptwo31 = { 2147483648.0 }; -static const fpr fpr_ptwo31m1 = { 2147483647.0 }; -static const fpr fpr_mtwo31m1 = { -2147483647.0 }; -static const fpr fpr_ptwo63m1 = { 9223372036854775807.0 }; -static const fpr fpr_mtwo63m1 = { -9223372036854775807.0 }; -static const fpr fpr_ptwo63 = { 9223372036854775808.0 }; - -static inline int64_t -fpr_rint(fpr x) -{ - /* - * We do not want to use llrint() since it might be not - * constant-time. - * - * Suppose that x >= 0. If x >= 2^52, then it is already an - * integer. Otherwise, if x < 2^52, then computing x+2^52 will - * yield a value that will be rounded to the nearest integer - * with exactly the right rules (round-to-nearest-even). - * - * In order to have constant-time processing, we must do the - * computation for both x >= 0 and x < 0 cases, and use a - * cast to an integer to access the sign and select the proper - * value. 
Such casts also allow us to find out if |x| < 2^52. - */ - int64_t sx, tx, rp, rn, m; - uint32_t ub; - - sx = (int64_t)(x.v - 1.0); - tx = (int64_t)x.v; - rp = (int64_t)(x.v + 4503599627370496.0) - 4503599627370496; - rn = (int64_t)(x.v - 4503599627370496.0) + 4503599627370496; - - /* - * If tx >= 2^52 or tx < -2^52, then result is tx. - * Otherwise, if sx >= 0, then result is rp. - * Otherwise, result is rn. We use the fact that when x is - * close to 0 (|x| <= 0.25) then both rp and rn are correct; - * and if x is not close to 0, then trunc(x-1.0) yields the - * appropriate sign. - */ - - /* - * Clamp rp to zero if tx < 0. - * Clamp rn to zero if tx >= 0. - */ - m = sx >> 63; - rn &= m; - rp &= ~m; - - /* - * Get the 12 upper bits of tx; if they are not all zeros or - * all ones, then tx >= 2^52 or tx < -2^52, and we clamp both - * rp and rn to zero. Otherwise, we clamp tx to zero. - */ - ub = (uint32_t)((uint64_t)tx >> 52); - m = -(int64_t)((((ub + 1) & 0xFFF) - 2) >> 31); - rp &= m; - rn &= m; - tx &= ~m; - - /* - * Only one of tx, rn or rp (at most) can be non-zero at this - * point. - */ - return tx | rn | rp; -} - -static inline int64_t -fpr_floor(fpr x) -{ - int64_t r; - - /* - * The cast performs a trunc() (rounding toward 0) and thus is - * wrong by 1 for most negative values. The correction below is - * constant-time as long as the compiler turns the - * floating-point conversion result into a 0/1 integer without a - * conditional branch or another non-constant-time construction. - * This should hold on all modern architectures with an FPU (and - * if it is false on a given arch, then chances are that the FPU - * itself is not constant-time, making the point moot). 
- */ - r = (int64_t)x.v; - return r - (x.v < (double)r); -} - -static inline int64_t -fpr_trunc(fpr x) -{ - return (int64_t)x.v; -} - -static inline fpr -fpr_add(fpr x, fpr y) -{ - return FPR(x.v + y.v); -} - -static inline fpr -fpr_sub(fpr x, fpr y) -{ - return FPR(x.v - y.v); -} - -static inline fpr -fpr_neg(fpr x) -{ - return FPR(-x.v); -} - -static inline fpr -fpr_half(fpr x) -{ - return FPR(x.v * 0.5); -} - -static inline fpr -fpr_double(fpr x) -{ - return FPR(x.v + x.v); -} - -static inline fpr -fpr_mul(fpr x, fpr y) -{ - return FPR(x.v * y.v); -} - -static inline fpr -fpr_sqr(fpr x) -{ - return FPR(x.v * x.v); -} - -static inline fpr -fpr_inv(fpr x) -{ - return FPR(1.0 / x.v); -} - -static inline fpr -fpr_div(fpr x, fpr y) -{ - return FPR(x.v / y.v); -} - -#if FALCON_AVX2 // yyyAVX2+1 -TARGET_AVX2 -static inline void -fpr_sqrt_avx2(double *t) -{ - __m128d x; - - x = _mm_load1_pd(t); - x = _mm_sqrt_pd(x); - _mm_storel_pd(t, x); -} -#endif // yyyAVX2- - -static inline fpr -fpr_sqrt(fpr x) -{ - /* - * We prefer not to have a dependency on libm when it can be - * avoided. On x86, calling the sqrt() libm function inlines - * the relevant opcode (fsqrt or sqrtsd, depending on whether - * the 387 FPU or SSE2 is used for floating-point operations) - * but then makes an optional call to the library function - * for proper error handling, in case the operand is negative. - * - * To avoid this dependency, we use intrinsics or inline assembly - * on recognized platforms: - * - * - If AVX2 is explicitly enabled, then we use SSE2 intrinsics. - * - * - On GCC/Clang with SSE maths, we use SSE2 intrinsics. - * - * - On GCC/Clang on i386, or MSVC on i386, we use inline assembly - * to call the 387 FPU fsqrt opcode. - * - * - On GCC/Clang/XLC on PowerPC, we use inline assembly to call - * the fsqrt opcode (Clang needs a special hack). - * - * - On GCC/Clang on ARM with hardware floating-point, we use - * inline assembly to call the vqsrt.f64 opcode. 
Due to a - * complex ecosystem of compilers and assembly syntaxes, we - * have to call it "fsqrt" or "fsqrtd", depending on case. - * - * If the platform is not recognized, a call to the system - * library function sqrt() is performed. On some compilers, this - * may actually inline the relevant opcode, and call the library - * function only when the input is invalid (e.g. negative); - * Falcon never actually calls sqrt() on a negative value, but - * the dependency to libm will still be there. - */ - -#if FALCON_AVX2 // yyyAVX2+1 - fpr_sqrt_avx2(&x.v); - return x; -#else // yyyAVX2+0 -#if defined __GNUC__ && defined __SSE2_MATH__ - return FPR(_mm_cvtsd_f64(_mm_sqrt_pd(_mm_set1_pd(x.v)))); -#elif defined __GNUC__ && defined __i386__ - __asm__ __volatile__ ( - "fldl %0\n\t" - "fsqrt\n\t" - "fstpl %0\n\t" - : "+m" (x.v) : : ); - return x; -#elif defined _M_IX86 - __asm { - fld x.v - fsqrt - fstp x.v - } - return x; -#elif defined __PPC__ && defined __GNUC__ - fpr y; - -#if defined __clang__ - /* - * Normally we should use a 'd' constraint (register that contains - * a 'double' value) but Clang 3.8.1 chokes on it. Instead we use - * an 'f' constraint, counting on the fact that 'float' values - * are managed in double-precision registers anyway, and the - * compiler will not add extra rounding steps. - */ - __asm__ ( "fsqrt %0, %1" : "=f" (y.v) : "f" (x.v) : ); -#else - __asm__ ( "fsqrt %0, %1" : "=d" (y.v) : "d" (x.v) : ); -#endif - return y; -#elif (defined __ARM_FP && ((__ARM_FP & 0x08) == 0x08)) \ - || (!defined __ARM_FP && defined __ARM_VFPV2__) - /* - * On ARM, assembly syntaxes are a bit of a mess, depending on - * whether GCC or Clang is used, and the binutils version, and - * whether this is 32-bit or 64-bit mode. 
The code below appears - * to work on: - * 32-bit GCC-4.9.2 Clang-3.5 Binutils-2.25 - * 64-bit GCC-6.3.0 Clang-3.9 Binutils-2.28 - */ -#if defined __aarch64__ && __aarch64__ - __asm__ ( "fsqrt %d0, %d0" : "+w" (x.v) : : ); -#else - __asm__ ( "fsqrtd %P0, %P0" : "+w" (x.v) : : ); -#endif - return x; -#else - return FPR(sqrt(x.v)); -#endif -#endif // yyyAVX2- -} - -static inline int -fpr_lt(fpr x, fpr y) -{ - return x.v < y.v; -} - -TARGET_AVX2 -static inline uint64_t -fpr_expm_p63(fpr x, fpr ccs) -{ - /* - * Polynomial approximation of exp(-x) is taken from FACCT: - * https://eprint.iacr.org/2018/1234 - * Specifically, values are extracted from the implementation - * referenced from the FACCT article, and available at: - * https://github.com/raykzhao/gaussian - * Tests over more than 24 billions of random inputs in the - * 0..log(2) range have never shown a deviation larger than - * 2^(-50) from the true mathematical value. - */ - -#if FALCON_AVX2 // yyyAVX2+1 - - /* - * AVX2 implementation uses more operations than Horner's method, - * but with a lower expression tree depth. This helps because - * additions and multiplications have a latency of 4 cycles on - * a Skylake, but the CPU can issue two of them per cycle. 
- */ - - static const union { - double d[12]; - __m256d v[3]; - } c = { - { - 0.999999999999994892974086724280, - 0.500000000000019206858326015208, - 0.166666666666984014666397229121, - 0.041666666666110491190622155955, - 0.008333333327800835146903501993, - 0.001388888894063186997887560103, - 0.000198412739277311890541063977, - 0.000024801566833585381209939524, - 0.000002755586350219122514855659, - 0.000000275607356160477811864927, - 0.000000025299506379442070029551, - 0.000000002073772366009083061987 - } - }; - - double d1, d2, d4, d8, y; - __m256d d14, d58, d9c; - - d1 = -x.v; - d2 = d1 * d1; - d4 = d2 * d2; - d8 = d4 * d4; - d14 = _mm256_set_pd(d4, d2 * d1, d2, d1); - d58 = _mm256_mul_pd(d14, _mm256_set1_pd(d4)); - d9c = _mm256_mul_pd(d14, _mm256_set1_pd(d8)); - d14 = _mm256_mul_pd(d14, _mm256_loadu_pd(&c.d[0])); - d58 = FMADD(d58, _mm256_loadu_pd(&c.d[4]), d14); - d9c = FMADD(d9c, _mm256_loadu_pd(&c.d[8]), d58); - d9c = _mm256_hadd_pd(d9c, d9c); - y = 1.0 + _mm_cvtsd_f64(_mm256_castpd256_pd128(d9c)) // _mm256_cvtsd_f64(d9c) - + _mm_cvtsd_f64(_mm256_extractf128_pd(d9c, 1)); - y *= ccs.v; - - /* - * Final conversion goes through int64_t first, because that's what - * the underlying opcode (vcvttsd2si) will do, and we know that the - * result will fit, since x >= 0 and ccs < 1. If we did the - * conversion directly to uint64_t, then the compiler would add some - * extra code to cover the case of a source value of 2^63 or more, - * and though the alternate path would never be exercised, the - * extra comparison would cost us some cycles. - */ - return (uint64_t)(int64_t)(y * fpr_ptwo63.v); - -#else // yyyAVX2+0 - - /* - * Normal implementation uses Horner's method, which minimizes - * the number of operations. 
- */ - - double d, y; - - d = x.v; - y = 0.000000002073772366009083061987; - y = 0.000000025299506379442070029551 - y * d; - y = 0.000000275607356160477811864927 - y * d; - y = 0.000002755586350219122514855659 - y * d; - y = 0.000024801566833585381209939524 - y * d; - y = 0.000198412739277311890541063977 - y * d; - y = 0.001388888894063186997887560103 - y * d; - y = 0.008333333327800835146903501993 - y * d; - y = 0.041666666666110491190622155955 - y * d; - y = 0.166666666666984014666397229121 - y * d; - y = 0.500000000000019206858326015208 - y * d; - y = 0.999999999999994892974086724280 - y * d; - y = 1.000000000000000000000000000000 - y * d; - y *= ccs.v; - return (uint64_t)(y * fpr_ptwo63.v); - -#endif // yyyAVX2- -} - -#define fpr_gm_tab Zf(fpr_gm_tab) -extern const fpr fpr_gm_tab[]; - -#define fpr_p2_tab Zf(fpr_p2_tab) -extern const fpr fpr_p2_tab[]; - -/* ====================================================================== */ - -#else // yyyFPEMU+0 yyyFPNATIVE+0 - -#error No FP implementation selected - -#endif // yyyFPEMU- yyyFPNATIVE- diff --git a/crypto_sign/falcon-512-tree/m4-ct/inner.h b/crypto_sign/falcon-512-tree/m4-ct/inner.h deleted file mode 100644 index 1f7d0819..00000000 --- a/crypto_sign/falcon-512-tree/m4-ct/inner.h +++ /dev/null @@ -1,1168 +0,0 @@ -#ifndef FALCON_INNER_H__ -#define FALCON_INNER_H__ - -/* - * Internal functions for Falcon. This is not the API intended to be - * used by applications; instead, this internal API provides all the - * primitives on which wrappers build to provide external APIs. 
- * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -/* - * IMPORTANT API RULES - * ------------------- - * - * This API has some non-trivial usage rules: - * - * - * - All public functions (i.e. the non-static ones) must be referenced - * with the Zf() macro (e.g. Zf(verify_raw) for the verify_raw() - * function). That macro adds a prefix to the name, which is - * configurable with the FALCON_PREFIX macro. This allows compiling - * the code into a specific "namespace" and potentially including - * several versions of this code into a single application (e.g. to - * have an AVX2 and a non-AVX2 variants and select the one to use at - * runtime based on availability of AVX2 opcodes). 
- * - * - Functions that need temporary buffers expects them as a final - * tmp[] array of type uint8_t*, with a size which is documented for - * each function. However, most have some alignment requirements, - * because they will use the array to store 16-bit, 32-bit or 64-bit - * values (e.g. uint64_t or double). The caller must ensure proper - * alignment. What happens on unaligned access depends on the - * underlying architecture, ranging from a slight time penalty - * to immediate termination of the process. - * - * - Some functions rely on specific rounding rules and precision for - * floating-point numbers. On some systems (in particular 32-bit x86 - * with the 387 FPU), this requires setting an hardware control - * word. The caller MUST use set_fpu_cw() to ensure proper precision: - * - * oldcw = set_fpu_cw(2); - * Zf(sign_dyn)(...); - * set_fpu_cw(oldcw); - * - * On systems where the native floating-point precision is already - * proper, or integer-based emulation is used, the set_fpu_cw() - * function does nothing, so it can be called systematically. - */ - -// yyyPQCLEAN+0 yyyNIST+0 yyySUPERCOP+0 -#include "config.h" -// yyyPQCLEAN- yyyNIST- yyySUPERCOP- -// yyySUPERCOP+1 -// yyyCONF* -// yyySUPERCOP- - -#include -#include -#include - -#if defined FALCON_AVX2 && FALCON_AVX2 // yyyAVX2+1 -/* - * This implementation uses AVX2 and optionally FMA intrinsics. 
- */ -#include -#ifndef FALCON_LE -#define FALCON_LE 1 -#endif -#ifndef FALCON_UNALIGNED -#define FALCON_UNALIGNED 1 -#endif -#if defined __GNUC__ -#if defined FALCON_FMA && FALCON_FMA -#define TARGET_AVX2 __attribute__((target("avx2,fma"))) -#else -#define TARGET_AVX2 __attribute__((target("avx2"))) -#endif -#elif defined _MSC_VER && _MSC_VER -#pragma warning( disable : 4752 ) -#endif -#if defined FALCON_FMA && FALCON_FMA -#define FMADD(a, b, c) _mm256_fmadd_pd(a, b, c) -#define FMSUB(a, b, c) _mm256_fmsub_pd(a, b, c) -#else -#define FMADD(a, b, c) _mm256_add_pd(_mm256_mul_pd(a, b), c) -#define FMSUB(a, b, c) _mm256_sub_pd(_mm256_mul_pd(a, b), c) -#endif -#endif // yyyAVX2- - -// yyyNIST+0 yyyPQCLEAN+0 -/* - * On MSVC, disable warning about applying unary minus on an unsigned - * type: this is perfectly defined standard behaviour and we do it - * quite often. - */ -#if defined _MSC_VER && _MSC_VER -#pragma warning( disable : 4146 ) -#endif - -// yyySUPERCOP+0 -/* - * Enable ARM assembly on any ARMv7m platform (if it was not done before). 
- */ -#ifndef FALCON_ASM_CORTEXM4 -#if (defined __ARM_ARCH_7EM__ && __ARM_ARCH_7EM__) \ - && (defined __ARM_FEATURE_DSP && __ARM_FEATURE_DSP) -#define FALCON_ASM_CORTEXM4 1 -#else -#define FALCON_ASM_CORTEXM4 0 -#endif -#endif -// yyySUPERCOP- - -#if defined __i386__ || defined _M_IX86 \ - || defined __x86_64__ || defined _M_X64 || \ - (defined _ARCH_PWR8 && \ - (defined __LITTLE_ENDIAN || defined __LITTLE_ENDIAN__)) - -#ifndef FALCON_LE -#define FALCON_LE 1 -#endif -#ifndef FALCON_UNALIGNED -#define FALCON_UNALIGNED 1 -#endif - -#elif defined FALCON_ASM_CORTEXM4 && FALCON_ASM_CORTEXM4 - -#ifndef FALCON_LE -#define FALCON_LE 1 -#endif -#ifndef FALCON_UNALIGNED -#define FALCON_UNALIGNED 0 -#endif - -#elif (defined __LITTLE_ENDIAN__ && __LITTLE_ENDIAN__) \ - || (defined __BYTE_ORDER__ && defined __ORDER_LITTLE_ENDIAN__ \ - && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) - -#ifndef FALCON_LE -#define FALCON_LE 1 -#endif -#ifndef FALCON_UNALIGNED -#define FALCON_UNALIGNED 0 -#endif - -#else - -#ifndef FALCON_LE -#define FALCON_LE 0 -#endif -#ifndef FALCON_UNALIGNED -#define FALCON_UNALIGNED 0 -#endif - -#endif - -/* - * We ensure that both FALCON_FPEMU and FALCON_FPNATIVE are defined, - * with compatible values (exactly one of them must be non-zero). - * If none is defined, then default FP implementation is 'native' - * except on ARM Cortex M4. 
- */ -#if !defined FALCON_FPEMU && !defined FALCON_FPNATIVE - -#if (defined __ARM_FP && ((__ARM_FP & 0x08) == 0x08)) \ - || (!defined __ARM_FP && defined __ARM_VFPV2__) -#define FALCON_FPEMU 0 -#define FALCON_FPNATIVE 1 -#elif defined FALCON_ASM_CORTEXM4 && FALCON_ASM_CORTEXM4 -#define FALCON_FPEMU 1 -#define FALCON_FPNATIVE 0 -#else -#define FALCON_FPEMU 0 -#define FALCON_FPNATIVE 1 -#endif - -#elif defined FALCON_FPEMU && !defined FALCON_FPNATIVE - -#if FALCON_FPEMU -#define FALCON_FPNATIVE 0 -#else -#define FALCON_FPNATIVE 1 -#endif - -#elif defined FALCON_FPNATIVE && !defined FALCON_FPEMU - -#if FALCON_FPNATIVE -#define FALCON_FPEMU 0 -#else -#define FALCON_FPEMU 1 -#endif - -#endif - -#if (FALCON_FPEMU && FALCON_FPNATIVE) || (!FALCON_FPEMU && !FALCON_FPNATIVE) -#error Exactly one of FALCON_FPEMU and FALCON_FPNATIVE must be selected -#endif - -// yyySUPERCOP+0 -/* - * For seed generation from the operating system: - * - On Linux and glibc-2.25+, FreeBSD 12+ and OpenBSD, use getentropy(). - * - On Unix-like systems, use /dev/urandom (including as a fallback - * for failed getentropy() calls). - * - On Windows, use CryptGenRandom(). 
- */ - -#ifndef FALCON_RAND_GETENTROPY -#if (defined __linux__ && defined __GLIBC__ \ - && (__GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 25))) \ - || (defined __FreeBSD__ && __FreeBSD__ >= 12) \ - || defined __OpenBSD__ -#define FALCON_RAND_GETENTROPY 1 -#else -#define FALCON_RAND_GETENTROPY 0 -#endif -#endif - -#ifndef FALCON_RAND_URANDOM -#if defined _AIX \ - || defined __ANDROID__ \ - || defined __FreeBSD__ \ - || defined __NetBSD__ \ - || defined __OpenBSD__ \ - || defined __DragonFly__ \ - || defined __linux__ \ - || (defined __sun && (defined __SVR4 || defined __svr4__)) \ - || (defined __APPLE__ && defined __MACH__) -#define FALCON_RAND_URANDOM 1 -#else -#define FALCON_RAND_URANDOM 0 -#endif -#endif - -#ifndef FALCON_RAND_WIN32 -#if defined _WIN32 || defined _WIN64 -#define FALCON_RAND_WIN32 1 -#else -#define FALCON_RAND_WIN32 0 -#endif -#endif -// yyySUPERCOP- - -/* - * For still undefined compile-time macros, define them to 0 to avoid - * warnings with -Wundef. - */ -#ifndef FALCON_AVX2 -#define FALCON_AVX2 0 -#endif -#ifndef FALCON_FMA -#define FALCON_FMA 0 -#endif -#ifndef FALCON_KG_CHACHA20 -#define FALCON_KG_CHACHA20 0 -#endif -// yyyNIST- yyyPQCLEAN- - -// yyyPQCLEAN+0 yyySUPERCOP+0 -/* - * "Naming" macro used to apply a consistent prefix over all global - * symbols. - */ -#ifndef FALCON_PREFIX -#define FALCON_PREFIX falcon_inner -#endif -#define Zf(name) Zf_(FALCON_PREFIX, name) -#define Zf_(prefix, name) Zf__(prefix, name) -#define Zf__(prefix, name) prefix ## _ ## name -// yyyPQCLEAN- yyySUPERCOP- - -// yyyAVX2+1 -/* - * We use the TARGET_AVX2 macro to tag some functions which, in some - * configurations, may use AVX2 and FMA intrinsics; this depends on - * the compiler. In all other cases, we just define it to emptiness - * (i.e. it will have no effect). 
- */ -#ifndef TARGET_AVX2 -#define TARGET_AVX2 -#endif -// yyyAVX2- - -/* - * Some computations with floating-point elements, in particular - * rounding to the nearest integer, rely on operations using _exactly_ - * the precision of IEEE-754 binary64 type (i.e. 52 bits). On 32-bit - * x86, the 387 FPU may be used (depending on the target OS) and, in - * that case, may use more precision bits (i.e. 64 bits, for an 80-bit - * total type length); to prevent miscomputations, we define an explicit - * function that modifies the precision in the FPU control word. - * - * set_fpu_cw() sets the precision to the provided value, and returns - * the previously set precision; callers are supposed to restore the - * previous precision on exit. The correct (52-bit) precision is - * configured with the value "2". On unsupported compilers, or on - * targets other than 32-bit x86, or when the native 'double' type is - * not used, the set_fpu_cw() function does nothing at all. - */ -#if FALCON_FPNATIVE // yyyFPNATIVE+1 -#if defined __GNUC__ && defined __i386__ -static inline unsigned -set_fpu_cw(unsigned x) -{ - unsigned short t; - unsigned old; - - __asm__ __volatile__ ("fstcw %0" : "=m" (t) : : ); - old = (t & 0x0300u) >> 8; - t = (unsigned short)((t & ~0x0300u) | (x << 8)); - __asm__ __volatile__ ("fldcw %0" : : "m" (t) : ); - return old; -} -#elif defined _M_IX86 -static inline unsigned -set_fpu_cw(unsigned x) -{ - unsigned short t; - unsigned old; - - __asm { fstcw t } - old = (t & 0x0300u) >> 8; - t = (unsigned short)((t & ~0x0300u) | (x << 8)); - __asm { fldcw t } - return old; -} -#else -static inline unsigned -set_fpu_cw(unsigned x) -{ - return x; -} -#endif -#else // yyyFPNATIVE+0 -static inline unsigned -set_fpu_cw(unsigned x) -{ - return x; -} -#endif // yyyFPNATIVE- - -#if FALCON_FPNATIVE && !FALCON_AVX2 // yyyFPNATIVE+1 yyyAVX2+0 -/* - * If using the native 'double' type but not AVX2 code, on an x86 - * machine with SSE2 activated for maths, then we will use the - * 
SSE2 intrinsics. - */ -#if defined __GNUC__ && defined __SSE2_MATH__ -#include -#endif -#endif // yyyFPNATIVE- yyyAVX2- - -#if FALCON_FPNATIVE // yyyFPNATIVE+1 -/* - * For optimal reproducibility of values, we need to disable contraction - * of floating-point expressions; otherwise, on some architectures (e.g. - * PowerPC), the compiler may generate fused-multiply-add opcodes that - * may round differently than two successive separate opcodes. C99 defines - * a standard pragma for that, but GCC-6.2.2 appears to ignore it, - * hence the GCC-specific pragma (that Clang does not support). - */ -#if defined __clang__ -#pragma STDC FP_CONTRACT OFF -#elif defined __GNUC__ -#pragma GCC optimize ("fp-contract=off") -#endif -#endif // yyyFPNATIVE- - -// yyyPQCLEAN+0 -/* - * MSVC 2015 does not know the C99 keyword 'restrict'. - */ -#if defined _MSC_VER && _MSC_VER -#ifndef restrict -#define restrict __restrict -#endif -#endif -// yyyPQCLEAN- - -/* ==================================================================== */ -/* - * SHAKE256 implementation (shake.c). - * - * API is defined to be easily replaced with the fips202.h API defined - * as part of PQClean. 
- */ - -// yyyPQCLEAN+0 -/* -typedef struct { - union { - uint64_t A[25]; - uint8_t dbuf[200]; - } st; - uint64_t dptr; -} inner_shake256_context; - -#define inner_shake256_init Zf(i_shake256_init) -#define inner_shake256_inject Zf(i_shake256_inject) -#define inner_shake256_flip Zf(i_shake256_flip) -#define inner_shake256_extract Zf(i_shake256_extract) - -void Zf(i_shake256_init)( - inner_shake256_context *sc); -void Zf(i_shake256_inject)( - inner_shake256_context *sc, const uint8_t *in, size_t len); -void Zf(i_shake256_flip)( - inner_shake256_context *sc); -void Zf(i_shake256_extract)( - inner_shake256_context *sc, uint8_t *out, size_t len); -*/ - -// yyyPQCLEAN+1 - -#include "fips202.h" - -#define inner_shake256_context shake256incctx -#define inner_shake256_init(sc) shake256_inc_init(sc) -#define inner_shake256_inject(sc, in, len) shake256_inc_absorb(sc, in, len) -#define inner_shake256_flip(sc) shake256_inc_finalize(sc) -#define inner_shake256_extract(sc, out, len) shake256_inc_squeeze(out, len, sc) - -// yyyPQCLEAN+0 - -// yyyPQCLEAN- - -/* ==================================================================== */ -/* - * Encoding/decoding functions (codec.c). - * - * Encoding functions take as parameters an output buffer (out) with - * a given maximum length (max_out_len); returned value is the actual - * number of bytes which have been written. If the output buffer is - * not large enough, then 0 is returned (some bytes may have been - * written to the buffer). If 'out' is NULL, then 'max_out_len' is - * ignored; instead, the function computes and returns the actual - * required output length (in bytes). - * - * Decoding functions take as parameters an input buffer (in) with - * its maximum length (max_in_len); returned value is the actual number - * of bytes that have been read from the buffer. If the provided length - * is too short, then 0 is returned. - * - * Values to encode or decode are vectors of integers, with N = 2^logn - * elements. 
- * - * Three encoding formats are defined: - * - * - modq: sequence of values modulo 12289, each encoded over exactly - * 14 bits. The encoder and decoder verify that integers are within - * the valid range (0..12288). Values are arrays of uint16. - * - * - trim: sequence of signed integers, a specified number of bits - * each. The number of bits is provided as parameter and includes - * the sign bit. Each integer x must be such that |x| < 2^(bits-1) - * (which means that the -2^(bits-1) value is forbidden); encode and - * decode functions check that property. Values are arrays of - * int16_t or int8_t, corresponding to names 'trim_i16' and - * 'trim_i8', respectively. - * - * - comp: variable-length encoding for signed integers; each integer - * uses a minimum of 9 bits, possibly more. This is normally used - * only for signatures. - * - */ - -size_t Zf(modq_encode)(void *out, size_t max_out_len, - const uint16_t *x, unsigned logn); -size_t Zf(trim_i16_encode)(void *out, size_t max_out_len, - const int16_t *x, unsigned logn, unsigned bits); -size_t Zf(trim_i8_encode)(void *out, size_t max_out_len, - const int8_t *x, unsigned logn, unsigned bits); -size_t Zf(comp_encode)(void *out, size_t max_out_len, - const int16_t *x, unsigned logn); - -size_t Zf(modq_decode)(uint16_t *x, unsigned logn, - const void *in, size_t max_in_len); -size_t Zf(trim_i16_decode)(int16_t *x, unsigned logn, unsigned bits, - const void *in, size_t max_in_len); -size_t Zf(trim_i8_decode)(int8_t *x, unsigned logn, unsigned bits, - const void *in, size_t max_in_len); -size_t Zf(comp_decode)(int16_t *x, unsigned logn, - const void *in, size_t max_in_len); - -/* - * Number of bits for key elements, indexed by logn (1 to 10). This - * is at most 8 bits for all degrees, but some degrees may have shorter - * elements. - */ -extern const uint8_t Zf(max_fg_bits)[]; -extern const uint8_t Zf(max_FG_bits)[]; - -/* - * Maximum size, in bits, of elements in a signature, indexed by logn - * (1 to 10). 
The size includes the sign bit. - */ -extern const uint8_t Zf(max_sig_bits)[]; - -/* ==================================================================== */ -/* - * Support functions used for both signature generation and signature - * verification (common.c). - */ - -/* - * From a SHAKE256 context (must be already flipped), produce a new - * point. This is the non-constant-time version, which may leak enough - * information to serve as a stop condition on a brute force attack on - * the hashed message (provided that the nonce value is known). - */ -void Zf(hash_to_point_vartime)(inner_shake256_context *sc, - uint16_t *x, unsigned logn); - -/* - * From a SHAKE256 context (must be already flipped), produce a new - * point. The temporary buffer (tmp) must have room for 2*2^logn bytes. - * This function is constant-time but is typically more expensive than - * Zf(hash_to_point_vartime)(). - * - * tmp[] must have 16-bit alignment. - */ -void Zf(hash_to_point_ct)(inner_shake256_context *sc, - uint16_t *x, unsigned logn, uint8_t *tmp); - -/* - * Tell whether a given vector (2N coordinates, in two halves) is - * acceptable as a signature. This compares the appropriate norm of the - * vector with the acceptance bound. Returned value is 1 on success - * (vector is short enough to be acceptable), 0 otherwise. - */ -int Zf(is_short)(const int16_t *s1, const int16_t *s2, unsigned logn); - -/* - * Tell whether a given vector (2N coordinates, in two halves) is - * acceptable as a signature. Instead of the first half s1, this - * function receives the "saturated squared norm" of s1, i.e. the - * sum of the squares of the coordinates of s1 (saturated at 2^32-1 - * if the sum exceeds 2^31-1). - * - * Returned value is 1 on success (vector is short enough to be - * acceptable), 0 otherwise. 
- */ -int Zf(is_short_half)(uint32_t sqn, const int16_t *s2, unsigned logn); - -/* ==================================================================== */ -/* - * Signature verification functions (vrfy.c). - */ - -/* - * Convert a public key to NTT + Montgomery format. Conversion is done - * in place. - */ -void Zf(to_ntt_monty)(uint16_t *h, unsigned logn); - -/* - * Internal signature verification code: - * c0[] contains the hashed nonce+message - * s2[] is the decoded signature - * h[] contains the public key, in NTT + Montgomery format - * logn is the degree log - * tmp[] temporary, must have at least 2*2^logn bytes - * Returned value is 1 on success, 0 on error. - * - * tmp[] must have 16-bit alignment. - */ -int Zf(verify_raw)(const uint16_t *c0, const int16_t *s2, - const uint16_t *h, unsigned logn, uint8_t *tmp); - -/* - * Compute the public key h[], given the private key elements f[] and - * g[]. This computes h = g/f mod phi mod q, where phi is the polynomial - * modulus. This function returns 1 on success, 0 on error (an error is - * reported if f is not invertible mod phi mod q). - * - * The tmp[] array must have room for at least 2*2^logn elements. - * tmp[] must have 16-bit alignment. - */ -int Zf(compute_public)(uint16_t *h, - const int8_t *f, const int8_t *g, unsigned logn, uint8_t *tmp); - -/* - * Recompute the fourth private key element. Private key consists in - * four polynomials with small coefficients f, g, F and G, which are - * such that fG - gF = q mod phi; furthermore, f is invertible modulo - * phi and modulo q. This function recomputes G from f, g and F. - * - * The tmp[] array must have room for at least 4*2^logn bytes. - * - * Returned value is 1 in success, 0 on error (f not invertible). - * tmp[] must have 16-bit alignment. - */ -int Zf(complete_private)(int8_t *G, - const int8_t *f, const int8_t *g, const int8_t *F, - unsigned logn, uint8_t *tmp); - -/* - * Test whether a given polynomial is invertible modulo phi and q. 
- * Polynomial coefficients are small integers. - * - * tmp[] must have 16-bit alignment. - */ -int Zf(is_invertible)( - const int16_t *s2, unsigned logn, uint8_t *tmp); - -/* - * Count the number of elements of value zero in the NTT representation - * of the given polynomial: this is the number of primitive 2n-th roots - * of unity (modulo q = 12289) that are roots of the provided polynomial - * (taken modulo q). - * - * tmp[] must have 16-bit alignment. - */ -int Zf(count_nttzero)(const int16_t *sig, unsigned logn, uint8_t *tmp); - -/* - * Internal signature verification with public key recovery: - * h[] receives the public key (NOT in NTT/Montgomery format) - * c0[] contains the hashed nonce+message - * s1[] is the first signature half - * s2[] is the second signature half - * logn is the degree log - * tmp[] temporary, must have at least 2*2^logn bytes - * Returned value is 1 on success, 0 on error. Success is returned if - * the signature is a short enough vector; in that case, the public - * key has been written to h[]. However, the caller must still - * verify that h[] is the correct value (e.g. with regards to a known - * hash of the public key). - * - * h[] may not overlap with any of the other arrays. - * - * tmp[] must have 16-bit alignment. - */ -int Zf(verify_recover)(uint16_t *h, - const uint16_t *c0, const int16_t *s1, const int16_t *s2, - unsigned logn, uint8_t *tmp); - -/* ==================================================================== */ -/* - * Implementation of floating-point real numbers (fpr.h, fpr.c). - */ - -/* - * Real numbers are implemented by an extra header file, included below. - * This is meant to support pluggable implementations. The default - * implementation relies on the C type 'double'. 
- * - * The included file must define the following types, functions and - * constants: - * - * fpr - * type for a real number - * - * fpr fpr_of(int64_t i) - * cast an integer into a real number; source must be in the - * -(2^63-1)..+(2^63-1) range - * - * fpr fpr_scaled(int64_t i, int sc) - * compute i*2^sc as a real number; source 'i' must be in the - * -(2^63-1)..+(2^63-1) range - * - * fpr fpr_ldexp(fpr x, int e) - * compute x*2^e - * - * int64_t fpr_rint(fpr x) - * round x to the nearest integer; x must be in the -(2^63-1) - * to +(2^63-1) range - * - * int64_t fpr_trunc(fpr x) - * round to an integer; this rounds towards zero; value must - * be in the -(2^63-1) to +(2^63-1) range - * - * fpr fpr_add(fpr x, fpr y) - * compute x + y - * - * fpr fpr_sub(fpr x, fpr y) - * compute x - y - * - * fpr fpr_neg(fpr x) - * compute -x - * - * fpr fpr_half(fpr x) - * compute x/2 - * - * fpr fpr_double(fpr x) - * compute x*2 - * - * fpr fpr_mul(fpr x, fpr y) - * compute x * y - * - * fpr fpr_sqr(fpr x) - * compute x * x - * - * fpr fpr_inv(fpr x) - * compute 1/x - * - * fpr fpr_div(fpr x, fpr y) - * compute x/y - * - * fpr fpr_sqrt(fpr x) - * compute the square root of x - * - * int fpr_lt(fpr x, fpr y) - * return 1 if x < y, 0 otherwise - * - * uint64_t fpr_expm_p63(fpr x) - * return exp(x), assuming that 0 <= x < log(2). Returned value - * is scaled to 63 bits (i.e. it really returns 2^63*exp(-x), - * rounded to the nearest integer). Computation should have a - * precision of at least 45 bits. 
- * - * const fpr fpr_gm_tab[] - * array of constants for FFT / iFFT - * - * const fpr fpr_p2_tab[] - * precomputed powers of 2 (by index, 0 to 10) - * - * Constants of type 'fpr': - * - * fpr fpr_q 12289 - * fpr fpr_inverse_of_q 1/12289 - * fpr fpr_inv_2sqrsigma0 1/(2*(1.8205^2)) - * fpr fpr_inv_sigma 1/(1.55*sqrt(12289)) - * fpr fpr_sigma_min_9 1.291500756233514568549480827642 - * fpr fpr_sigma_min_10 1.311734375905083682667395805765 - * fpr fpr_log2 log(2) - * fpr fpr_inv_log2 1/log(2) - * fpr fpr_bnorm_max 16822.4121 - * fpr fpr_zero 0 - * fpr fpr_one 1 - * fpr fpr_two 2 - * fpr fpr_onehalf 0.5 - * fpr fpr_ptwo31 2^31 - * fpr fpr_ptwo31m1 2^31-1 - * fpr fpr_mtwo31m1 -(2^31-1) - * fpr fpr_ptwo63m1 2^63-1 - * fpr fpr_mtwo63m1 -(2^63-1) - * fpr fpr_ptwo63 2^63 - */ -#include "fpr.h" - -/* ==================================================================== */ -/* - * RNG (rng.c). - * - * A PRNG based on ChaCha20 is implemented; it is seeded from a SHAKE256 - * context (flipped) and is used for bulk pseudorandom generation. - * A system-dependent seed generator is also provided. - */ - -/* - * Obtain a random seed from the system RNG. - * - * Returned value is 1 on success, 0 on error. - */ -int Zf(get_seed)(void *seed, size_t seed_len); - -/* - * Structure for a PRNG. This includes a large buffer so that values - * get generated in advance. The 'state' is used to keep the current - * PRNG algorithm state (contents depend on the selected algorithm). - * - * The unions with 'dummy_u64' are there to ensure proper alignment for - * 64-bit direct access. - */ -typedef struct { - union { - uint8_t d[512]; /* MUST be 512, exactly */ - uint64_t dummy_u64; - } buf; - size_t ptr; - union { - uint8_t d[256]; - uint64_t dummy_u64; - } state; - int type; -} prng; - -/* - * Instantiate a PRNG. That PRNG will feed over the provided SHAKE256 - * context (in "flipped" state) to obtain its initial state. 
- */ -void Zf(prng_init)(prng *p, inner_shake256_context *src); - -/* - * Refill the PRNG buffer. This is normally invoked automatically, and - * is declared here only so that prng_get_u64() may be inlined. - */ -void Zf(prng_refill)(prng *p); - -/* - * Get some bytes from a PRNG. - */ -void Zf(prng_get_bytes)(prng *p, void *dst, size_t len); - -/* - * Get a 64-bit random value from a PRNG. - */ -static inline uint64_t -prng_get_u64(prng *p) -{ - size_t u; - - /* - * If there are less than 9 bytes in the buffer, we refill it. - * This means that we may drop the last few bytes, but this allows - * for faster extraction code. Also, it means that we never leave - * an empty buffer. - */ - u = p->ptr; - if (u >= (sizeof p->buf.d) - 9) { - Zf(prng_refill)(p); - u = 0; - } - p->ptr = u + 8; - - /* - * On systems that use little-endian encoding and allow - * unaligned accesses, we can simply read the data where it is. - */ -#if FALCON_LE && FALCON_UNALIGNED // yyyLEU+1 - return *(uint64_t *)(p->buf.d + u); -#else // yyyLEU+0 - return (uint64_t)p->buf.d[u + 0] - | ((uint64_t)p->buf.d[u + 1] << 8) - | ((uint64_t)p->buf.d[u + 2] << 16) - | ((uint64_t)p->buf.d[u + 3] << 24) - | ((uint64_t)p->buf.d[u + 4] << 32) - | ((uint64_t)p->buf.d[u + 5] << 40) - | ((uint64_t)p->buf.d[u + 6] << 48) - | ((uint64_t)p->buf.d[u + 7] << 56); -#endif // yyyLEU- -} - -/* - * Get an 8-bit random value from a PRNG. - */ -static inline unsigned -prng_get_u8(prng *p) -{ - unsigned v; - - v = p->buf.d[p->ptr ++]; - if (p->ptr == sizeof p->buf.d) { - Zf(prng_refill)(p); - } - return v; -} - -/* ==================================================================== */ -/* - * FFT (falcon-fft.c). - * - * A real polynomial is represented as an array of N 'fpr' elements. - * The FFT representation of a real polynomial contains N/2 complex - * elements; each is stored as two real numbers, for the real and - * imaginary parts, respectively. See falcon-fft.c for details on the - * internal representation. 
- */ - -/* - * Compute FFT in-place: the source array should contain a real - * polynomial (N coefficients); its storage area is reused to store - * the FFT representation of that polynomial (N/2 complex numbers). - * - * 'logn' MUST lie between 1 and 10 (inclusive). - */ -void Zf(FFT)(fpr *f, unsigned logn); - -/* - * Compute the inverse FFT in-place: the source array should contain the - * FFT representation of a real polynomial (N/2 elements); the resulting - * real polynomial (N coefficients of type 'fpr') is written over the - * array. - * - * 'logn' MUST lie between 1 and 10 (inclusive). - */ -void Zf(iFFT)(fpr *f, unsigned logn); - -/* - * Add polynomial b to polynomial a. a and b MUST NOT overlap. This - * function works in both normal and FFT representations. - */ -void Zf(poly_add)(fpr *restrict a, const fpr *restrict b, unsigned logn); - -/* - * Subtract polynomial b from polynomial a. a and b MUST NOT overlap. This - * function works in both normal and FFT representations. - */ -void Zf(poly_sub)(fpr *restrict a, const fpr *restrict b, unsigned logn); - -/* - * Negate polynomial a. This function works in both normal and FFT - * representations. - */ -void Zf(poly_neg)(fpr *a, unsigned logn); - -/* - * Compute adjoint of polynomial a. This function works only in FFT - * representation. - */ -void Zf(poly_adj_fft)(fpr *a, unsigned logn); - -/* - * Multiply polynomial a with polynomial b. a and b MUST NOT overlap. - * This function works only in FFT representation. - */ -void Zf(poly_mul_fft)(fpr *restrict a, const fpr *restrict b, unsigned logn); - -/* - * Multiply polynomial a with the adjoint of polynomial b. a and b MUST NOT - * overlap. This function works only in FFT representation. - */ -void Zf(poly_muladj_fft)(fpr *restrict a, const fpr *restrict b, unsigned logn); - -/* - * Multiply polynomial with its own adjoint. This function works only in FFT - * representation. 
- */ -void Zf(poly_mulselfadj_fft)(fpr *a, unsigned logn); - -/* - * Multiply polynomial with a real constant. This function works in both - * normal and FFT representations. - */ -void Zf(poly_mulconst)(fpr *a, fpr x, unsigned logn); - -/* - * Divide polynomial a by polynomial b, modulo X^N+1 (FFT representation). - * a and b MUST NOT overlap. - */ -void Zf(poly_div_fft)(fpr *restrict a, const fpr *restrict b, unsigned logn); - -/* - * Given f and g (in FFT representation), compute 1/(f*adj(f)+g*adj(g)) - * (also in FFT representation). Since the result is auto-adjoint, all its - * coordinates in FFT representation are real; as such, only the first N/2 - * values of d[] are filled (the imaginary parts are skipped). - * - * Array d MUST NOT overlap with either a or b. - */ -void Zf(poly_invnorm2_fft)(fpr *restrict d, - const fpr *restrict a, const fpr *restrict b, unsigned logn); - -/* - * Given F, G, f and g (in FFT representation), compute F*adj(f)+G*adj(g) - * (also in FFT representation). Destination d MUST NOT overlap with - * any of the source arrays. - */ -void Zf(poly_add_muladj_fft)(fpr *restrict d, - const fpr *restrict F, const fpr *restrict G, - const fpr *restrict f, const fpr *restrict g, unsigned logn); - -/* - * Multiply polynomial a by polynomial b, where b is autoadjoint. Both - * a and b are in FFT representation. Since b is autoadjoint, all its - * FFT coefficients are real, and the array b contains only N/2 elements. - * a and b MUST NOT overlap. - */ -void Zf(poly_mul_autoadj_fft)(fpr *restrict a, - const fpr *restrict b, unsigned logn); - -/* - * Divide polynomial a by polynomial b, where b is autoadjoint. Both - * a and b are in FFT representation. Since b is autoadjoint, all its - * FFT coefficients are real, and the array b contains only N/2 elements. - * a and b MUST NOT overlap. 
- */ -void Zf(poly_div_autoadj_fft)(fpr *restrict a, - const fpr *restrict b, unsigned logn); - -/* - * Perform an LDL decomposition of an auto-adjoint matrix G, in FFT - * representation. On input, g00, g01 and g11 are provided (where the - * matrix G = [[g00, g01], [adj(g01), g11]]). On output, the d00, l10 - * and d11 values are written in g00, g01 and g11, respectively - * (with D = [[d00, 0], [0, d11]] and L = [[1, 0], [l10, 1]]). - * (In fact, d00 = g00, so the g00 operand is left unmodified.) - */ -void Zf(poly_LDL_fft)(const fpr *restrict g00, - fpr *restrict g01, fpr *restrict g11, unsigned logn); - -/* - * Perform an LDL decomposition of an auto-adjoint matrix G, in FFT - * representation. This is identical to poly_LDL_fft() except that - * g00, g01 and g11 are unmodified; the outputs d11 and l10 are written - * in two other separate buffers provided as extra parameters. - */ -void Zf(poly_LDLmv_fft)(fpr *restrict d11, fpr *restrict l10, - const fpr *restrict g00, const fpr *restrict g01, - const fpr *restrict g11, unsigned logn); - -/* - * Apply "split" operation on a polynomial in FFT representation: - * f = f0(x^2) + x*f1(x^2), for half-size polynomials f0 and f1 - * (polynomials modulo X^(N/2)+1). f0, f1 and f MUST NOT overlap. - */ -void Zf(poly_split_fft)(fpr *restrict f0, fpr *restrict f1, - const fpr *restrict f, unsigned logn); - -/* - * Apply "merge" operation on two polynomials in FFT representation: - * given f0 and f1, polynomials moduo X^(N/2)+1, this function computes - * f = f0(x^2) + x*f1(x^2), in FFT representation modulo X^N+1. - * f MUST NOT overlap with either f0 or f1. - */ -void Zf(poly_merge_fft)(fpr *restrict f, - const fpr *restrict f0, const fpr *restrict f1, unsigned logn); - -/* ==================================================================== */ -/* - * Key pair generation. - */ - -/* - * Required sizes of the temporary buffer (in bytes). 
- * - * This size is 28*2^logn bytes, except for degrees 2 and 4 (logn = 1 - * or 2) where it is slightly greater. - */ -#define FALCON_KEYGEN_TEMP_1 136 -#define FALCON_KEYGEN_TEMP_2 272 -#define FALCON_KEYGEN_TEMP_3 224 -#define FALCON_KEYGEN_TEMP_4 448 -#define FALCON_KEYGEN_TEMP_5 896 -#define FALCON_KEYGEN_TEMP_6 1792 -#define FALCON_KEYGEN_TEMP_7 3584 -#define FALCON_KEYGEN_TEMP_8 7168 -#define FALCON_KEYGEN_TEMP_9 14336 -#define FALCON_KEYGEN_TEMP_10 28672 - -/* - * Generate a new key pair. Randomness is extracted from the provided - * SHAKE256 context, which must have already been seeded and flipped. - * The tmp[] array must have suitable size (see FALCON_KEYGEN_TEMP_* - * macros) and be aligned for the uint32_t, uint64_t and fpr types. - * - * The private key elements are written in f, g, F and G, and the - * public key is written in h. Either or both of G and h may be NULL, - * in which case the corresponding element is not returned (they can - * be recomputed from f, g and F). - * - * tmp[] must have 64-bit alignment. - * This function uses floating-point rounding (see set_fpu_cw()). - */ -void Zf(keygen)(inner_shake256_context *rng, - int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h, - unsigned logn, uint8_t *tmp); - -/* ==================================================================== */ -/* - * Signature generation. - */ - -/* - * Expand a private key into the B0 matrix in FFT representation and - * the LDL tree. All the values are written in 'expanded_key', for - * a total of (8*logn+40)*2^logn bytes. - * - * The tmp[] array must have room for at least 48*2^logn bytes. - * - * tmp[] must have 64-bit alignment. - * This function uses floating-point rounding (see set_fpu_cw()). 
- */ -void Zf(expand_privkey)(fpr *restrict expanded_key, - const int8_t *f, const int8_t *g, const int8_t *F, const int8_t *G, - unsigned logn, uint8_t *restrict tmp); - -/* - * Compute a signature over the provided hashed message (hm); the - * signature value is one short vector. This function uses an - * expanded key (as generated by Zf(expand_privkey)()). - * - * The sig[] and hm[] buffers may overlap. - * - * On successful output, the start of the tmp[] buffer contains the s1 - * vector (as int16_t elements). - * - * The minimal size (in bytes) of tmp[] is 48*2^logn bytes. - * - * tmp[] must have 64-bit alignment. - * This function uses floating-point rounding (see set_fpu_cw()). - */ -void Zf(sign_tree)(int16_t *sig, inner_shake256_context *rng, - const fpr *restrict expanded_key, - const uint16_t *hm, unsigned logn, uint8_t *tmp); - -/* - * Compute a signature over the provided hashed message (hm); the - * signature value is one short vector. This function uses a raw - * key and dynamically recompute the B0 matrix and LDL tree; this - * saves RAM since there is no needed for an expanded key, but - * increases the signature cost. - * - * The sig[] and hm[] buffers may overlap. - * - * On successful output, the start of the tmp[] buffer contains the s1 - * vector (as int16_t elements). - * - * The minimal size (in bytes) of tmp[] is 72*2^logn bytes. - * - * tmp[] must have 64-bit alignment. - * This function uses floating-point rounding (see set_fpu_cw()). - */ -void Zf(sign_dyn)(int16_t *sig, inner_shake256_context *rng, - const int8_t *restrict f, const int8_t *restrict g, - const int8_t *restrict F, const int8_t *restrict G, - const uint16_t *hm, unsigned logn, uint8_t *tmp); - -/* - * Internal sampler engine. Exported for tests. - * - * sampler_context wraps around a source of random numbers (PRNG) and - * the sigma_min value (nominally dependent on the degree). 
- * - * sampler() takes as parameters: - * ctx pointer to the sampler_context structure - * mu center for the distribution - * isigma inverse of the distribution standard deviation - * It returns an integer sampled along the Gaussian distribution centered - * on mu and of standard deviation sigma = 1/isigma. - * - * gaussian0_sampler() takes as parameter a pointer to a PRNG, and - * returns an integer sampled along a half-Gaussian with standard - * deviation sigma0 = 1.8205 (center is 0, returned value is - * nonnegative). - */ - -typedef struct { - prng p; - fpr sigma_min; -} sampler_context; - -TARGET_AVX2 -int Zf(sampler)(void *ctx, fpr mu, fpr isigma); - -TARGET_AVX2 -int Zf(gaussian0_sampler)(prng *p); - -/* ==================================================================== */ - -#endif diff --git a/crypto_sign/falcon-512-tree/m4-ct/keygen.c b/crypto_sign/falcon-512-tree/m4-ct/keygen.c deleted file mode 100644 index cf7de008..00000000 --- a/crypto_sign/falcon-512-tree/m4-ct/keygen.c +++ /dev/null @@ -1,4301 +0,0 @@ -/* - * Falcon key pair generation. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include "inner.h" - -#define MKN(logn) ((size_t)1 << (logn)) - -/* ==================================================================== */ -/* - * Modular arithmetics. - * - * We implement a few functions for computing modulo a small integer p. - * - * All functions require that 2^30 < p < 2^31. Moreover, operands must - * be in the 0..p-1 range. - * - * Modular addition and subtraction work for all such p. - * - * Montgomery multiplication requires that p is odd, and must be provided - * with an additional value p0i = -1/p mod 2^31. See below for some basics - * on Montgomery multiplication. - * - * Division computes an inverse modulo p by an exponentiation (with - * exponent p-2): this works only if p is prime. Multiplication - * requirements also apply, i.e. p must be odd and p0i must be provided. - * - * The NTT and inverse NTT need all of the above, and also that - * p = 1 mod 2048. - * - * ----------------------------------------------------------------------- - * - * We use Montgomery representation with 31-bit values: - * - * Let R = 2^31 mod p. When 2^30 < p < 2^31, R = 2^31 - p. - * Montgomery representation of an integer x modulo p is x*R mod p. - * - * Montgomery multiplication computes (x*y)/R mod p for - * operands x and y. 
Therefore: - * - * - if operands are x*R and y*R (Montgomery representations of x and - * y), then Montgomery multiplication computes (x*R*y*R)/R = (x*y)*R - * mod p, which is the Montgomery representation of the product x*y; - * - * - if operands are x*R and y (or x and y*R), then Montgomery - * multiplication returns x*y mod p: mixed-representation - * multiplications yield results in normal representation. - * - * To convert to Montgomery representation, we multiply by R, which is done - * by Montgomery-multiplying by R^2. Stand-alone conversion back from - * Montgomery representation is Montgomery-multiplication by 1. - */ - -/* - * Precomputed small primes. Each element contains the following: - * - * p The prime itself. - * - * g A primitive root of phi = X^N+1 (in field Z_p). - * - * s The inverse of the product of all previous primes in the array, - * computed modulo p and in Montgomery representation. - * - * All primes are such that p = 1 mod 2048, and are lower than 2^31. They - * are listed in decreasing order. 
- */ - -typedef struct { - uint32_t p; - uint32_t g; - uint32_t s; -} small_prime; - -static const small_prime PRIMES[] = { - { 2147473409, 383167813, 10239 }, - { 2147389441, 211808905, 471403745 }, - { 2147387393, 37672282, 1329335065 }, - { 2147377153, 1977035326, 968223422 }, - { 2147358721, 1067163706, 132460015 }, - { 2147352577, 1606082042, 598693809 }, - { 2147346433, 2033915641, 1056257184 }, - { 2147338241, 1653770625, 421286710 }, - { 2147309569, 631200819, 1111201074 }, - { 2147297281, 2038364663, 1042003613 }, - { 2147295233, 1962540515, 19440033 }, - { 2147239937, 2100082663, 353296760 }, - { 2147235841, 1991153006, 1703918027 }, - { 2147217409, 516405114, 1258919613 }, - { 2147205121, 409347988, 1089726929 }, - { 2147196929, 927788991, 1946238668 }, - { 2147178497, 1136922411, 1347028164 }, - { 2147100673, 868626236, 701164723 }, - { 2147082241, 1897279176, 617820870 }, - { 2147074049, 1888819123, 158382189 }, - { 2147051521, 25006327, 522758543 }, - { 2147043329, 327546255, 37227845 }, - { 2147039233, 766324424, 1133356428 }, - { 2146988033, 1862817362, 73861329 }, - { 2146963457, 404622040, 653019435 }, - { 2146959361, 1936581214, 995143093 }, - { 2146938881, 1559770096, 634921513 }, - { 2146908161, 422623708, 1985060172 }, - { 2146885633, 1751189170, 298238186 }, - { 2146871297, 578919515, 291810829 }, - { 2146846721, 1114060353, 915902322 }, - { 2146834433, 2069565474, 47859524 }, - { 2146818049, 1552824584, 646281055 }, - { 2146775041, 1906267847, 1597832891 }, - { 2146756609, 1847414714, 1228090888 }, - { 2146744321, 1818792070, 1176377637 }, - { 2146738177, 1118066398, 1054971214 }, - { 2146736129, 52057278, 933422153 }, - { 2146713601, 592259376, 1406621510 }, - { 2146695169, 263161877, 1514178701 }, - { 2146656257, 685363115, 384505091 }, - { 2146650113, 927727032, 537575289 }, - { 2146646017, 52575506, 1799464037 }, - { 2146643969, 1276803876, 1348954416 }, - { 2146603009, 814028633, 1521547704 }, - { 2146572289, 1846678872, 1310832121 }, - 
{ 2146547713, 919368090, 1019041349 }, - { 2146508801, 671847612, 38582496 }, - { 2146492417, 283911680, 532424562 }, - { 2146490369, 1780044827, 896447978 }, - { 2146459649, 327980850, 1327906900 }, - { 2146447361, 1310561493, 958645253 }, - { 2146441217, 412148926, 287271128 }, - { 2146437121, 293186449, 2009822534 }, - { 2146430977, 179034356, 1359155584 }, - { 2146418689, 1517345488, 1790248672 }, - { 2146406401, 1615820390, 1584833571 }, - { 2146404353, 826651445, 607120498 }, - { 2146379777, 3816988, 1897049071 }, - { 2146363393, 1221409784, 1986921567 }, - { 2146355201, 1388081168, 849968120 }, - { 2146336769, 1803473237, 1655544036 }, - { 2146312193, 1023484977, 273671831 }, - { 2146293761, 1074591448, 467406983 }, - { 2146283521, 831604668, 1523950494 }, - { 2146203649, 712865423, 1170834574 }, - { 2146154497, 1764991362, 1064856763 }, - { 2146142209, 627386213, 1406840151 }, - { 2146127873, 1638674429, 2088393537 }, - { 2146099201, 1516001018, 690673370 }, - { 2146093057, 1294931393, 315136610 }, - { 2146091009, 1942399533, 973539425 }, - { 2146078721, 1843461814, 2132275436 }, - { 2146060289, 1098740778, 360423481 }, - { 2146048001, 1617213232, 1951981294 }, - { 2146041857, 1805783169, 2075683489 }, - { 2146019329, 272027909, 1753219918 }, - { 2145986561, 1206530344, 2034028118 }, - { 2145976321, 1243769360, 1173377644 }, - { 2145964033, 887200839, 1281344586 }, - { 2145906689, 1651026455, 906178216 }, - { 2145875969, 1673238256, 1043521212 }, - { 2145871873, 1226591210, 1399796492 }, - { 2145841153, 1465353397, 1324527802 }, - { 2145832961, 1150638905, 554084759 }, - { 2145816577, 221601706, 427340863 }, - { 2145785857, 608896761, 316590738 }, - { 2145755137, 1712054942, 1684294304 }, - { 2145742849, 1302302867, 724873116 }, - { 2145728513, 516717693, 431671476 }, - { 2145699841, 524575579, 1619722537 }, - { 2145691649, 1925625239, 982974435 }, - { 2145687553, 463795662, 1293154300 }, - { 2145673217, 771716636, 881778029 }, - { 2145630209, 1509556977, 
837364988 }, - { 2145595393, 229091856, 851648427 }, - { 2145587201, 1796903241, 635342424 }, - { 2145525761, 715310882, 1677228081 }, - { 2145495041, 1040930522, 200685896 }, - { 2145466369, 949804237, 1809146322 }, - { 2145445889, 1673903706, 95316881 }, - { 2145390593, 806941852, 1428671135 }, - { 2145372161, 1402525292, 159350694 }, - { 2145361921, 2124760298, 1589134749 }, - { 2145359873, 1217503067, 1561543010 }, - { 2145355777, 338341402, 83865711 }, - { 2145343489, 1381532164, 641430002 }, - { 2145325057, 1883895478, 1528469895 }, - { 2145318913, 1335370424, 65809740 }, - { 2145312769, 2000008042, 1919775760 }, - { 2145300481, 961450962, 1229540578 }, - { 2145282049, 910466767, 1964062701 }, - { 2145232897, 816527501, 450152063 }, - { 2145218561, 1435128058, 1794509700 }, - { 2145187841, 33505311, 1272467582 }, - { 2145181697, 269767433, 1380363849 }, - { 2145175553, 56386299, 1316870546 }, - { 2145079297, 2106880293, 1391797340 }, - { 2145021953, 1347906152, 720510798 }, - { 2145015809, 206769262, 1651459955 }, - { 2145003521, 1885513236, 1393381284 }, - { 2144960513, 1810381315, 31937275 }, - { 2144944129, 1306487838, 2019419520 }, - { 2144935937, 37304730, 1841489054 }, - { 2144894977, 1601434616, 157985831 }, - { 2144888833, 98749330, 2128592228 }, - { 2144880641, 1772327002, 2076128344 }, - { 2144864257, 1404514762, 2029969964 }, - { 2144827393, 801236594, 406627220 }, - { 2144806913, 349217443, 1501080290 }, - { 2144796673, 1542656776, 2084736519 }, - { 2144778241, 1210734884, 1746416203 }, - { 2144759809, 1146598851, 716464489 }, - { 2144757761, 286328400, 1823728177 }, - { 2144729089, 1347555695, 1836644881 }, - { 2144727041, 1795703790, 520296412 }, - { 2144696321, 1302475157, 852964281 }, - { 2144667649, 1075877614, 504992927 }, - { 2144573441, 198765808, 1617144982 }, - { 2144555009, 321528767, 155821259 }, - { 2144550913, 814139516, 1819937644 }, - { 2144536577, 571143206, 962942255 }, - { 2144524289, 1746733766, 2471321 }, - { 2144512001, 
1821415077, 124190939 }, - { 2144468993, 917871546, 1260072806 }, - { 2144458753, 378417981, 1569240563 }, - { 2144421889, 175229668, 1825620763 }, - { 2144409601, 1699216963, 351648117 }, - { 2144370689, 1071885991, 958186029 }, - { 2144348161, 1763151227, 540353574 }, - { 2144335873, 1060214804, 919598847 }, - { 2144329729, 663515846, 1448552668 }, - { 2144327681, 1057776305, 590222840 }, - { 2144309249, 1705149168, 1459294624 }, - { 2144296961, 325823721, 1649016934 }, - { 2144290817, 738775789, 447427206 }, - { 2144243713, 962347618, 893050215 }, - { 2144237569, 1655257077, 900860862 }, - { 2144161793, 242206694, 1567868672 }, - { 2144155649, 769415308, 1247993134 }, - { 2144137217, 320492023, 515841070 }, - { 2144120833, 1639388522, 770877302 }, - { 2144071681, 1761785233, 964296120 }, - { 2144065537, 419817825, 204564472 }, - { 2144028673, 666050597, 2091019760 }, - { 2144010241, 1413657615, 1518702610 }, - { 2143952897, 1238327946, 475672271 }, - { 2143940609, 307063413, 1176750846 }, - { 2143918081, 2062905559, 786785803 }, - { 2143899649, 1338112849, 1562292083 }, - { 2143891457, 68149545, 87166451 }, - { 2143885313, 921750778, 394460854 }, - { 2143854593, 719766593, 133877196 }, - { 2143836161, 1149399850, 1861591875 }, - { 2143762433, 1848739366, 1335934145 }, - { 2143756289, 1326674710, 102999236 }, - { 2143713281, 808061791, 1156900308 }, - { 2143690753, 388399459, 1926468019 }, - { 2143670273, 1427891374, 1756689401 }, - { 2143666177, 1912173949, 986629565 }, - { 2143645697, 2041160111, 371842865 }, - { 2143641601, 1279906897, 2023974350 }, - { 2143635457, 720473174, 1389027526 }, - { 2143621121, 1298309455, 1732632006 }, - { 2143598593, 1548762216, 1825417506 }, - { 2143567873, 620475784, 1073787233 }, - { 2143561729, 1932954575, 949167309 }, - { 2143553537, 354315656, 1652037534 }, - { 2143541249, 577424288, 1097027618 }, - { 2143531009, 357862822, 478640055 }, - { 2143522817, 2017706025, 1550531668 }, - { 2143506433, 2078127419, 1824320165 }, - { 
2143488001, 613475285, 1604011510 }, - { 2143469569, 1466594987, 502095196 }, - { 2143426561, 1115430331, 1044637111 }, - { 2143383553, 9778045, 1902463734 }, - { 2143377409, 1557401276, 2056861771 }, - { 2143363073, 652036455, 1965915971 }, - { 2143260673, 1464581171, 1523257541 }, - { 2143246337, 1876119649, 764541916 }, - { 2143209473, 1614992673, 1920672844 }, - { 2143203329, 981052047, 2049774209 }, - { 2143160321, 1847355533, 728535665 }, - { 2143129601, 965558457, 603052992 }, - { 2143123457, 2140817191, 8348679 }, - { 2143100929, 1547263683, 694209023 }, - { 2143092737, 643459066, 1979934533 }, - { 2143082497, 188603778, 2026175670 }, - { 2143062017, 1657329695, 377451099 }, - { 2143051777, 114967950, 979255473 }, - { 2143025153, 1698431342, 1449196896 }, - { 2143006721, 1862741675, 1739650365 }, - { 2142996481, 756660457, 996160050 }, - { 2142976001, 927864010, 1166847574 }, - { 2142965761, 905070557, 661974566 }, - { 2142916609, 40932754, 1787161127 }, - { 2142892033, 1987985648, 675335382 }, - { 2142885889, 797497211, 1323096997 }, - { 2142871553, 2068025830, 1411877159 }, - { 2142861313, 1217177090, 1438410687 }, - { 2142830593, 409906375, 1767860634 }, - { 2142803969, 1197788993, 359782919 }, - { 2142785537, 643817365, 513932862 }, - { 2142779393, 1717046338, 218943121 }, - { 2142724097, 89336830, 416687049 }, - { 2142707713, 5944581, 1356813523 }, - { 2142658561, 887942135, 2074011722 }, - { 2142638081, 151851972, 1647339939 }, - { 2142564353, 1691505537, 1483107336 }, - { 2142533633, 1989920200, 1135938817 }, - { 2142529537, 959263126, 1531961857 }, - { 2142527489, 453251129, 1725566162 }, - { 2142502913, 1536028102, 182053257 }, - { 2142498817, 570138730, 701443447 }, - { 2142416897, 326965800, 411931819 }, - { 2142363649, 1675665410, 1517191733 }, - { 2142351361, 968529566, 1575712703 }, - { 2142330881, 1384953238, 1769087884 }, - { 2142314497, 1977173242, 1833745524 }, - { 2142289921, 95082313, 1714775493 }, - { 2142283777, 109377615, 1070584533 
}, - { 2142277633, 16960510, 702157145 }, - { 2142263297, 553850819, 431364395 }, - { 2142208001, 241466367, 2053967982 }, - { 2142164993, 1795661326, 1031836848 }, - { 2142097409, 1212530046, 712772031 }, - { 2142087169, 1763869720, 822276067 }, - { 2142078977, 644065713, 1765268066 }, - { 2142074881, 112671944, 643204925 }, - { 2142044161, 1387785471, 1297890174 }, - { 2142025729, 783885537, 1000425730 }, - { 2142011393, 905662232, 1679401033 }, - { 2141974529, 799788433, 468119557 }, - { 2141943809, 1932544124, 449305555 }, - { 2141933569, 1527403256, 841867925 }, - { 2141931521, 1247076451, 743823916 }, - { 2141902849, 1199660531, 401687910 }, - { 2141890561, 150132350, 1720336972 }, - { 2141857793, 1287438162, 663880489 }, - { 2141833217, 618017731, 1819208266 }, - { 2141820929, 999578638, 1403090096 }, - { 2141786113, 81834325, 1523542501 }, - { 2141771777, 120001928, 463556492 }, - { 2141759489, 122455485, 2124928282 }, - { 2141749249, 141986041, 940339153 }, - { 2141685761, 889088734, 477141499 }, - { 2141673473, 324212681, 1122558298 }, - { 2141669377, 1175806187, 1373818177 }, - { 2141655041, 1113654822, 296887082 }, - { 2141587457, 991103258, 1585913875 }, - { 2141583361, 1401451409, 1802457360 }, - { 2141575169, 1571977166, 712760980 }, - { 2141546497, 1107849376, 1250270109 }, - { 2141515777, 196544219, 356001130 }, - { 2141495297, 1733571506, 1060744866 }, - { 2141483009, 321552363, 1168297026 }, - { 2141458433, 505818251, 733225819 }, - { 2141360129, 1026840098, 948342276 }, - { 2141325313, 945133744, 2129965998 }, - { 2141317121, 1871100260, 1843844634 }, - { 2141286401, 1790639498, 1750465696 }, - { 2141267969, 1376858592, 186160720 }, - { 2141255681, 2129698296, 1876677959 }, - { 2141243393, 2138900688, 1340009628 }, - { 2141214721, 1933049835, 1087819477 }, - { 2141212673, 1898664939, 1786328049 }, - { 2141202433, 990234828, 940682169 }, - { 2141175809, 1406392421, 993089586 }, - { 2141165569, 1263518371, 289019479 }, - { 2141073409, 1485624211, 
507864514 }, - { 2141052929, 1885134788, 311252465 }, - { 2141040641, 1285021247, 280941862 }, - { 2141028353, 1527610374, 375035110 }, - { 2141011969, 1400626168, 164696620 }, - { 2140999681, 632959608, 966175067 }, - { 2140997633, 2045628978, 1290889438 }, - { 2140993537, 1412755491, 375366253 }, - { 2140942337, 719477232, 785367828 }, - { 2140925953, 45224252, 836552317 }, - { 2140917761, 1157376588, 1001839569 }, - { 2140887041, 278480752, 2098732796 }, - { 2140837889, 1663139953, 924094810 }, - { 2140788737, 802501511, 2045368990 }, - { 2140766209, 1820083885, 1800295504 }, - { 2140764161, 1169561905, 2106792035 }, - { 2140696577, 127781498, 1885987531 }, - { 2140684289, 16014477, 1098116827 }, - { 2140653569, 665960598, 1796728247 }, - { 2140594177, 1043085491, 377310938 }, - { 2140579841, 1732838211, 1504505945 }, - { 2140569601, 302071939, 358291016 }, - { 2140567553, 192393733, 1909137143 }, - { 2140557313, 406595731, 1175330270 }, - { 2140549121, 1748850918, 525007007 }, - { 2140477441, 499436566, 1031159814 }, - { 2140469249, 1886004401, 1029951320 }, - { 2140426241, 1483168100, 1676273461 }, - { 2140420097, 1779917297, 846024476 }, - { 2140413953, 522948893, 1816354149 }, - { 2140383233, 1931364473, 1296921241 }, - { 2140366849, 1917356555, 147196204 }, - { 2140354561, 16466177, 1349052107 }, - { 2140348417, 1875366972, 1860485634 }, - { 2140323841, 456498717, 1790256483 }, - { 2140321793, 1629493973, 150031888 }, - { 2140315649, 1904063898, 395510935 }, - { 2140280833, 1784104328, 831417909 }, - { 2140250113, 256087139, 697349101 }, - { 2140229633, 388553070, 243875754 }, - { 2140223489, 747459608, 1396270850 }, - { 2140200961, 507423743, 1895572209 }, - { 2140162049, 580106016, 2045297469 }, - { 2140149761, 712426444, 785217995 }, - { 2140137473, 1441607584, 536866543 }, - { 2140119041, 346538902, 1740434653 }, - { 2140090369, 282642885, 21051094 }, - { 2140076033, 1407456228, 319910029 }, - { 2140047361, 1619330500, 1488632070 }, - { 2140041217, 
2089408064, 2012026134 }, - { 2140008449, 1705524800, 1613440760 }, - { 2139924481, 1846208233, 1280649481 }, - { 2139906049, 989438755, 1185646076 }, - { 2139867137, 1522314850, 372783595 }, - { 2139842561, 1681587377, 216848235 }, - { 2139826177, 2066284988, 1784999464 }, - { 2139824129, 480888214, 1513323027 }, - { 2139789313, 847937200, 858192859 }, - { 2139783169, 1642000434, 1583261448 }, - { 2139770881, 940699589, 179702100 }, - { 2139768833, 315623242, 964612676 }, - { 2139666433, 331649203, 764666914 }, - { 2139641857, 2118730799, 1313764644 }, - { 2139635713, 519149027, 519212449 }, - { 2139598849, 1526413634, 1769667104 }, - { 2139574273, 551148610, 820739925 }, - { 2139568129, 1386800242, 472447405 }, - { 2139549697, 813760130, 1412328531 }, - { 2139537409, 1615286260, 1609362979 }, - { 2139475969, 1352559299, 1696720421 }, - { 2139455489, 1048691649, 1584935400 }, - { 2139432961, 836025845, 950121150 }, - { 2139424769, 1558281165, 1635486858 }, - { 2139406337, 1728402143, 1674423301 }, - { 2139396097, 1727715782, 1483470544 }, - { 2139383809, 1092853491, 1741699084 }, - { 2139369473, 690776899, 1242798709 }, - { 2139351041, 1768782380, 2120712049 }, - { 2139334657, 1739968247, 1427249225 }, - { 2139332609, 1547189119, 623011170 }, - { 2139310081, 1346827917, 1605466350 }, - { 2139303937, 369317948, 828392831 }, - { 2139301889, 1560417239, 1788073219 }, - { 2139283457, 1303121623, 595079358 }, - { 2139248641, 1354555286, 573424177 }, - { 2139240449, 60974056, 885781403 }, - { 2139222017, 355573421, 1221054839 }, - { 2139215873, 566477826, 1724006500 }, - { 2139150337, 871437673, 1609133294 }, - { 2139144193, 1478130914, 1137491905 }, - { 2139117569, 1854880922, 964728507 }, - { 2139076609, 202405335, 756508944 }, - { 2139062273, 1399715741, 884826059 }, - { 2139045889, 1051045798, 1202295476 }, - { 2139033601, 1707715206, 632234634 }, - { 2139006977, 2035853139, 231626690 }, - { 2138951681, 183867876, 838350879 }, - { 2138945537, 1403254661, 404460202 
}, - { 2138920961, 310865011, 1282911681 }, - { 2138910721, 1328496553, 103472415 }, - { 2138904577, 78831681, 993513549 }, - { 2138902529, 1319697451, 1055904361 }, - { 2138816513, 384338872, 1706202469 }, - { 2138810369, 1084868275, 405677177 }, - { 2138787841, 401181788, 1964773901 }, - { 2138775553, 1850532988, 1247087473 }, - { 2138767361, 874261901, 1576073565 }, - { 2138757121, 1187474742, 993541415 }, - { 2138748929, 1782458888, 1043206483 }, - { 2138744833, 1221500487, 800141243 }, - { 2138738689, 413465368, 1450660558 }, - { 2138695681, 739045140, 342611472 }, - { 2138658817, 1355845756, 672674190 }, - { 2138644481, 608379162, 1538874380 }, - { 2138632193, 1444914034, 686911254 }, - { 2138607617, 484707818, 1435142134 }, - { 2138591233, 539460669, 1290458549 }, - { 2138572801, 2093538990, 2011138646 }, - { 2138552321, 1149786988, 1076414907 }, - { 2138546177, 840688206, 2108985273 }, - { 2138533889, 209669619, 198172413 }, - { 2138523649, 1975879426, 1277003968 }, - { 2138490881, 1351891144, 1976858109 }, - { 2138460161, 1817321013, 1979278293 }, - { 2138429441, 1950077177, 203441928 }, - { 2138400769, 908970113, 628395069 }, - { 2138398721, 219890864, 758486760 }, - { 2138376193, 1306654379, 977554090 }, - { 2138351617, 298822498, 2004708503 }, - { 2138337281, 441457816, 1049002108 }, - { 2138320897, 1517731724, 1442269609 }, - { 2138290177, 1355911197, 1647139103 }, - { 2138234881, 531313247, 1746591962 }, - { 2138214401, 1899410930, 781416444 }, - { 2138202113, 1813477173, 1622508515 }, - { 2138191873, 1086458299, 1025408615 }, - { 2138183681, 1998800427, 827063290 }, - { 2138173441, 1921308898, 749670117 }, - { 2138103809, 1620902804, 2126787647 }, - { 2138099713, 828647069, 1892961817 }, - { 2138085377, 179405355, 1525506535 }, - { 2138060801, 615683235, 1259580138 }, - { 2138044417, 2030277840, 1731266562 }, - { 2138042369, 2087222316, 1627902259 }, - { 2138032129, 126388712, 1108640984 }, - { 2138011649, 715026550, 1017980050 }, - { 2137993217, 
1693714349, 1351778704 }, - { 2137888769, 1289762259, 1053090405 }, - { 2137853953, 199991890, 1254192789 }, - { 2137833473, 941421685, 896995556 }, - { 2137817089, 750416446, 1251031181 }, - { 2137792513, 798075119, 368077456 }, - { 2137786369, 878543495, 1035375025 }, - { 2137767937, 9351178, 1156563902 }, - { 2137755649, 1382297614, 1686559583 }, - { 2137724929, 1345472850, 1681096331 }, - { 2137704449, 834666929, 630551727 }, - { 2137673729, 1646165729, 1892091571 }, - { 2137620481, 778943821, 48456461 }, - { 2137618433, 1730837875, 1713336725 }, - { 2137581569, 805610339, 1378891359 }, - { 2137538561, 204342388, 1950165220 }, - { 2137526273, 1947629754, 1500789441 }, - { 2137516033, 719902645, 1499525372 }, - { 2137491457, 230451261, 556382829 }, - { 2137440257, 979573541, 412760291 }, - { 2137374721, 927841248, 1954137185 }, - { 2137362433, 1243778559, 861024672 }, - { 2137313281, 1341338501, 980638386 }, - { 2137311233, 937415182, 1793212117 }, - { 2137255937, 795331324, 1410253405 }, - { 2137243649, 150756339, 1966999887 }, - { 2137182209, 163346914, 1939301431 }, - { 2137171969, 1952552395, 758913141 }, - { 2137159681, 570788721, 218668666 }, - { 2137147393, 1896656810, 2045670345 }, - { 2137141249, 358493842, 518199643 }, - { 2137139201, 1505023029, 674695848 }, - { 2137133057, 27911103, 830956306 }, - { 2137122817, 439771337, 1555268614 }, - { 2137116673, 790988579, 1871449599 }, - { 2137110529, 432109234, 811805080 }, - { 2137102337, 1357900653, 1184997641 }, - { 2137098241, 515119035, 1715693095 }, - { 2137090049, 408575203, 2085660657 }, - { 2137085953, 2097793407, 1349626963 }, - { 2137055233, 1556739954, 1449960883 }, - { 2137030657, 1545758650, 1369303716 }, - { 2136987649, 332602570, 103875114 }, - { 2136969217, 1499989506, 1662964115 }, - { 2136924161, 857040753, 4738842 }, - { 2136895489, 1948872712, 570436091 }, - { 2136893441, 58969960, 1568349634 }, - { 2136887297, 2127193379, 273612548 }, - { 2136850433, 111208983, 1181257116 }, - { 
2136809473, 1627275942, 1680317971 }, - { 2136764417, 1574888217, 14011331 }, - { 2136741889, 14011055, 1129154251 }, - { 2136727553, 35862563, 1838555253 }, - { 2136721409, 310235666, 1363928244 }, - { 2136698881, 1612429202, 1560383828 }, - { 2136649729, 1138540131, 800014364 }, - { 2136606721, 602323503, 1433096652 }, - { 2136563713, 182209265, 1919611038 }, - { 2136555521, 324156477, 165591039 }, - { 2136549377, 195513113, 217165345 }, - { 2136526849, 1050768046, 939647887 }, - { 2136508417, 1886286237, 1619926572 }, - { 2136477697, 609647664, 35065157 }, - { 2136471553, 679352216, 1452259468 }, - { 2136457217, 128630031, 824816521 }, - { 2136422401, 19787464, 1526049830 }, - { 2136420353, 698316836, 1530623527 }, - { 2136371201, 1651862373, 1804812805 }, - { 2136334337, 326596005, 336977082 }, - { 2136322049, 63253370, 1904972151 }, - { 2136297473, 312176076, 172182411 }, - { 2136248321, 381261841, 369032670 }, - { 2136242177, 358688773, 1640007994 }, - { 2136229889, 512677188, 75585225 }, - { 2136219649, 2095003250, 1970086149 }, - { 2136207361, 1909650722, 537760675 }, - { 2136176641, 1334616195, 1533487619 }, - { 2136158209, 2096285632, 1793285210 }, - { 2136143873, 1897347517, 293843959 }, - { 2136133633, 923586222, 1022655978 }, - { 2136096769, 1464868191, 1515074410 }, - { 2136094721, 2020679520, 2061636104 }, - { 2136076289, 290798503, 1814726809 }, - { 2136041473, 156415894, 1250757633 }, - { 2135996417, 297459940, 1132158924 }, - { 2135955457, 538755304, 1688831340 }, - { 0, 0, 0 } -}; - -/* - * Reduce a small signed integer modulo a small prime. The source - * value x MUST be such that -p < x < p. - */ -static inline uint32_t -modp_set(int32_t x, uint32_t p) -{ - uint32_t w; - - w = (uint32_t)x; - w += p & -(w >> 31); - return w; -} - -/* - * Normalize a modular integer around 0. - */ -static inline int32_t -modp_norm(uint32_t x, uint32_t p) -{ - return (int32_t)(x - (p & (((x - ((p + 1) >> 1)) >> 31) - 1))); -} - -/* - * Compute -1/p mod 2^31. 
This works for all odd integers p that fit - * on 31 bits. - */ -static uint32_t -modp_ninv31(uint32_t p) -{ - uint32_t y; - - y = 2 - p; - y *= 2 - p * y; - y *= 2 - p * y; - y *= 2 - p * y; - y *= 2 - p * y; - return (uint32_t)0x7FFFFFFF & -y; -} - -/* - * Compute R = 2^31 mod p. - */ -static inline uint32_t -modp_R(uint32_t p) -{ - /* - * Since 2^30 < p < 2^31, we know that 2^31 mod p is simply - * 2^31 - p. - */ - return ((uint32_t)1 << 31) - p; -} - -/* - * Addition modulo p. - */ -static inline uint32_t -modp_add(uint32_t a, uint32_t b, uint32_t p) -{ - uint32_t d; - - d = a + b - p; - d += p & -(d >> 31); - return d; -} - -/* - * Subtraction modulo p. - */ -static inline uint32_t -modp_sub(uint32_t a, uint32_t b, uint32_t p) -{ - uint32_t d; - - d = a - b; - d += p & -(d >> 31); - return d; -} - -/* - * Halving modulo p. - */ -/* unused -static inline uint32_t -modp_half(uint32_t a, uint32_t p) -{ - a += p & -(a & 1); - return a >> 1; -} -*/ - -/* - * Montgomery multiplication modulo p. The 'p0i' value is -1/p mod 2^31. - * It is required that p is an odd integer. - */ -static inline uint32_t -modp_montymul(uint32_t a, uint32_t b, uint32_t p, uint32_t p0i) -{ - uint64_t z, w; - uint32_t d; - - z = (uint64_t)a * (uint64_t)b; - w = ((z * p0i) & (uint64_t)0x7FFFFFFF) * p; - d = (uint32_t)((z + w) >> 31) - p; - d += p & -(d >> 31); - return d; -} - -/* - * Compute R2 = 2^62 mod p. - */ -static uint32_t -modp_R2(uint32_t p, uint32_t p0i) -{ - uint32_t z; - - /* - * Compute z = 2^31 mod p (this is the value 1 in Montgomery - * representation), then double it with an addition. - */ - z = modp_R(p); - z = modp_add(z, z, p); - - /* - * Square it five times to obtain 2^32 in Montgomery representation - * (i.e. 2^63 mod p). - */ - z = modp_montymul(z, z, p, p0i); - z = modp_montymul(z, z, p, p0i); - z = modp_montymul(z, z, p, p0i); - z = modp_montymul(z, z, p, p0i); - z = modp_montymul(z, z, p, p0i); - - /* - * Halve the value mod p to get 2^62. 
- */ - z = (z + (p & -(z & 1))) >> 1; - return z; -} - -/* - * Compute 2^(31*x) modulo p. This works for integers x up to 2^11. - * p must be prime such that 2^30 < p < 2^31; p0i must be equal to - * -1/p mod 2^31; R2 must be equal to 2^62 mod p. - */ -static inline uint32_t -modp_Rx(unsigned x, uint32_t p, uint32_t p0i, uint32_t R2) -{ - int i; - uint32_t r, z; - - /* - * 2^(31*x) = (2^31)*(2^(31*(x-1))); i.e. we want the Montgomery - * representation of (2^31)^e mod p, where e = x-1. - * R2 is 2^31 in Montgomery representation. - */ - x --; - r = R2; - z = modp_R(p); - for (i = 0; (1U << i) <= x; i ++) { - if ((x & (1U << i)) != 0) { - z = modp_montymul(z, r, p, p0i); - } - r = modp_montymul(r, r, p, p0i); - } - return z; -} - -/* - * Division modulo p. If the divisor (b) is 0, then 0 is returned. - * This function computes proper results only when p is prime. - * Parameters: - * a dividend - * b divisor - * p odd prime modulus - * p0i -1/p mod 2^31 - * R 2^31 mod R - */ -static uint32_t -modp_div(uint32_t a, uint32_t b, uint32_t p, uint32_t p0i, uint32_t R) -{ - uint32_t z, e; - int i; - - e = p - 2; - z = R; - for (i = 30; i >= 0; i --) { - uint32_t z2; - - z = modp_montymul(z, z, p, p0i); - z2 = modp_montymul(z, b, p, p0i); - z ^= (z ^ z2) & -(uint32_t)((e >> i) & 1); - } - - /* - * The loop above just assumed that b was in Montgomery - * representation, i.e. really contained b*R; under that - * assumption, it returns 1/b in Montgomery representation, - * which is R/b. But we gave it b in normal representation, - * so the loop really returned R/(b/R) = R^2/b. - * - * We want a/b, so we need one Montgomery multiplication with a, - * which also remove one of the R factors, and another such - * multiplication to remove the second R factor. - */ - z = modp_montymul(z, 1, p, p0i); - return modp_montymul(a, z, p, p0i); -} - -/* - * Bit-reversal index table. 
- */ -static const uint16_t REV10[] = { - 0, 512, 256, 768, 128, 640, 384, 896, 64, 576, 320, 832, - 192, 704, 448, 960, 32, 544, 288, 800, 160, 672, 416, 928, - 96, 608, 352, 864, 224, 736, 480, 992, 16, 528, 272, 784, - 144, 656, 400, 912, 80, 592, 336, 848, 208, 720, 464, 976, - 48, 560, 304, 816, 176, 688, 432, 944, 112, 624, 368, 880, - 240, 752, 496, 1008, 8, 520, 264, 776, 136, 648, 392, 904, - 72, 584, 328, 840, 200, 712, 456, 968, 40, 552, 296, 808, - 168, 680, 424, 936, 104, 616, 360, 872, 232, 744, 488, 1000, - 24, 536, 280, 792, 152, 664, 408, 920, 88, 600, 344, 856, - 216, 728, 472, 984, 56, 568, 312, 824, 184, 696, 440, 952, - 120, 632, 376, 888, 248, 760, 504, 1016, 4, 516, 260, 772, - 132, 644, 388, 900, 68, 580, 324, 836, 196, 708, 452, 964, - 36, 548, 292, 804, 164, 676, 420, 932, 100, 612, 356, 868, - 228, 740, 484, 996, 20, 532, 276, 788, 148, 660, 404, 916, - 84, 596, 340, 852, 212, 724, 468, 980, 52, 564, 308, 820, - 180, 692, 436, 948, 116, 628, 372, 884, 244, 756, 500, 1012, - 12, 524, 268, 780, 140, 652, 396, 908, 76, 588, 332, 844, - 204, 716, 460, 972, 44, 556, 300, 812, 172, 684, 428, 940, - 108, 620, 364, 876, 236, 748, 492, 1004, 28, 540, 284, 796, - 156, 668, 412, 924, 92, 604, 348, 860, 220, 732, 476, 988, - 60, 572, 316, 828, 188, 700, 444, 956, 124, 636, 380, 892, - 252, 764, 508, 1020, 2, 514, 258, 770, 130, 642, 386, 898, - 66, 578, 322, 834, 194, 706, 450, 962, 34, 546, 290, 802, - 162, 674, 418, 930, 98, 610, 354, 866, 226, 738, 482, 994, - 18, 530, 274, 786, 146, 658, 402, 914, 82, 594, 338, 850, - 210, 722, 466, 978, 50, 562, 306, 818, 178, 690, 434, 946, - 114, 626, 370, 882, 242, 754, 498, 1010, 10, 522, 266, 778, - 138, 650, 394, 906, 74, 586, 330, 842, 202, 714, 458, 970, - 42, 554, 298, 810, 170, 682, 426, 938, 106, 618, 362, 874, - 234, 746, 490, 1002, 26, 538, 282, 794, 154, 666, 410, 922, - 90, 602, 346, 858, 218, 730, 474, 986, 58, 570, 314, 826, - 186, 698, 442, 954, 122, 634, 378, 890, 250, 762, 506, 1018, - 6, 
518, 262, 774, 134, 646, 390, 902, 70, 582, 326, 838, - 198, 710, 454, 966, 38, 550, 294, 806, 166, 678, 422, 934, - 102, 614, 358, 870, 230, 742, 486, 998, 22, 534, 278, 790, - 150, 662, 406, 918, 86, 598, 342, 854, 214, 726, 470, 982, - 54, 566, 310, 822, 182, 694, 438, 950, 118, 630, 374, 886, - 246, 758, 502, 1014, 14, 526, 270, 782, 142, 654, 398, 910, - 78, 590, 334, 846, 206, 718, 462, 974, 46, 558, 302, 814, - 174, 686, 430, 942, 110, 622, 366, 878, 238, 750, 494, 1006, - 30, 542, 286, 798, 158, 670, 414, 926, 94, 606, 350, 862, - 222, 734, 478, 990, 62, 574, 318, 830, 190, 702, 446, 958, - 126, 638, 382, 894, 254, 766, 510, 1022, 1, 513, 257, 769, - 129, 641, 385, 897, 65, 577, 321, 833, 193, 705, 449, 961, - 33, 545, 289, 801, 161, 673, 417, 929, 97, 609, 353, 865, - 225, 737, 481, 993, 17, 529, 273, 785, 145, 657, 401, 913, - 81, 593, 337, 849, 209, 721, 465, 977, 49, 561, 305, 817, - 177, 689, 433, 945, 113, 625, 369, 881, 241, 753, 497, 1009, - 9, 521, 265, 777, 137, 649, 393, 905, 73, 585, 329, 841, - 201, 713, 457, 969, 41, 553, 297, 809, 169, 681, 425, 937, - 105, 617, 361, 873, 233, 745, 489, 1001, 25, 537, 281, 793, - 153, 665, 409, 921, 89, 601, 345, 857, 217, 729, 473, 985, - 57, 569, 313, 825, 185, 697, 441, 953, 121, 633, 377, 889, - 249, 761, 505, 1017, 5, 517, 261, 773, 133, 645, 389, 901, - 69, 581, 325, 837, 197, 709, 453, 965, 37, 549, 293, 805, - 165, 677, 421, 933, 101, 613, 357, 869, 229, 741, 485, 997, - 21, 533, 277, 789, 149, 661, 405, 917, 85, 597, 341, 853, - 213, 725, 469, 981, 53, 565, 309, 821, 181, 693, 437, 949, - 117, 629, 373, 885, 245, 757, 501, 1013, 13, 525, 269, 781, - 141, 653, 397, 909, 77, 589, 333, 845, 205, 717, 461, 973, - 45, 557, 301, 813, 173, 685, 429, 941, 109, 621, 365, 877, - 237, 749, 493, 1005, 29, 541, 285, 797, 157, 669, 413, 925, - 93, 605, 349, 861, 221, 733, 477, 989, 61, 573, 317, 829, - 189, 701, 445, 957, 125, 637, 381, 893, 253, 765, 509, 1021, - 3, 515, 259, 771, 131, 643, 387, 899, 67, 579, 
323, 835, - 195, 707, 451, 963, 35, 547, 291, 803, 163, 675, 419, 931, - 99, 611, 355, 867, 227, 739, 483, 995, 19, 531, 275, 787, - 147, 659, 403, 915, 83, 595, 339, 851, 211, 723, 467, 979, - 51, 563, 307, 819, 179, 691, 435, 947, 115, 627, 371, 883, - 243, 755, 499, 1011, 11, 523, 267, 779, 139, 651, 395, 907, - 75, 587, 331, 843, 203, 715, 459, 971, 43, 555, 299, 811, - 171, 683, 427, 939, 107, 619, 363, 875, 235, 747, 491, 1003, - 27, 539, 283, 795, 155, 667, 411, 923, 91, 603, 347, 859, - 219, 731, 475, 987, 59, 571, 315, 827, 187, 699, 443, 955, - 123, 635, 379, 891, 251, 763, 507, 1019, 7, 519, 263, 775, - 135, 647, 391, 903, 71, 583, 327, 839, 199, 711, 455, 967, - 39, 551, 295, 807, 167, 679, 423, 935, 103, 615, 359, 871, - 231, 743, 487, 999, 23, 535, 279, 791, 151, 663, 407, 919, - 87, 599, 343, 855, 215, 727, 471, 983, 55, 567, 311, 823, - 183, 695, 439, 951, 119, 631, 375, 887, 247, 759, 503, 1015, - 15, 527, 271, 783, 143, 655, 399, 911, 79, 591, 335, 847, - 207, 719, 463, 975, 47, 559, 303, 815, 175, 687, 431, 943, - 111, 623, 367, 879, 239, 751, 495, 1007, 31, 543, 287, 799, - 159, 671, 415, 927, 95, 607, 351, 863, 223, 735, 479, 991, - 63, 575, 319, 831, 191, 703, 447, 959, 127, 639, 383, 895, - 255, 767, 511, 1023 -}; - -/* - * Compute the roots for NTT and inverse NTT (binary case). Input - * parameter g is a primitive 2048-th root of 1 modulo p (i.e. g^1024 = - * -1 mod p). This fills gm[] and igm[] with powers of g and 1/g: - * gm[rev(i)] = g^i mod p - * igm[rev(i)] = (1/g)^i mod p - * where rev() is the "bit reversal" function over 10 bits. It fills - * the arrays only up to N = 2^logn values. - * - * The values stored in gm[] and igm[] are in Montgomery representation. - * - * p must be a prime such that p = 1 mod 2048. 
- */ -static void -modp_mkgm2(uint32_t *restrict gm, uint32_t *restrict igm, unsigned logn, - uint32_t g, uint32_t p, uint32_t p0i) -{ - size_t u, n; - unsigned k; - uint32_t ig, x1, x2, R2; - - n = (size_t)1 << logn; - - /* - * We want g such that g^(2N) = 1 mod p, but the provided - * generator has order 2048. We must square it a few times. - */ - R2 = modp_R2(p, p0i); - g = modp_montymul(g, R2, p, p0i); - for (k = logn; k < 10; k ++) { - g = modp_montymul(g, g, p, p0i); - } - - ig = modp_div(R2, g, p, p0i, modp_R(p)); - k = 10 - logn; - x1 = x2 = modp_R(p); - for (u = 0; u < n; u ++) { - size_t v; - - v = REV10[u << k]; - gm[v] = x1; - igm[v] = x2; - x1 = modp_montymul(x1, g, p, p0i); - x2 = modp_montymul(x2, ig, p, p0i); - } -} - -/* - * Compute the NTT over a polynomial (binary case). Polynomial elements - * are a[0], a[stride], a[2 * stride]... - */ -static void -modp_NTT2_ext(uint32_t *a, size_t stride, const uint32_t *gm, unsigned logn, - uint32_t p, uint32_t p0i) -{ - size_t t, m, n; - - if (logn == 0) { - return; - } - n = (size_t)1 << logn; - t = n; - for (m = 1; m < n; m <<= 1) { - size_t ht, u, v1; - - ht = t >> 1; - for (u = 0, v1 = 0; u < m; u ++, v1 += t) { - uint32_t s; - size_t v; - uint32_t *r1, *r2; - - s = gm[m + u]; - r1 = a + v1 * stride; - r2 = r1 + ht * stride; - for (v = 0; v < ht; v ++, r1 += stride, r2 += stride) { - uint32_t x, y; - - x = *r1; - y = modp_montymul(*r2, s, p, p0i); - *r1 = modp_add(x, y, p); - *r2 = modp_sub(x, y, p); - } - } - t = ht; - } -} - -/* - * Compute the inverse NTT over a polynomial (binary case). 
- */ -static void -modp_iNTT2_ext(uint32_t *a, size_t stride, const uint32_t *igm, unsigned logn, - uint32_t p, uint32_t p0i) -{ - size_t t, m, n, k; - uint32_t ni; - uint32_t *r; - - if (logn == 0) { - return; - } - n = (size_t)1 << logn; - t = 1; - for (m = n; m > 1; m >>= 1) { - size_t hm, dt, u, v1; - - hm = m >> 1; - dt = t << 1; - for (u = 0, v1 = 0; u < hm; u ++, v1 += dt) { - uint32_t s; - size_t v; - uint32_t *r1, *r2; - - s = igm[hm + u]; - r1 = a + v1 * stride; - r2 = r1 + t * stride; - for (v = 0; v < t; v ++, r1 += stride, r2 += stride) { - uint32_t x, y; - - x = *r1; - y = *r2; - *r1 = modp_add(x, y, p); - *r2 = modp_montymul( - modp_sub(x, y, p), s, p, p0i);; - } - } - t = dt; - } - - /* - * We need 1/n in Montgomery representation, i.e. R/n. Since - * 1 <= logn <= 10, R/n is an integer; morever, R/n <= 2^30 < p, - * thus a simple shift will do. - */ - ni = (uint32_t)1 << (31 - logn); - for (k = 0, r = a; k < n; k ++, r += stride) { - *r = modp_montymul(*r, ni, p, p0i); - } -} - -/* - * Simplified macros for NTT and iNTT (binary case) when the elements - * are consecutive in RAM. - */ -#define modp_NTT2(a, gm, logn, p, p0i) modp_NTT2_ext(a, 1, gm, logn, p, p0i) -#define modp_iNTT2(a, igm, logn, p, p0i) modp_iNTT2_ext(a, 1, igm, logn, p, p0i) - -/* - * Given polynomial f in NTT representation modulo p, compute f' of degree - * less than N/2 such that f' = f0^2 - X*f1^2, where f0 and f1 are - * polynomials of degree less than N/2 such that f = f0(X^2) + X*f1(X^2). - * - * The new polynomial is written "in place" over the first N/2 elements - * of f. - * - * If applied logn times successively on a given polynomial, the resulting - * degree-0 polynomial is the resultant of f and X^N+1 modulo p. - * - * This function applies only to the binary case; it is invoked from - * solve_NTRU_binary_depth1(). 
- */ -static void -modp_poly_rec_res(uint32_t *f, unsigned logn, - uint32_t p, uint32_t p0i, uint32_t R2) -{ - size_t hn, u; - - hn = (size_t)1 << (logn - 1); - for (u = 0; u < hn; u ++) { - uint32_t w0, w1; - - w0 = f[(u << 1) + 0]; - w1 = f[(u << 1) + 1]; - f[u] = modp_montymul(modp_montymul(w0, w1, p, p0i), R2, p, p0i); - } -} - -/* ==================================================================== */ -/* - * Custom bignum implementation. - * - * This is a very reduced set of functionalities. We need to do the - * following operations: - * - * - Rebuild the resultant and the polynomial coefficients from their - * values modulo small primes (of length 31 bits each). - * - * - Compute an extended GCD between the two computed resultants. - * - * - Extract top bits and add scaled values during the successive steps - * of Babai rounding. - * - * When rebuilding values using CRT, we must also recompute the product - * of the small prime factors. We always do it one small factor at a - * time, so the "complicated" operations can be done modulo the small - * prime with the modp_* functions. CRT coefficients (inverses) are - * precomputed. - * - * All values are positive until the last step: when the polynomial - * coefficients have been rebuilt, we normalize them around 0. But then, - * only additions and subtractions on the upper few bits are needed - * afterwards. - * - * We keep big integers as arrays of 31-bit words (in uint32_t values); - * the top bit of each uint32_t is kept equal to 0. Using 31-bit words - * makes it easier to keep track of carries. When negative values are - * used, two's complement is used. - */ - -/* - * Subtract integer b from integer a. Both integers are supposed to have - * the same size. The carry (0 or 1) is returned. Source arrays a and b - * MUST be distinct. - * - * The operation is performed as described above if ctr = 1. 
If - * ctl = 0, the value a[] is unmodified, but all memory accesses are - * still performed, and the carry is computed and returned. - */ -static uint32_t -zint_sub(uint32_t *restrict a, const uint32_t *restrict b, size_t len, - uint32_t ctl) -{ - size_t u; - uint32_t cc, m; - - cc = 0; - m = -ctl; - for (u = 0; u < len; u ++) { - uint32_t aw, w; - - aw = a[u]; - w = aw - b[u] - cc; - cc = w >> 31; - aw ^= ((w & 0x7FFFFFFF) ^ aw) & m; - a[u] = aw; - } - return cc; -} - -/* - * Mutiply the provided big integer m with a small value x. - * This function assumes that x < 2^31. The carry word is returned. - */ -static uint32_t -zint_mul_small(uint32_t *m, size_t mlen, uint32_t x) -{ - size_t u; - uint32_t cc; - - cc = 0; - for (u = 0; u < mlen; u ++) { - uint64_t z; - - z = (uint64_t)m[u] * (uint64_t)x + cc; - m[u] = (uint32_t)z & 0x7FFFFFFF; - cc = (uint32_t)(z >> 31); - } - return cc; -} - -/* - * Reduce a big integer d modulo a small integer p. - * Rules: - * d is unsigned - * p is prime - * 2^30 < p < 2^31 - * p0i = -(1/p) mod 2^31 - * R2 = 2^62 mod p - */ -static uint32_t -zint_mod_small_unsigned(const uint32_t *d, size_t dlen, - uint32_t p, uint32_t p0i, uint32_t R2) -{ - uint32_t x; - size_t u; - - /* - * Algorithm: we inject words one by one, starting with the high - * word. Each step is: - * - multiply x by 2^31 - * - add new word - */ - x = 0; - u = dlen; - while (u -- > 0) { - uint32_t w; - - x = modp_montymul(x, R2, p, p0i); - w = d[u] - p; - w += p & -(w >> 31); - x = modp_add(x, w, p); - } - return x; -} - -/* - * Similar to zint_mod_small_unsigned(), except that d may be signed. - * Extra parameter is Rx = 2^(31*dlen) mod p. - */ -static uint32_t -zint_mod_small_signed(const uint32_t *d, size_t dlen, - uint32_t p, uint32_t p0i, uint32_t R2, uint32_t Rx) -{ - uint32_t z; - - if (dlen == 0) { - return 0; - } - z = zint_mod_small_unsigned(d, dlen, p, p0i, R2); - z = modp_sub(z, Rx & -(d[dlen - 1] >> 30), p); - return z; -} - -/* - * Add y*s to x. 
x and y initially have length 'len' words; the new x - * has length 'len+1' words. 's' must fit on 31 bits. x[] and y[] must - * not overlap. - */ -static void -zint_add_mul_small(uint32_t *restrict x, - const uint32_t *restrict y, size_t len, uint32_t s) -{ - size_t u; - uint32_t cc; - - cc = 0; - for (u = 0; u < len; u ++) { - uint32_t xw, yw; - uint64_t z; - - xw = x[u]; - yw = y[u]; - z = (uint64_t)yw * (uint64_t)s + (uint64_t)xw + (uint64_t)cc; - x[u] = (uint32_t)z & 0x7FFFFFFF; - cc = (uint32_t)(z >> 31); - } - x[len] = cc; -} - -/* - * Normalize a modular integer around 0: if x > p/2, then x is replaced - * with x - p (signed encoding with two's complement); otherwise, x is - * untouched. The two integers x and p are encoded over the same length. - */ -static void -zint_norm_zero(uint32_t *restrict x, const uint32_t *restrict p, size_t len) -{ - size_t u; - uint32_t r, bb; - - /* - * Compare x with p/2. We use the shifted version of p, and p - * is odd, so we really compare with (p-1)/2; we want to perform - * the subtraction if and only if x > (p-1)/2. - */ - r = 0; - bb = 0; - u = len; - while (u -- > 0) { - uint32_t wx, wp, cc; - - /* - * Get the two words to compare in wx and wp (both over - * 31 bits exactly). - */ - wx = x[u]; - wp = (p[u] >> 1) | (bb << 30); - bb = p[u] & 1; - - /* - * We set cc to -1, 0 or 1, depending on whether wp is - * lower than, equal to, or greater than wx. - */ - cc = wp - wx; - cc = ((-cc) >> 31) | -(cc >> 31); - - /* - * If r != 0 then it is either 1 or -1, and we keep its - * value. Otherwise, if r = 0, then we replace it with cc. - */ - r |= cc & ((r & 1) - 1); - } - - /* - * At this point, r = -1, 0 or 1, depending on whether (p-1)/2 - * is lower than, equal to, or greater than x. We thus want to - * do the subtraction only if r = -1. - */ - zint_sub(x, p, len, r >> 31); -} - -/* - * Rebuild integers from their RNS representation. There are 'num' - * integers, and each consists in 'xlen' words. 
'xx' points at that - * first word of the first integer; subsequent integers are accessed - * by adding 'xstride' repeatedly. - * - * The words of an integer are the RNS representation of that integer, - * using the provided 'primes' are moduli. This function replaces - * each integer with its multi-word value (little-endian order). - * - * If "normalize_signed" is non-zero, then the returned value is - * normalized to the -m/2..m/2 interval (where m is the product of all - * small prime moduli); two's complement is used for negative values. - */ -static void -zint_rebuild_CRT(uint32_t *restrict xx, size_t xlen, size_t xstride, - size_t num, const small_prime *primes, int normalize_signed, - uint32_t *restrict tmp) -{ - size_t u; - uint32_t *x; - - tmp[0] = primes[0].p; - for (u = 1; u < xlen; u ++) { - /* - * At the entry of each loop iteration: - * - the first u words of each array have been - * reassembled; - * - the first u words of tmp[] contains the - * product of the prime moduli processed so far. - * - * We call 'q' the product of all previous primes. - */ - uint32_t p, p0i, s, R2; - size_t v; - - p = primes[u].p; - s = primes[u].s; - p0i = modp_ninv31(p); - R2 = modp_R2(p, p0i); - - for (v = 0, x = xx; v < num; v ++, x += xstride) { - uint32_t xp, xq, xr; - /* - * xp = the integer x modulo the prime p for this - * iteration - * xq = (x mod q) mod p - */ - xp = x[u]; - xq = zint_mod_small_unsigned(x, u, p, p0i, R2); - - /* - * New value is (x mod q) + q * (s * (xp - xq) mod p) - */ - xr = modp_montymul(s, modp_sub(xp, xq, p), p, p0i); - zint_add_mul_small(x, tmp, u, xr); - } - - /* - * Update product of primes in tmp[]. - */ - tmp[u] = zint_mul_small(tmp, u, p); - } - - /* - * Normalize the reconstructed values around 0. - */ - if (normalize_signed) { - for (u = 0, x = xx; u < num; u ++, x += xstride) { - zint_norm_zero(x, tmp, xlen); - } - } -} - -/* - * Negate a big integer conditionally: value a is replaced with -a if - * and only if ctl = 1. 
Control value ctl must be 0 or 1. - */ -static void -zint_negate(uint32_t *a, size_t len, uint32_t ctl) -{ - size_t u; - uint32_t cc, m; - - /* - * If ctl = 1 then we flip the bits of a by XORing with - * 0x7FFFFFFF, and we add 1 to the value. If ctl = 0 then we XOR - * with 0 and add 0, which leaves the value unchanged. - */ - cc = ctl; - m = -ctl >> 1; - for (u = 0; u < len; u ++) { - uint32_t aw; - - aw = a[u]; - aw = (aw ^ m) + cc; - a[u] = aw & 0x7FFFFFFF; - cc = aw >> 31; - } -} - -/* - * Replace a with (a*xa+b*xb)/(2^31) and b with (a*ya+b*yb)/(2^31). - * The low bits are dropped (the caller should compute the coefficients - * such that these dropped bits are all zeros). If either or both - * yields a negative value, then the value is negated. - * - * Returned value is: - * 0 both values were positive - * 1 new a had to be negated - * 2 new b had to be negated - * 3 both new a and new b had to be negated - * - * Coefficients xa, xb, ya and yb may use the full signed 32-bit range. - */ -static uint32_t -zint_co_reduce(uint32_t *a, uint32_t *b, size_t len, - int64_t xa, int64_t xb, int64_t ya, int64_t yb) -{ - size_t u; - int64_t cca, ccb; - uint32_t nega, negb; - - cca = 0; - ccb = 0; - for (u = 0; u < len; u ++) { - uint32_t wa, wb; - uint64_t za, zb; - - wa = a[u]; - wb = b[u]; - za = wa * (uint64_t)xa + wb * (uint64_t)xb + (uint64_t)cca; - zb = wa * (uint64_t)ya + wb * (uint64_t)yb + (uint64_t)ccb; - if (u > 0) { - a[u - 1] = (uint32_t)za & 0x7FFFFFFF; - b[u - 1] = (uint32_t)zb & 0x7FFFFFFF; - } - cca = *(int64_t *)&za >> 31; - ccb = *(int64_t *)&zb >> 31; - } - a[len - 1] = (uint32_t)cca; - b[len - 1] = (uint32_t)ccb; - - nega = (uint32_t)((uint64_t)cca >> 63); - negb = (uint32_t)((uint64_t)ccb >> 63); - zint_negate(a, len, nega); - zint_negate(b, len, negb); - return nega | (negb << 1); -} - -/* - * Finish modular reduction. 
Rules on input parameters: - * - * if neg = 1, then -m <= a < 0 - * if neg = 0, then 0 <= a < 2*m - * - * If neg = 0, then the top word of a[] is allowed to use 32 bits. - * - * Modulus m must be odd. - */ -static void -zint_finish_mod(uint32_t *a, size_t len, const uint32_t *m, uint32_t neg) -{ - size_t u; - uint32_t cc, xm, ym; - - /* - * First pass: compare a (assumed nonnegative) with m. Note that - * if the top word uses 32 bits, subtracting m must yield a - * value less than 2^31 since a < 2*m. - */ - cc = 0; - for (u = 0; u < len; u ++) { - cc = (a[u] - m[u] - cc) >> 31; - } - - /* - * If neg = 1 then we must add m (regardless of cc) - * If neg = 0 and cc = 0 then we must subtract m - * If neg = 0 and cc = 1 then we must do nothing - * - * In the loop below, we conditionally subtract either m or -m - * from a. Word xm is a word of m (if neg = 0) or -m (if neg = 1); - * but if neg = 0 and cc = 1, then ym = 0 and it forces mw to 0. - */ - xm = -neg >> 1; - ym = -(neg | (1 - cc)); - cc = neg; - for (u = 0; u < len; u ++) { - uint32_t aw, mw; - - aw = a[u]; - mw = (m[u] ^ xm) & ym; - aw = aw - mw - cc; - a[u] = aw & 0x7FFFFFFF; - cc = aw >> 31; - } -} - -/* - * Replace a with (a*xa+b*xb)/(2^31) mod m, and b with - * (a*ya+b*yb)/(2^31) mod m. Modulus m must be odd; m0i = -1/m[0] mod 2^31. - */ -static void -zint_co_reduce_mod(uint32_t *a, uint32_t *b, const uint32_t *m, size_t len, - uint32_t m0i, int64_t xa, int64_t xb, int64_t ya, int64_t yb) -{ - size_t u; - int64_t cca, ccb; - uint32_t fa, fb; - - /* - * These are actually four combined Montgomery multiplications. 
- */ - cca = 0; - ccb = 0; - fa = ((a[0] * (uint32_t)xa + b[0] * (uint32_t)xb) * m0i) & 0x7FFFFFFF; - fb = ((a[0] * (uint32_t)ya + b[0] * (uint32_t)yb) * m0i) & 0x7FFFFFFF; - for (u = 0; u < len; u ++) { - uint32_t wa, wb; - uint64_t za, zb; - - wa = a[u]; - wb = b[u]; - za = wa * (uint64_t)xa + wb * (uint64_t)xb - + m[u] * (uint64_t)fa + (uint64_t)cca; - zb = wa * (uint64_t)ya + wb * (uint64_t)yb - + m[u] * (uint64_t)fb + (uint64_t)ccb; - if (u > 0) { - a[u - 1] = (uint32_t)za & 0x7FFFFFFF; - b[u - 1] = (uint32_t)zb & 0x7FFFFFFF; - } - cca = *(int64_t *)&za >> 31; - ccb = *(int64_t *)&zb >> 31; - } - a[len - 1] = (uint32_t)cca; - b[len - 1] = (uint32_t)ccb; - - /* - * At this point: - * -m <= a < 2*m - * -m <= b < 2*m - * (this is a case of Montgomery reduction) - * The top words of 'a' and 'b' may have a 32-th bit set. - * We want to add or subtract the modulus, as required. - */ - zint_finish_mod(a, len, m, (uint32_t)((uint64_t)cca >> 63)); - zint_finish_mod(b, len, m, (uint32_t)((uint64_t)ccb >> 63)); -} - -/* - * Compute a GCD between two positive big integers x and y. The two - * integers must be odd. Returned value is 1 if the GCD is 1, 0 - * otherwise. When 1 is returned, arrays u and v are filled with values - * such that: - * 0 <= u <= y - * 0 <= v <= x - * x*u - y*v = 1 - * x[] and y[] are unmodified. Both input values must have the same - * encoded length. Temporary array must be large enough to accommodate 4 - * extra values of that length. Arrays u, v and tmp may not overlap with - * each other, or with either x or y. - */ -static int -zint_bezout(uint32_t *restrict u, uint32_t *restrict v, - const uint32_t *restrict x, const uint32_t *restrict y, - size_t len, uint32_t *restrict tmp) -{ - /* - * Algorithm is an extended binary GCD. 
We maintain 6 values - * a, b, u0, u1, v0 and v1 with the following invariants: - * - * a = x*u0 - y*v0 - * b = x*u1 - y*v1 - * 0 <= a <= x - * 0 <= b <= y - * 0 <= u0 < y - * 0 <= v0 < x - * 0 <= u1 <= y - * 0 <= v1 < x - * - * Initial values are: - * - * a = x u0 = 1 v0 = 0 - * b = y u1 = y v1 = x-1 - * - * Each iteration reduces either a or b, and maintains the - * invariants. Algorithm stops when a = b, at which point their - * common value is GCD(a,b) and (u0,v0) (or (u1,v1)) contains - * the values (u,v) we want to return. - * - * The formal definition of the algorithm is a sequence of steps: - * - * - If a is even, then: - * a <- a/2 - * u0 <- u0/2 mod y - * v0 <- v0/2 mod x - * - * - Otherwise, if b is even, then: - * b <- b/2 - * u1 <- u1/2 mod y - * v1 <- v1/2 mod x - * - * - Otherwise, if a > b, then: - * a <- (a-b)/2 - * u0 <- (u0-u1)/2 mod y - * v0 <- (v0-v1)/2 mod x - * - * - Otherwise: - * b <- (b-a)/2 - * u1 <- (u1-u0)/2 mod y - * v1 <- (v1-v0)/2 mod y - * - * We can show that the operations above preserve the invariants: - * - * - If a is even, then u0 and v0 are either both even or both - * odd (since a = x*u0 - y*v0, and x and y are both odd). - * If u0 and v0 are both even, then (u0,v0) <- (u0/2,v0/2). - * Otherwise, (u0,v0) <- ((u0+y)/2,(v0+x)/2). Either way, - * the a = x*u0 - y*v0 invariant is preserved. - * - * - The same holds for the case where b is even. - * - * - If a and b are odd, and a > b, then: - * - * a-b = x*(u0-u1) - y*(v0-v1) - * - * In that situation, if u0 < u1, then x*(u0-u1) < 0, but - * a-b > 0; therefore, it must be that v0 < v1, and the - * first part of the update is: (u0,v0) <- (u0-u1+y,v0-v1+x), - * which preserves the invariants. Otherwise, if u0 > u1, - * then u0-u1 >= 1, thus x*(u0-u1) >= x. But a <= x and - * b >= 0, hence a-b <= x. It follows that, in that case, - * v0-v1 >= 0. The first part of the update is then: - * (u0,v0) <- (u0-u1,v0-v1), which again preserves the - * invariants. 
- * - * Either way, once the subtraction is done, the new value of - * a, which is the difference of two odd values, is even, - * and the remaining of this step is a subcase of the - * first algorithm case (i.e. when a is even). - * - * - If a and b are odd, and b > a, then the a similar - * argument holds. - * - * The values a and b start at x and y, respectively. Since x - * and y are odd, their GCD is odd, and it is easily seen that - * all steps conserve the GCD (GCD(a-b,b) = GCD(a, b); - * GCD(a/2,b) = GCD(a,b) if GCD(a,b) is odd). Moreover, either a - * or b is reduced by at least one bit at each iteration, so - * the algorithm necessarily converges on the case a = b, at - * which point the common value is the GCD. - * - * In the algorithm expressed above, when a = b, the fourth case - * applies, and sets b = 0. Since a contains the GCD of x and y, - * which are both odd, a must be odd, and subsequent iterations - * (if any) will simply divide b by 2 repeatedly, which has no - * consequence. Thus, the algorithm can run for more iterations - * than necessary; the final GCD will be in a, and the (u,v) - * coefficients will be (u0,v0). - * - * - * The presentation above is bit-by-bit. It can be sped up by - * noticing that all decisions are taken based on the low bits - * and high bits of a and b. We can extract the two top words - * and low word of each of a and b, and compute reduction - * parameters pa, pb, qa and qb such that the new values for - * a and b are: - * a' = (a*pa + b*pb) / (2^31) - * b' = (a*qa + b*qb) / (2^31) - * the two divisions being exact. The coefficients are obtained - * just from the extracted words, and may be slightly off, requiring - * an optional correction: if a' < 0, then we replace pa with -pa - * and pb with -pb. Each such step will reduce the total length - * (sum of lengths of a and b) by at least 30 bits at each - * iteration. 
- */ - uint32_t *u0, *u1, *v0, *v1, *a, *b; - uint32_t x0i, y0i; - uint32_t num, rc; - size_t j; - - if (len == 0) { - return 0; - } - - /* - * u0 and v0 are the u and v result buffers; the four other - * values (u1, v1, a and b) are taken from tmp[]. - */ - u0 = u; - v0 = v; - u1 = tmp; - v1 = u1 + len; - a = v1 + len; - b = a + len; - - /* - * We'll need the Montgomery reduction coefficients. - */ - x0i = modp_ninv31(x[0]); - y0i = modp_ninv31(y[0]); - - /* - * Initialize a, b, u0, u1, v0 and v1. - * a = x u0 = 1 v0 = 0 - * b = y u1 = y v1 = x-1 - * Note that x is odd, so computing x-1 is easy. - */ - memcpy(a, x, len * sizeof *x); - memcpy(b, y, len * sizeof *y); - u0[0] = 1; - memset(u0 + 1, 0, (len - 1) * sizeof *u0); - memset(v0, 0, len * sizeof *v0); - memcpy(u1, y, len * sizeof *u1); - memcpy(v1, x, len * sizeof *v1); - v1[0] --; - - /* - * Each input operand may be as large as 31*len bits, and we - * reduce the total length by at least 30 bits at each iteration. - */ - for (num = 62 * (uint32_t)len + 30; num >= 30; num -= 30) { - uint32_t c0, c1; - uint32_t a0, a1, b0, b1; - uint64_t a_hi, b_hi; - uint32_t a_lo, b_lo; - int64_t pa, pb, qa, qb; - int i; - uint32_t r; - - /* - * Extract the top words of a and b. If j is the highest - * index >= 1 such that a[j] != 0 or b[j] != 0, then we - * want (a[j] << 31) + a[j-1] and (b[j] << 31) + b[j-1]. - * If a and b are down to one word each, then we use - * a[0] and b[0]. - */ - c0 = (uint32_t)-1; - c1 = (uint32_t)-1; - a0 = 0; - a1 = 0; - b0 = 0; - b1 = 0; - j = len; - while (j -- > 0) { - uint32_t aw, bw; - - aw = a[j]; - bw = b[j]; - a0 ^= (a0 ^ aw) & c0; - a1 ^= (a1 ^ aw) & c1; - b0 ^= (b0 ^ bw) & c0; - b1 ^= (b1 ^ bw) & c1; - c1 = c0; - c0 &= (((aw | bw) + 0x7FFFFFFF) >> 31) - (uint32_t)1; - } - - /* - * If c1 = 0, then we grabbed two words for a and b. - * If c1 != 0 but c0 = 0, then we grabbed one word. It - * is not possible that c1 != 0 and c0 != 0, because that - * would mean that both integers are zero. 
- */ - a1 |= a0 & c1; - a0 &= ~c1; - b1 |= b0 & c1; - b0 &= ~c1; - a_hi = ((uint64_t)a0 << 31) + a1; - b_hi = ((uint64_t)b0 << 31) + b1; - a_lo = a[0]; - b_lo = b[0]; - - /* - * Compute reduction factors: - * - * a' = a*pa + b*pb - * b' = a*qa + b*qb - * - * such that a' and b' are both multiple of 2^31, but are - * only marginally larger than a and b. - */ - pa = 1; - pb = 0; - qa = 0; - qb = 1; - for (i = 0; i < 31; i ++) { - /* - * At each iteration: - * - * a <- (a-b)/2 if: a is odd, b is odd, a_hi > b_hi - * b <- (b-a)/2 if: a is odd, b is odd, a_hi <= b_hi - * a <- a/2 if: a is even - * b <- b/2 if: a is odd, b is even - * - * We multiply a_lo and b_lo by 2 at each - * iteration, thus a division by 2 really is a - * non-multiplication by 2. - */ - uint32_t rt, oa, ob, cAB, cBA, cA; - uint64_t rz; - - /* - * rt = 1 if a_hi > b_hi, 0 otherwise. - */ - rz = b_hi - a_hi; - rt = (uint32_t)((rz ^ ((a_hi ^ b_hi) - & (a_hi ^ rz))) >> 63); - - /* - * cAB = 1 if b must be subtracted from a - * cBA = 1 if a must be subtracted from b - * cA = 1 if a must be divided by 2 - * - * Rules: - * - * cAB and cBA cannot both be 1. - * If a is not divided by 2, b is. - */ - oa = (a_lo >> i) & 1; - ob = (b_lo >> i) & 1; - cAB = oa & ob & rt; - cBA = oa & ob & ~rt; - cA = cAB | (oa ^ 1); - - /* - * Conditional subtractions. - */ - a_lo -= b_lo & -cAB; - a_hi -= b_hi & -(uint64_t)cAB; - pa -= qa & -(int64_t)cAB; - pb -= qb & -(int64_t)cAB; - b_lo -= a_lo & -cBA; - b_hi -= a_hi & -(uint64_t)cBA; - qa -= pa & -(int64_t)cBA; - qb -= pb & -(int64_t)cBA; - - /* - * Shifting. - */ - a_lo += a_lo & (cA - 1); - pa += pa & ((int64_t)cA - 1); - pb += pb & ((int64_t)cA - 1); - a_hi ^= (a_hi ^ (a_hi >> 1)) & -(uint64_t)cA; - b_lo += b_lo & -cA; - qa += qa & -(int64_t)cA; - qb += qb & -(int64_t)cA; - b_hi ^= (b_hi ^ (b_hi >> 1)) & ((uint64_t)cA - 1); - } - - /* - * Apply the computed parameters to our values. 
We - * may have to correct pa and pb depending on the - * returned value of zint_co_reduce() (when a and/or b - * had to be negated). - */ - r = zint_co_reduce(a, b, len, pa, pb, qa, qb); - pa -= (pa + pa) & -(int64_t)(r & 1); - pb -= (pb + pb) & -(int64_t)(r & 1); - qa -= (qa + qa) & -(int64_t)(r >> 1); - qb -= (qb + qb) & -(int64_t)(r >> 1); - zint_co_reduce_mod(u0, u1, y, len, y0i, pa, pb, qa, qb); - zint_co_reduce_mod(v0, v1, x, len, x0i, pa, pb, qa, qb); - } - - /* - * At that point, array a[] should contain the GCD, and the - * results (u,v) should already be set. We check that the GCD - * is indeed 1. We also check that the two operands x and y - * are odd. - */ - rc = a[0] ^ 1; - for (j = 1; j < len; j ++) { - rc |= a[j]; - } - return (int)((1 - ((rc | -rc) >> 31)) & x[0] & y[0]); -} - -/* - * Add k*y*2^sc to x. The result is assumed to fit in the array of - * size xlen (truncation is applied if necessary). - * Scale factor 'sc' is provided as sch and scl, such that: - * sch = sc / 31 - * scl = sc % 31 - * xlen MUST NOT be lower than ylen. - * - * x[] and y[] are both signed integers, using two's complement for - * negative values. - */ -static void -zint_add_scaled_mul_small(uint32_t *restrict x, size_t xlen, - const uint32_t *restrict y, size_t ylen, int32_t k, - uint32_t sch, uint32_t scl) -{ - size_t u; - uint32_t ysign, tw; - int32_t cc; - - if (ylen == 0) { - return; - } - - ysign = -(y[ylen - 1] >> 30) >> 1; - tw = 0; - cc = 0; - for (u = sch; u < xlen; u ++) { - size_t v; - uint32_t wy, wys, ccu; - uint64_t z; - - /* - * Get the next word of y (scaled). - */ - v = u - sch; - wy = v < ylen ? y[v] : ysign; - wys = ((wy << scl) & 0x7FFFFFFF) | tw; - tw = wy >> (31 - scl); - - /* - * The expression below does not overflow. 
- */ - z = (uint64_t)((int64_t)wys * (int64_t)k + (int64_t)x[u] + cc); - x[u] = (uint32_t)z & 0x7FFFFFFF; - - /* - * Right-shifting the signed value z would yield - * implementation-defined results (arithmetic shift is - * not guaranteed). However, we can cast to unsigned, - * and get the next carry as an unsigned word. We can - * then convert it back to signed by using the guaranteed - * fact that 'int32_t' uses two's complement with no - * trap representation or padding bit, and with a layout - * compatible with that of 'uint32_t'. - */ - ccu = (uint32_t)(z >> 31); - cc = *(int32_t *)&ccu; - } -} - -/* - * Subtract y*2^sc from x. The result is assumed to fit in the array of - * size xlen (truncation is applied if necessary). - * Scale factor 'sc' is provided as sch and scl, such that: - * sch = sc / 31 - * scl = sc % 31 - * xlen MUST NOT be lower than ylen. - * - * x[] and y[] are both signed integers, using two's complement for - * negative values. - */ -static void -zint_sub_scaled(uint32_t *restrict x, size_t xlen, - const uint32_t *restrict y, size_t ylen, uint32_t sch, uint32_t scl) -{ - size_t u; - uint32_t ysign, tw; - uint32_t cc; - - if (ylen == 0) { - return; - } - - ysign = -(y[ylen - 1] >> 30) >> 1; - tw = 0; - cc = 0; - for (u = sch; u < xlen; u ++) { - size_t v; - uint32_t w, wy, wys; - - /* - * Get the next word of y (scaled). - */ - v = u - sch; - wy = v < ylen ? y[v] : ysign; - wys = ((wy << scl) & 0x7FFFFFFF) | tw; - tw = wy >> (31 - scl); - - w = x[u] - wys - cc; - x[u] = w & 0x7FFFFFFF; - cc = w >> 31; - } -} - -/* - * Convert a one-word signed big integer into a signed value. - */ -static inline int32_t -zint_one_to_plain(const uint32_t *x) -{ - uint32_t w; - - w = x[0]; - w |= (w & 0x40000000) << 1; - return *(int32_t *)&w; -} - -/* ==================================================================== */ - -/* - * Convert a polynomial to floating-point values. 
- * - * Each coefficient has length flen words, and starts fstride words after - * the previous. - * - * IEEE-754 binary64 values can represent values in a finite range, - * roughly 2^(-1023) to 2^(+1023); thus, if coefficients are too large, - * they should be "trimmed" by pointing not to the lowest word of each, - * but upper. - */ -static void -poly_big_to_fp(fpr *d, const uint32_t *f, size_t flen, size_t fstride, - unsigned logn) -{ - size_t n, u; - - n = MKN(logn); - if (flen == 0) { - for (u = 0; u < n; u ++) { - d[u] = fpr_zero; - } - return; - } - for (u = 0; u < n; u ++, f += fstride) { - size_t v; - uint32_t neg, cc, xm; - fpr x, fsc; - - /* - * Get sign of the integer; if it is negative, then we - * will load its absolute value instead, and negate the - * result. - */ - neg = -(f[flen - 1] >> 30); - xm = neg >> 1; - cc = neg & 1; - x = fpr_zero; - fsc = fpr_one; - for (v = 0; v < flen; v ++, fsc = fpr_mul(fsc, fpr_ptwo31)) { - uint32_t w; - - w = (f[v] ^ xm) + cc; - cc = w >> 31; - w &= 0x7FFFFFFF; - w -= (w << 1) & neg; - x = fpr_add(x, fpr_mul(fpr_of(*(int32_t *)&w), fsc)); - } - d[u] = x; - } -} - -/* - * Convert a polynomial to small integers. Source values are supposed - * to be one-word integers, signed over 31 bits. Returned value is 0 - * if any of the coefficients exceeds the provided limit (in absolute - * value), or 1 on success. - * - * This is not constant-time; this is not a problem here, because on - * any failure, the NTRU-solving process will be deemed to have failed - * and the (f,g) polynomials will be discarded. - */ -static int -poly_big_to_small(int8_t *d, const uint32_t *s, int lim, unsigned logn) -{ - size_t n, u; - - n = MKN(logn); - for (u = 0; u < n; u ++) { - int32_t z; - - z = zint_one_to_plain(s + u); - if (z < -lim || z > lim) { - return 0; - } - d[u] = (int8_t)z; - } - return 1; -} - -/* - * Subtract k*f from F, where F, f and k are polynomials modulo X^N+1. 
- * Coefficients of polynomial k are small integers (signed values in the - * -2^31..2^31 range) scaled by 2^sc. Value sc is provided as sch = sc / 31 - * and scl = sc % 31. - * - * This function implements the basic quadratic multiplication algorithm, - * which is efficient in space (no extra buffer needed) but slow at - * high degree. - */ -static void -poly_sub_scaled(uint32_t *restrict F, size_t Flen, size_t Fstride, - const uint32_t *restrict f, size_t flen, size_t fstride, - const int32_t *restrict k, uint32_t sch, uint32_t scl, unsigned logn) -{ - size_t n, u; - - n = MKN(logn); - for (u = 0; u < n; u ++) { - int32_t kf; - size_t v; - uint32_t *x; - const uint32_t *y; - - kf = -k[u]; - x = F + u * Fstride; - y = f; - for (v = 0; v < n; v ++) { - zint_add_scaled_mul_small( - x, Flen, y, flen, kf, sch, scl); - if (u + v == n - 1) { - x = F; - kf = -kf; - } else { - x += Fstride; - } - y += fstride; - } - } -} - -/* - * Subtract k*f from F. Coefficients of polynomial k are small integers - * (signed values in the -2^31..2^31 range) scaled by 2^sc. This function - * assumes that the degree is large, and integers relatively small. - * The value sc is provided as sch = sc / 31 and scl = sc % 31. - */ -static void -poly_sub_scaled_ntt(uint32_t *restrict F, size_t Flen, size_t Fstride, - const uint32_t *restrict f, size_t flen, size_t fstride, - const int32_t *restrict k, uint32_t sch, uint32_t scl, unsigned logn, - uint32_t *restrict tmp) -{ - uint32_t *gm, *igm, *fk, *t1, *x; - const uint32_t *y; - size_t n, u, tlen; - const small_prime *primes; - - n = MKN(logn); - tlen = flen + 1; - gm = tmp; - igm = gm + MKN(logn); - fk = igm + MKN(logn); - t1 = fk + n * tlen; - - primes = PRIMES; - - /* - * Compute k*f in fk[], in RNS notation. 
- */ - for (u = 0; u < tlen; u ++) { - uint32_t p, p0i, R2, Rx; - size_t v; - - p = primes[u].p; - p0i = modp_ninv31(p); - R2 = modp_R2(p, p0i); - Rx = modp_Rx((unsigned)flen, p, p0i, R2); - modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i); - - for (v = 0; v < n; v ++) { - t1[v] = modp_set(k[v], p); - } - modp_NTT2(t1, gm, logn, p, p0i); - for (v = 0, y = f, x = fk + u; - v < n; v ++, y += fstride, x += tlen) - { - *x = zint_mod_small_signed(y, flen, p, p0i, R2, Rx); - } - modp_NTT2_ext(fk + u, tlen, gm, logn, p, p0i); - for (v = 0, x = fk + u; v < n; v ++, x += tlen) { - *x = modp_montymul( - modp_montymul(t1[v], *x, p, p0i), R2, p, p0i); - } - modp_iNTT2_ext(fk + u, tlen, igm, logn, p, p0i); - } - - /* - * Rebuild k*f. - */ - zint_rebuild_CRT(fk, tlen, tlen, n, primes, 1, t1); - - /* - * Subtract k*f, scaled, from F. - */ - for (u = 0, x = F, y = fk; u < n; u ++, x += Fstride, y += tlen) { - zint_sub_scaled(x, Flen, y, tlen, sch, scl); - } -} - -/* ==================================================================== */ - -#if FALCON_KG_CHACHA20 // yyyKG_CHACHA20+1 - -#define RNG_CONTEXT prng -#define get_rng_u64 prng_get_u64 - -#else // yyyKG_CHACHA20+0 - -#define RNG_CONTEXT inner_shake256_context - -/* - * Get a random 8-byte integer from a SHAKE-based RNG. This function - * ensures consistent interpretation of the SHAKE output so that - * the same values will be obtained over different platforms, in case - * a known seed is used. - */ -static inline uint64_t -get_rng_u64(inner_shake256_context *rng) -{ - /* - * We enforce little-endian representation. - */ - -#if FALCON_LE // yyyLE+1 - /* - * On little-endian systems we just interpret the bytes "as is" - * (this is correct because the exact-width types such as - * 'uint64_t' are guaranteed to have no padding and no trap - * representation). 
- */ - uint64_t r; - - inner_shake256_extract(rng, (uint8_t *)&r, sizeof r); - return r; -#else // yyyLE+0 - uint8_t tmp[8]; - - inner_shake256_extract(rng, tmp, sizeof tmp); - return (uint64_t)tmp[0] - | ((uint64_t)tmp[1] << 8) - | ((uint64_t)tmp[2] << 16) - | ((uint64_t)tmp[3] << 24) - | ((uint64_t)tmp[4] << 32) - | ((uint64_t)tmp[5] << 40) - | ((uint64_t)tmp[6] << 48) - | ((uint64_t)tmp[7] << 56); -#endif // yyyLE- -} - -#endif // yyyKG_CHACHA20- - -/* - * Table below incarnates a discrete Gaussian distribution: - * D(x) = exp(-(x^2)/(2*sigma^2)) - * where sigma = 1.17*sqrt(q/(2*N)), q = 12289, and N = 1024. - * Element 0 of the table is P(x = 0). - * For k > 0, element k is P(x >= k+1 | x > 0). - * Probabilities are scaled up by 2^63. - */ -static const uint64_t gauss_1024_12289[] = { - 1283868770400643928u, 6416574995475331444u, 4078260278032692663u, - 2353523259288686585u, 1227179971273316331u, 575931623374121527u, - 242543240509105209u, 91437049221049666u, 30799446349977173u, - 9255276791179340u, 2478152334826140u, 590642893610164u, - 125206034929641u, 23590435911403u, 3948334035941u, - 586753615614u, 77391054539u, 9056793210u, - 940121950u, 86539696u, 7062824u, - 510971u, 32764u, 1862u, - 94u, 4u, 0u -}; - -/* - * Generate a random value with a Gaussian distribution centered on 0. - * The RNG must be ready for extraction (already flipped). - * - * Distribution has standard deviation 1.17*sqrt(q/(2*N)). The - * precomputed table is for N = 1024. Since the sum of two independent - * values of standard deviation sigma has standard deviation - * sigma*sqrt(2), then we can just generate more values and add them - * together for lower dimensions. - */ -static int -mkgauss(RNG_CONTEXT *rng, unsigned logn) -{ - unsigned u, g; - int val; - - g = 1U << (10 - logn); - val = 0; - for (u = 0; u < g; u ++) { - /* - * Each iteration generates one value with the - * Gaussian distribution for N = 1024. - * - * We use two random 64-bit values. 
First value - * decides on whether the generated value is 0, and, - * if not, the sign of the value. Second random 64-bit - * word is used to generate the non-zero value. - * - * For constant-time code we have to read the complete - * table. This has negligible cost, compared with the - * remainder of the keygen process (solving the NTRU - * equation). - */ - uint64_t r; - uint32_t f, v, k, neg; - - /* - * First value: - * - flag 'neg' is randomly selected to be 0 or 1. - * - flag 'f' is set to 1 if the generated value is zero, - * or set to 0 otherwise. - */ - r = get_rng_u64(rng); - neg = (uint32_t)(r >> 63); - r &= ~((uint64_t)1 << 63); - f = (uint32_t)((r - gauss_1024_12289[0]) >> 63); - - /* - * We produce a new random 63-bit integer r, and go over - * the array, starting at index 1. We store in v the - * index of the first array element which is not greater - * than r, unless the flag f was already 1. - */ - v = 0; - r = get_rng_u64(rng); - r &= ~((uint64_t)1 << 63); - for (k = 1; k < (sizeof gauss_1024_12289) - / (sizeof gauss_1024_12289[0]); k ++) - { - uint32_t t; - - t = (uint32_t)((r - gauss_1024_12289[k]) >> 63) ^ 1; - v |= k & -(t & (f ^ 1)); - f |= t; - } - - /* - * We apply the sign ('neg' flag). If the value is zero, - * the sign has no effect. - */ - v = (v ^ -neg) + neg; - - /* - * Generated value is added to val. - */ - val += *(int32_t *)&v; - } - return val; -} - -/* - * The MAX_BL_SMALL[] and MAX_BL_LARGE[] contain the lengths, in 31-bit - * words, of intermediate values in the computation: - * - * MAX_BL_SMALL[depth]: length for the input f and g at that depth - * MAX_BL_LARGE[depth]: length for the unreduced F and G at that depth - * - * Rules: - * - * - Within an array, values grow. - * - * - The 'SMALL' array must have an entry for maximum depth, corresponding - * to the size of values used in the binary GCD. There is no such value - * for the 'LARGE' array (the binary GCD yields already reduced - * coefficients). 
- * - * - MAX_BL_LARGE[depth] >= MAX_BL_SMALL[depth + 1]. - * - * - Values must be large enough to handle the common cases, with some - * margins. - * - * - Values must not be "too large" either because we will convert some - * integers into floating-point values by considering the top 10 words, - * i.e. 310 bits; hence, for values of length more than 10 words, we - * should take care to have the length centered on the expected size. - * - * The following average lengths, in bits, have been measured on thousands - * of random keys (fg = max length of the absolute value of coefficients - * of f and g at that depth; FG = idem for the unreduced F and G; for the - * maximum depth, F and G are the output of binary GCD, multiplied by q; - * for each value, the average and standard deviation are provided). - * - * Binary case: - * depth: 10 fg: 6307.52 (24.48) FG: 6319.66 (24.51) - * depth: 9 fg: 3138.35 (12.25) FG: 9403.29 (27.55) - * depth: 8 fg: 1576.87 ( 7.49) FG: 4703.30 (14.77) - * depth: 7 fg: 794.17 ( 4.98) FG: 2361.84 ( 9.31) - * depth: 6 fg: 400.67 ( 3.10) FG: 1188.68 ( 6.04) - * depth: 5 fg: 202.22 ( 1.87) FG: 599.81 ( 3.87) - * depth: 4 fg: 101.62 ( 1.02) FG: 303.49 ( 2.38) - * depth: 3 fg: 50.37 ( 0.53) FG: 153.65 ( 1.39) - * depth: 2 fg: 24.07 ( 0.25) FG: 78.20 ( 0.73) - * depth: 1 fg: 10.99 ( 0.08) FG: 39.82 ( 0.41) - * depth: 0 fg: 4.00 ( 0.00) FG: 19.61 ( 0.49) - * - * Integers are actually represented either in binary notation over - * 31-bit words (signed, using two's complement), or in RNS, modulo - * many small primes. These small primes are close to, but slightly - * lower than, 2^31. Use of RNS loses less than two bits, even for - * the largest values. - * - * IMPORTANT: if these values are modified, then the temporary buffer - * sizes (FALCON_KEYGEN_TEMP_*, in inner.h) must be recomputed - * accordingly. 
- */ - -static const size_t MAX_BL_SMALL[] = { - 1, 1, 2, 2, 4, 7, 14, 27, 53, 106, 209 -}; - -static const size_t MAX_BL_LARGE[] = { - 2, 2, 5, 7, 12, 21, 40, 78, 157, 308 -}; - -/* - * Average and standard deviation for the maximum size (in bits) of - * coefficients of (f,g), depending on depth. These values are used - * to compute bounds for Babai's reduction. - */ -static const struct { - int avg; - int std; -} BITLENGTH[] = { - { 4, 0 }, - { 11, 1 }, - { 24, 1 }, - { 50, 1 }, - { 102, 1 }, - { 202, 2 }, - { 401, 4 }, - { 794, 5 }, - { 1577, 8 }, - { 3138, 13 }, - { 6308, 25 } -}; - -/* - * Minimal recursion depth at which we rebuild intermediate values - * when reconstructing f and g. - */ -#define DEPTH_INT_FG 4 - -/* - * Compute squared norm of a short vector. Returned value is saturated to - * 2^32-1 if it is not lower than 2^31. - */ -static uint32_t -poly_small_sqnorm(const int8_t *f, unsigned logn) -{ - size_t n, u; - uint32_t s, ng; - - n = MKN(logn); - s = 0; - ng = 0; - for (u = 0; u < n; u ++) { - int32_t z; - - z = f[u]; - s += (uint32_t)(z * z); - ng |= s; - } - return s | -(ng >> 31); -} - -/* - * Align (upwards) the provided 'data' pointer with regards to 'base' - * so that the offset is a multiple of the size of 'fpr'. - */ -static fpr * -align_fpr(void *base, void *data) -{ - uint8_t *cb, *cd; - size_t k, km; - - cb = base; - cd = data; - k = (size_t)(cd - cb); - km = k % sizeof(fpr); - if (km) { - k += (sizeof(fpr)) - km; - } - return (fpr *)(cb + k); -} - -/* - * Align (upwards) the provided 'data' pointer with regards to 'base' - * so that the offset is a multiple of the size of 'uint32_t'. - */ -static uint32_t * -align_u32(void *base, void *data) -{ - uint8_t *cb, *cd; - size_t k, km; - - cb = base; - cd = data; - k = (size_t)(cd - cb); - km = k % sizeof(uint32_t); - if (km) { - k += (sizeof(uint32_t)) - km; - } - return (uint32_t *)(cb + k); -} - -/* - * Convert a small vector to floating point. 
- */ -static void -poly_small_to_fp(fpr *x, const int8_t *f, unsigned logn) -{ - size_t n, u; - - n = MKN(logn); - for (u = 0; u < n; u ++) { - x[u] = fpr_of(f[u]); - } -} - -/* - * Input: f,g of degree N = 2^logn; 'depth' is used only to get their - * individual length. - * - * Output: f',g' of degree N/2, with the length for 'depth+1'. - * - * Values are in RNS; input and/or output may also be in NTT. - */ -static void -make_fg_step(uint32_t *data, unsigned logn, unsigned depth, - int in_ntt, int out_ntt) -{ - size_t n, hn, u; - size_t slen, tlen; - uint32_t *fd, *gd, *fs, *gs, *gm, *igm, *t1; - const small_prime *primes; - - n = (size_t)1 << logn; - hn = n >> 1; - slen = MAX_BL_SMALL[depth]; - tlen = MAX_BL_SMALL[depth + 1]; - primes = PRIMES; - - /* - * Prepare room for the result. - */ - fd = data; - gd = fd + hn * tlen; - fs = gd + hn * tlen; - gs = fs + n * slen; - gm = gs + n * slen; - igm = gm + n; - t1 = igm + n; - memmove(fs, data, 2 * n * slen * sizeof *data); - - /* - * First slen words: we use the input values directly, and apply - * inverse NTT as we go. 
- */ - for (u = 0; u < slen; u ++) { - uint32_t p, p0i, R2; - size_t v; - uint32_t *x; - - p = primes[u].p; - p0i = modp_ninv31(p); - R2 = modp_R2(p, p0i); - modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i); - - for (v = 0, x = fs + u; v < n; v ++, x += slen) { - t1[v] = *x; - } - if (!in_ntt) { - modp_NTT2(t1, gm, logn, p, p0i); - } - for (v = 0, x = fd + u; v < hn; v ++, x += tlen) { - uint32_t w0, w1; - - w0 = t1[(v << 1) + 0]; - w1 = t1[(v << 1) + 1]; - *x = modp_montymul( - modp_montymul(w0, w1, p, p0i), R2, p, p0i); - } - if (in_ntt) { - modp_iNTT2_ext(fs + u, slen, igm, logn, p, p0i); - } - - for (v = 0, x = gs + u; v < n; v ++, x += slen) { - t1[v] = *x; - } - if (!in_ntt) { - modp_NTT2(t1, gm, logn, p, p0i); - } - for (v = 0, x = gd + u; v < hn; v ++, x += tlen) { - uint32_t w0, w1; - - w0 = t1[(v << 1) + 0]; - w1 = t1[(v << 1) + 1]; - *x = modp_montymul( - modp_montymul(w0, w1, p, p0i), R2, p, p0i); - } - if (in_ntt) { - modp_iNTT2_ext(gs + u, slen, igm, logn, p, p0i); - } - - if (!out_ntt) { - modp_iNTT2_ext(fd + u, tlen, igm, logn - 1, p, p0i); - modp_iNTT2_ext(gd + u, tlen, igm, logn - 1, p, p0i); - } - } - - /* - * Since the fs and gs words have been de-NTTized, we can use the - * CRT to rebuild the values. - */ - zint_rebuild_CRT(fs, slen, slen, n, primes, 1, gm); - zint_rebuild_CRT(gs, slen, slen, n, primes, 1, gm); - - /* - * Remaining words: use modular reductions to extract the values. 
- */ - for (u = slen; u < tlen; u ++) { - uint32_t p, p0i, R2, Rx; - size_t v; - uint32_t *x; - - p = primes[u].p; - p0i = modp_ninv31(p); - R2 = modp_R2(p, p0i); - Rx = modp_Rx((unsigned)slen, p, p0i, R2); - modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i); - for (v = 0, x = fs; v < n; v ++, x += slen) { - t1[v] = zint_mod_small_signed(x, slen, p, p0i, R2, Rx); - } - modp_NTT2(t1, gm, logn, p, p0i); - for (v = 0, x = fd + u; v < hn; v ++, x += tlen) { - uint32_t w0, w1; - - w0 = t1[(v << 1) + 0]; - w1 = t1[(v << 1) + 1]; - *x = modp_montymul( - modp_montymul(w0, w1, p, p0i), R2, p, p0i); - } - for (v = 0, x = gs; v < n; v ++, x += slen) { - t1[v] = zint_mod_small_signed(x, slen, p, p0i, R2, Rx); - } - modp_NTT2(t1, gm, logn, p, p0i); - for (v = 0, x = gd + u; v < hn; v ++, x += tlen) { - uint32_t w0, w1; - - w0 = t1[(v << 1) + 0]; - w1 = t1[(v << 1) + 1]; - *x = modp_montymul( - modp_montymul(w0, w1, p, p0i), R2, p, p0i); - } - - if (!out_ntt) { - modp_iNTT2_ext(fd + u, tlen, igm, logn - 1, p, p0i); - modp_iNTT2_ext(gd + u, tlen, igm, logn - 1, p, p0i); - } - } -} - -/* - * Compute f and g at a specific depth, in RNS notation. - * - * Returned values are stored in the data[] array, at slen words per integer. - * - * Conditions: - * 0 <= depth <= logn - * - * Space use in data[]: enough room for any two successive values (f', g', - * f and g). 
- */ -static void -make_fg(uint32_t *data, const int8_t *f, const int8_t *g, - unsigned logn, unsigned depth, int out_ntt) -{ - size_t n, u; - uint32_t *ft, *gt, p0; - unsigned d; - const small_prime *primes; - - n = MKN(logn); - ft = data; - gt = ft + n; - primes = PRIMES; - p0 = primes[0].p; - for (u = 0; u < n; u ++) { - ft[u] = modp_set(f[u], p0); - gt[u] = modp_set(g[u], p0); - } - - if (depth == 0 && out_ntt) { - uint32_t *gm, *igm; - uint32_t p, p0i; - - p = primes[0].p; - p0i = modp_ninv31(p); - gm = gt + n; - igm = gm + MKN(logn); - modp_mkgm2(gm, igm, logn, primes[0].g, p, p0i); - modp_NTT2(ft, gm, logn, p, p0i); - modp_NTT2(gt, gm, logn, p, p0i); - return; - } - - for (d = 0; d < depth; d ++) { - make_fg_step(data, logn - d, d, - d != 0, (d + 1) < depth || out_ntt); - } -} - -/* - * Solving the NTRU equation, deepest level: compute the resultants of - * f and g with X^N+1, and use binary GCD. The F and G values are - * returned in tmp[]. - * - * Returned value: 1 on success, 0 on error. - */ -static int -solve_NTRU_deepest(unsigned logn_top, - const int8_t *f, const int8_t *g, uint32_t *tmp) -{ - size_t len; - uint32_t *Fp, *Gp, *fp, *gp, *t1, q; - const small_prime *primes; - - len = MAX_BL_SMALL[logn_top]; - primes = PRIMES; - - Fp = tmp; - Gp = Fp + len; - fp = Gp + len; - gp = fp + len; - t1 = gp + len; - - make_fg(fp, f, g, logn_top, logn_top, 0); - - /* - * We use the CRT to rebuild the resultants as big integers. - * There are two such big integers. The resultants are always - * nonnegative. - */ - zint_rebuild_CRT(fp, len, len, 2, primes, 0, t1); - - /* - * Apply the binary GCD. The zint_bezout() function works only - * if both inputs are odd. - * - * We can test on the result and return 0 because that would - * imply failure of the NTRU solving equation, and the (f,g) - * values will be abandoned in that case. - */ - if (!zint_bezout(Gp, Fp, fp, gp, len, t1)) { - return 0; - } - - /* - * Multiply the two values by the target value q. 
Values must - * fit in the destination arrays. - * We can again test on the returned words: a non-zero output - * of zint_mul_small() means that we exceeded our array - * capacity, and that implies failure and rejection of (f,g). - */ - q = 12289; - if (zint_mul_small(Fp, len, q) != 0 - || zint_mul_small(Gp, len, q) != 0) - { - return 0; - } - - return 1; -} - -/* - * Solving the NTRU equation, intermediate level. Upon entry, the F and G - * from the previous level should be in the tmp[] array. - * This function MAY be invoked for the top-level (in which case depth = 0). - * - * Returned value: 1 on success, 0 on error. - */ -static int -solve_NTRU_intermediate(unsigned logn_top, - const int8_t *f, const int8_t *g, unsigned depth, uint32_t *tmp) -{ - /* - * In this function, 'logn' is the log2 of the degree for - * this step. If N = 2^logn, then: - * - the F and G values already in fk->tmp (from the deeper - * levels) have degree N/2; - * - this function should return F and G of degree N. - */ - unsigned logn; - size_t n, hn, slen, dlen, llen, rlen, FGlen, u; - uint32_t *Fd, *Gd, *Ft, *Gt, *ft, *gt, *t1; - fpr *rt1, *rt2, *rt3, *rt4, *rt5; - int scale_fg, minbl_fg, maxbl_fg, maxbl_FG, scale_k; - uint32_t *x, *y; - int32_t *k; - const small_prime *primes; - - logn = logn_top - depth; - n = (size_t)1 << logn; - hn = n >> 1; - - /* - * slen = size for our input f and g; also size of the reduced - * F and G we return (degree N) - * - * dlen = size of the F and G obtained from the deeper level - * (degree N/2 or N/3) - * - * llen = size for intermediary F and G before reduction (degree N) - * - * We build our non-reduced F and G as two independent halves each, - * of degree N/2 (F = F0 + X*F1, G = G0 + X*G1). - */ - slen = MAX_BL_SMALL[depth]; - dlen = MAX_BL_SMALL[depth + 1]; - llen = MAX_BL_LARGE[depth]; - primes = PRIMES; - - /* - * Fd and Gd are the F and G from the deeper level. 
- */ - Fd = tmp; - Gd = Fd + dlen * hn; - - /* - * Compute the input f and g for this level. Note that we get f - * and g in RNS + NTT representation. - */ - ft = Gd + dlen * hn; - make_fg(ft, f, g, logn_top, depth, 1); - - /* - * Move the newly computed f and g to make room for our candidate - * F and G (unreduced). - */ - Ft = tmp; - Gt = Ft + n * llen; - t1 = Gt + n * llen; - memmove(t1, ft, 2 * n * slen * sizeof *ft); - ft = t1; - gt = ft + slen * n; - t1 = gt + slen * n; - - /* - * Move Fd and Gd _after_ f and g. - */ - memmove(t1, Fd, 2 * hn * dlen * sizeof *Fd); - Fd = t1; - Gd = Fd + hn * dlen; - - /* - * We reduce Fd and Gd modulo all the small primes we will need, - * and store the values in Ft and Gt (only n/2 values in each). - */ - for (u = 0; u < llen; u ++) { - uint32_t p, p0i, R2, Rx; - size_t v; - uint32_t *xs, *ys, *xd, *yd; - - p = primes[u].p; - p0i = modp_ninv31(p); - R2 = modp_R2(p, p0i); - Rx = modp_Rx((unsigned)dlen, p, p0i, R2); - for (v = 0, xs = Fd, ys = Gd, xd = Ft + u, yd = Gt + u; - v < hn; - v ++, xs += dlen, ys += dlen, xd += llen, yd += llen) - { - *xd = zint_mod_small_signed(xs, dlen, p, p0i, R2, Rx); - *yd = zint_mod_small_signed(ys, dlen, p, p0i, R2, Rx); - } - } - - /* - * We do not need Fd and Gd after that point. - */ - - /* - * Compute our F and G modulo sufficiently many small primes. - */ - for (u = 0; u < llen; u ++) { - uint32_t p, p0i, R2; - uint32_t *gm, *igm, *fx, *gx, *Fp, *Gp; - size_t v; - - /* - * All computations are done modulo p. - */ - p = primes[u].p; - p0i = modp_ninv31(p); - R2 = modp_R2(p, p0i); - - /* - * If we processed slen words, then f and g have been - * de-NTTized, and are in RNS; we can rebuild them. 
- */ - if (u == slen) { - zint_rebuild_CRT(ft, slen, slen, n, primes, 1, t1); - zint_rebuild_CRT(gt, slen, slen, n, primes, 1, t1); - } - - gm = t1; - igm = gm + n; - fx = igm + n; - gx = fx + n; - - modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i); - - if (u < slen) { - for (v = 0, x = ft + u, y = gt + u; - v < n; v ++, x += slen, y += slen) - { - fx[v] = *x; - gx[v] = *y; - } - modp_iNTT2_ext(ft + u, slen, igm, logn, p, p0i); - modp_iNTT2_ext(gt + u, slen, igm, logn, p, p0i); - } else { - uint32_t Rx; - - Rx = modp_Rx((unsigned)slen, p, p0i, R2); - for (v = 0, x = ft, y = gt; - v < n; v ++, x += slen, y += slen) - { - fx[v] = zint_mod_small_signed(x, slen, - p, p0i, R2, Rx); - gx[v] = zint_mod_small_signed(y, slen, - p, p0i, R2, Rx); - } - modp_NTT2(fx, gm, logn, p, p0i); - modp_NTT2(gx, gm, logn, p, p0i); - } - - /* - * Get F' and G' modulo p and in NTT representation - * (they have degree n/2). These values were computed in - * a previous step, and stored in Ft and Gt. - */ - Fp = gx + n; - Gp = Fp + hn; - for (v = 0, x = Ft + u, y = Gt + u; - v < hn; v ++, x += llen, y += llen) - { - Fp[v] = *x; - Gp[v] = *y; - } - modp_NTT2(Fp, gm, logn - 1, p, p0i); - modp_NTT2(Gp, gm, logn - 1, p, p0i); - - /* - * Compute our F and G modulo p. - * - * General case: - * - * we divide degree by d = 2 or 3 - * f'(x^d) = N(f)(x^d) = f * adj(f) - * g'(x^d) = N(g)(x^d) = g * adj(g) - * f'*G' - g'*F' = q - * F = F'(x^d) * adj(g) - * G = G'(x^d) * adj(f) - * - * We compute things in the NTT. We group roots of phi - * such that all roots x in a group share the same x^d. - * If the roots in a group are x_1, x_2... x_d, then: - * - * N(f)(x_1^d) = f(x_1)*f(x_2)*...*f(x_d) - * - * Thus, we have: - * - * G(x_1) = f(x_2)*f(x_3)*...*f(x_d)*G'(x_1^d) - * G(x_2) = f(x_1)*f(x_3)*...*f(x_d)*G'(x_1^d) - * ... - * G(x_d) = f(x_1)*f(x_2)*...*f(x_{d-1})*G'(x_1^d) - * - * In all cases, we can thus compute F and G in NTT - * representation by a few simple multiplications. 
- * Moreover, in our chosen NTT representation, roots - * from the same group are consecutive in RAM. - */ - for (v = 0, x = Ft + u, y = Gt + u; v < hn; - v ++, x += (llen << 1), y += (llen << 1)) - { - uint32_t ftA, ftB, gtA, gtB; - uint32_t mFp, mGp; - - ftA = fx[(v << 1) + 0]; - ftB = fx[(v << 1) + 1]; - gtA = gx[(v << 1) + 0]; - gtB = gx[(v << 1) + 1]; - mFp = modp_montymul(Fp[v], R2, p, p0i); - mGp = modp_montymul(Gp[v], R2, p, p0i); - x[0] = modp_montymul(gtB, mFp, p, p0i); - x[llen] = modp_montymul(gtA, mFp, p, p0i); - y[0] = modp_montymul(ftB, mGp, p, p0i); - y[llen] = modp_montymul(ftA, mGp, p, p0i); - } - modp_iNTT2_ext(Ft + u, llen, igm, logn, p, p0i); - modp_iNTT2_ext(Gt + u, llen, igm, logn, p, p0i); - } - - /* - * Rebuild F and G with the CRT. - */ - zint_rebuild_CRT(Ft, llen, llen, n, primes, 1, t1); - zint_rebuild_CRT(Gt, llen, llen, n, primes, 1, t1); - - /* - * At that point, Ft, Gt, ft and gt are consecutive in RAM (in that - * order). - */ - - /* - * Apply Babai reduction to bring back F and G to size slen. - * - * We use the FFT to compute successive approximations of the - * reduction coefficient. We first isolate the top bits of - * the coefficients of f and g, and convert them to floating - * point; with the FFT, we compute adj(f), adj(g), and - * 1/(f*adj(f)+g*adj(g)). - * - * Then, we repeatedly apply the following: - * - * - Get the top bits of the coefficients of F and G into - * floating point, and use the FFT to compute: - * (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g)) - * - * - Convert back that value into normal representation, and - * round it to the nearest integers, yielding a polynomial k. - * Proper scaling is applied to f, g, F and G so that the - * coefficients fit on 32 bits (signed). - * - * - Subtract k*f from F and k*g from G. - * - * Under normal conditions, this process reduces the size of F - * and G by some bits at each iteration. 
For constant-time - * operation, we do not want to measure the actual length of - * F and G; instead, we do the following: - * - * - f and g are converted to floating-point, with some scaling - * if necessary to keep values in the representable range. - * - * - For each iteration, we _assume_ a maximum size for F and G, - * and use the values at that size. If we overreach, then - * we get zeros, which is harmless: the resulting coefficients - * of k will be 0 and the value won't be reduced. - * - * - We conservatively assume that F and G will be reduced by - * at least 25 bits at each iteration. - * - * Even when reaching the bottom of the reduction, reduction - * coefficient will remain low. If it goes out-of-range, then - * something wrong occurred and the whole NTRU solving fails. - */ - - /* - * Memory layout: - * - We need to compute and keep adj(f), adj(g), and - * 1/(f*adj(f)+g*adj(g)) (sizes N, N and N/2 fp numbers, - * respectively). - * - At each iteration we need two extra fp buffer (N fp values), - * and produce a k (N 32-bit words). k will be shared with one - * of the fp buffers. - * - To compute k*f and k*g efficiently (with the NTT), we need - * some extra room; we reuse the space of the temporary buffers. - * - * Arrays of 'fpr' are obtained from the temporary array itself. - * We ensure that the base is at a properly aligned offset (the - * source array tmp[] is supposed to be already aligned). - */ - - rt3 = align_fpr(tmp, t1); - rt4 = rt3 + n; - rt5 = rt4 + n; - rt1 = rt5 + (n >> 1); - k = (int32_t *)align_u32(tmp, rt1); - rt2 = align_fpr(tmp, k + n); - if (rt2 < (rt1 + n)) { - rt2 = rt1 + n; - } - t1 = (uint32_t *)k + n; - - /* - * Get f and g into rt3 and rt4 as floating-point approximations. - * - * We need to "scale down" the floating-point representation of - * coefficients when they are too big. We want to keep the value - * below 2^310 or so. Thus, when values are larger than 10 words, - * we consider only the top 10 words. 
Array lengths have been - * computed so that average maximum length will fall in the - * middle or the upper half of these top 10 words. - */ - rlen = (slen > 10) ? 10 : slen; - poly_big_to_fp(rt3, ft + slen - rlen, rlen, slen, logn); - poly_big_to_fp(rt4, gt + slen - rlen, rlen, slen, logn); - - /* - * Values in rt3 and rt4 are downscaled by 2^(scale_fg). - */ - scale_fg = 31 * (int)(slen - rlen); - - /* - * Estimated boundaries for the maximum size (in bits) of the - * coefficients of (f,g). We use the measured average, and - * allow for a deviation of at most six times the standard - * deviation. - */ - minbl_fg = BITLENGTH[depth].avg - 6 * BITLENGTH[depth].std; - maxbl_fg = BITLENGTH[depth].avg + 6 * BITLENGTH[depth].std; - - /* - * Compute 1/(f*adj(f)+g*adj(g)) in rt5. We also keep adj(f) - * and adj(g) in rt3 and rt4, respectively. - */ - Zf(FFT)(rt3, logn); - Zf(FFT)(rt4, logn); - Zf(poly_invnorm2_fft)(rt5, rt3, rt4, logn); - Zf(poly_adj_fft)(rt3, logn); - Zf(poly_adj_fft)(rt4, logn); - - /* - * Reduce F and G repeatedly. - * - * The expected maximum bit length of coefficients of F and G - * is kept in maxbl_FG, with the corresponding word length in - * FGlen. - */ - FGlen = llen; - maxbl_FG = 31 * (int)llen; - - /* - * Each reduction operation computes the reduction polynomial - * "k". We need that polynomial to have coefficients that fit - * on 32-bit signed integers, with some scaling; thus, we use - * a descending sequence of scaling values, down to zero. - * - * The size of the coefficients of k is (roughly) the difference - * between the size of the coefficients of (F,G) and the size - * of the coefficients of (f,g). Thus, the maximum size of the - * coefficients of k is, at the start, maxbl_FG - minbl_fg; - * this is our starting scale value for k. - * - * We need to estimate the size of (F,G) during the execution of - * the algorithm; we are allowed some overestimation but not too - * much (poly_big_to_fp() uses a 310-bit window). 
Generally - * speaking, after applying a reduction with k scaled to - * scale_k, the size of (F,G) will be size(f,g) + scale_k + dd, - * where 'dd' is a few bits to account for the fact that the - * reduction is never perfect (intuitively, dd is on the order - * of sqrt(N), so at most 5 bits; we here allow for 10 extra - * bits). - * - * The size of (f,g) is not known exactly, but maxbl_fg is an - * upper bound. - */ - scale_k = maxbl_FG - minbl_fg; - - for (;;) { - int scale_FG, dc, new_maxbl_FG; - uint32_t scl, sch; - fpr pdc, pt; - - /* - * Convert current F and G into floating-point. We apply - * scaling if the current length is more than 10 words. - */ - rlen = (FGlen > 10) ? 10 : FGlen; - scale_FG = 31 * (int)(FGlen - rlen); - poly_big_to_fp(rt1, Ft + FGlen - rlen, rlen, llen, logn); - poly_big_to_fp(rt2, Gt + FGlen - rlen, rlen, llen, logn); - - /* - * Compute (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g)) in rt2. - */ - Zf(FFT)(rt1, logn); - Zf(FFT)(rt2, logn); - Zf(poly_mul_fft)(rt1, rt3, logn); - Zf(poly_mul_fft)(rt2, rt4, logn); - Zf(poly_add)(rt2, rt1, logn); - Zf(poly_mul_autoadj_fft)(rt2, rt5, logn); - Zf(iFFT)(rt2, logn); - - /* - * (f,g) are scaled by 'scale_fg', meaning that the - * numbers in rt3/rt4 should be multiplied by 2^(scale_fg) - * to have their true mathematical value. - * - * (F,G) are similarly scaled by 'scale_FG'. Therefore, - * the value we computed in rt2 is scaled by - * 'scale_FG-scale_fg'. - * - * We want that value to be scaled by 'scale_k', hence we - * apply a corrective scaling. After scaling, the values - * should fit in -2^31-1..+2^31-1. - */ - dc = scale_k - scale_FG + scale_fg; - - /* - * We will need to multiply values by 2^(-dc). The value - * 'dc' is not secret, so we can compute 2^(-dc) with a - * non-constant-time process. - * (We could use ldexp(), but we prefer to avoid any - * dependency on libm. When using FP emulation, we could - * use our fpr_ldexp(), which is constant-time.) 
- */ - if (dc < 0) { - dc = -dc; - pt = fpr_two; - } else { - pt = fpr_onehalf; - } - pdc = fpr_one; - while (dc != 0) { - if ((dc & 1) != 0) { - pdc = fpr_mul(pdc, pt); - } - dc >>= 1; - pt = fpr_sqr(pt); - } - - for (u = 0; u < n; u ++) { - fpr xv; - - xv = fpr_mul(rt2[u], pdc); - - /* - * Sometimes the values can be out-of-bounds if - * the algorithm fails; we must not call - * fpr_rint() (and cast to int32_t) if the value - * is not in-bounds. Note that the test does not - * break constant-time discipline, since any - * failure here implies that we discard the current - * secret key (f,g). - */ - if (!fpr_lt(fpr_mtwo31m1, xv) - || !fpr_lt(xv, fpr_ptwo31m1)) - { - return 0; - } - k[u] = (int32_t)fpr_rint(xv); - } - - /* - * Values in k[] are integers. They really are scaled - * down by maxbl_FG - minbl_fg bits. - * - * If we are at low depth, then we use the NTT to - * compute k*f and k*g. - */ - sch = (uint32_t)(scale_k / 31); - scl = (uint32_t)(scale_k % 31); - if (depth <= DEPTH_INT_FG) { - poly_sub_scaled_ntt(Ft, FGlen, llen, ft, slen, slen, - k, sch, scl, logn, t1); - poly_sub_scaled_ntt(Gt, FGlen, llen, gt, slen, slen, - k, sch, scl, logn, t1); - } else { - poly_sub_scaled(Ft, FGlen, llen, ft, slen, slen, - k, sch, scl, logn); - poly_sub_scaled(Gt, FGlen, llen, gt, slen, slen, - k, sch, scl, logn); - } - - /* - * We compute the new maximum size of (F,G), assuming that - * (f,g) has _maximal_ length (i.e. that reduction is - * "late" instead of "early". We also adjust FGlen - * accordingly. - */ - new_maxbl_FG = scale_k + maxbl_fg + 10; - if (new_maxbl_FG < maxbl_FG) { - maxbl_FG = new_maxbl_FG; - if ((int)FGlen * 31 >= maxbl_FG + 31) { - FGlen --; - } - } - - /* - * We suppose that scaling down achieves a reduction by - * at least 25 bits per iteration. We stop when we have - * done the loop with an unscaled k. 
- */ - if (scale_k <= 0) { - break; - } - scale_k -= 25; - if (scale_k < 0) { - scale_k = 0; - } - } - - /* - * If (F,G) length was lowered below 'slen', then we must take - * care to re-extend the sign. - */ - if (FGlen < slen) { - for (u = 0; u < n; u ++, Ft += llen, Gt += llen) { - size_t v; - uint32_t sw; - - sw = -(Ft[FGlen - 1] >> 30) >> 1; - for (v = FGlen; v < slen; v ++) { - Ft[v] = sw; - } - sw = -(Gt[FGlen - 1] >> 30) >> 1; - for (v = FGlen; v < slen; v ++) { - Gt[v] = sw; - } - } - } - - /* - * Compress encoding of all values to 'slen' words (this is the - * expected output format). - */ - for (u = 0, x = tmp, y = tmp; - u < (n << 1); u ++, x += slen, y += llen) - { - memmove(x, y, slen * sizeof *y); - } - return 1; -} - -/* - * Solving the NTRU equation, binary case, depth = 1. Upon entry, the - * F and G from the previous level should be in the tmp[] array. - * - * Returned value: 1 on success, 0 on error. - */ -static int -solve_NTRU_binary_depth1(unsigned logn_top, - const int8_t *f, const int8_t *g, uint32_t *tmp) -{ - /* - * The first half of this function is a copy of the corresponding - * part in solve_NTRU_intermediate(), for the reconstruction of - * the unreduced F and G. The second half (Babai reduction) is - * done differently, because the unreduced F and G fit in 53 bits - * of precision, allowing a much simpler process with lower RAM - * usage. 
- */ - unsigned depth, logn; - size_t n_top, n, hn, slen, dlen, llen, u; - uint32_t *Fd, *Gd, *Ft, *Gt, *ft, *gt, *t1; - fpr *rt1, *rt2, *rt3, *rt4, *rt5, *rt6; - uint32_t *x, *y; - - depth = 1; - n_top = (size_t)1 << logn_top; - logn = logn_top - depth; - n = (size_t)1 << logn; - hn = n >> 1; - - /* - * Equations are: - * - * f' = f0^2 - X^2*f1^2 - * g' = g0^2 - X^2*g1^2 - * F' and G' are a solution to f'G' - g'F' = q (from deeper levels) - * F = F'*(g0 - X*g1) - * G = G'*(f0 - X*f1) - * - * f0, f1, g0, g1, f', g', F' and G' are all "compressed" to - * degree N/2 (their odd-indexed coefficients are all zero). - */ - - /* - * slen = size for our input f and g; also size of the reduced - * F and G we return (degree N) - * - * dlen = size of the F and G obtained from the deeper level - * (degree N/2) - * - * llen = size for intermediary F and G before reduction (degree N) - * - * We build our non-reduced F and G as two independent halves each, - * of degree N/2 (F = F0 + X*F1, G = G0 + X*G1). - */ - slen = MAX_BL_SMALL[depth]; - dlen = MAX_BL_SMALL[depth + 1]; - llen = MAX_BL_LARGE[depth]; - - /* - * Fd and Gd are the F and G from the deeper level. Ft and Gt - * are the destination arrays for the unreduced F and G. - */ - Fd = tmp; - Gd = Fd + dlen * hn; - Ft = Gd + dlen * hn; - Gt = Ft + llen * n; - - /* - * We reduce Fd and Gd modulo all the small primes we will need, - * and store the values in Ft and Gt. - */ - for (u = 0; u < llen; u ++) { - uint32_t p, p0i, R2, Rx; - size_t v; - uint32_t *xs, *ys, *xd, *yd; - - p = PRIMES[u].p; - p0i = modp_ninv31(p); - R2 = modp_R2(p, p0i); - Rx = modp_Rx((unsigned)dlen, p, p0i, R2); - for (v = 0, xs = Fd, ys = Gd, xd = Ft + u, yd = Gt + u; - v < hn; - v ++, xs += dlen, ys += dlen, xd += llen, yd += llen) - { - *xd = zint_mod_small_signed(xs, dlen, p, p0i, R2, Rx); - *yd = zint_mod_small_signed(ys, dlen, p, p0i, R2, Rx); - } - } - - /* - * Now Fd and Gd are not needed anymore; we can squeeze them out. 
- */ - memmove(tmp, Ft, llen * n * sizeof(uint32_t)); - Ft = tmp; - memmove(Ft + llen * n, Gt, llen * n * sizeof(uint32_t)); - Gt = Ft + llen * n; - ft = Gt + llen * n; - gt = ft + slen * n; - - t1 = gt + slen * n; - - /* - * Compute our F and G modulo sufficiently many small primes. - */ - for (u = 0; u < llen; u ++) { - uint32_t p, p0i, R2; - uint32_t *gm, *igm, *fx, *gx, *Fp, *Gp; - unsigned e; - size_t v; - - /* - * All computations are done modulo p. - */ - p = PRIMES[u].p; - p0i = modp_ninv31(p); - R2 = modp_R2(p, p0i); - - /* - * We recompute things from the source f and g, of full - * degree. However, we will need only the n first elements - * of the inverse NTT table (igm); the call to modp_mkgm() - * below will fill n_top elements in igm[] (thus overflowing - * into fx[]) but later code will overwrite these extra - * elements. - */ - gm = t1; - igm = gm + n_top; - fx = igm + n; - gx = fx + n_top; - modp_mkgm2(gm, igm, logn_top, PRIMES[u].g, p, p0i); - - /* - * Set ft and gt to f and g modulo p, respectively. - */ - for (v = 0; v < n_top; v ++) { - fx[v] = modp_set(f[v], p); - gx[v] = modp_set(g[v], p); - } - - /* - * Convert to NTT and compute our f and g. - */ - modp_NTT2(fx, gm, logn_top, p, p0i); - modp_NTT2(gx, gm, logn_top, p, p0i); - for (e = logn_top; e > logn; e --) { - modp_poly_rec_res(fx, e, p, p0i, R2); - modp_poly_rec_res(gx, e, p, p0i, R2); - } - - /* - * From that point onward, we only need tables for - * degree n, so we can save some space. - */ - if (depth > 0) { /* always true */ - memmove(gm + n, igm, n * sizeof *igm); - igm = gm + n; - memmove(igm + n, fx, n * sizeof *ft); - fx = igm + n; - memmove(fx + n, gx, n * sizeof *gt); - gx = fx + n; - } - - /* - * Get F' and G' modulo p and in NTT representation - * (they have degree n/2). These values were computed - * in a previous step, and stored in Ft and Gt. 
- */ - Fp = gx + n; - Gp = Fp + hn; - for (v = 0, x = Ft + u, y = Gt + u; - v < hn; v ++, x += llen, y += llen) - { - Fp[v] = *x; - Gp[v] = *y; - } - modp_NTT2(Fp, gm, logn - 1, p, p0i); - modp_NTT2(Gp, gm, logn - 1, p, p0i); - - /* - * Compute our F and G modulo p. - * - * Equations are: - * - * f'(x^2) = N(f)(x^2) = f * adj(f) - * g'(x^2) = N(g)(x^2) = g * adj(g) - * - * f'*G' - g'*F' = q - * - * F = F'(x^2) * adj(g) - * G = G'(x^2) * adj(f) - * - * The NTT representation of f is f(w) for all w which - * are roots of phi. In the binary case, as well as in - * the ternary case for all depth except the deepest, - * these roots can be grouped in pairs (w,-w), and we - * then have: - * - * f(w) = adj(f)(-w) - * f(-w) = adj(f)(w) - * - * and w^2 is then a root for phi at the half-degree. - * - * At the deepest level in the ternary case, this still - * holds, in the following sense: the roots of x^2-x+1 - * are (w,-w^2) (for w^3 = -1, and w != -1), and we - * have: - * - * f(w) = adj(f)(-w^2) - * f(-w^2) = adj(f)(w) - * - * In all case, we can thus compute F and G in NTT - * representation by a few simple multiplications. - * Moreover, the two roots for each pair are consecutive - * in our bit-reversal encoding. - */ - for (v = 0, x = Ft + u, y = Gt + u; - v < hn; v ++, x += (llen << 1), y += (llen << 1)) - { - uint32_t ftA, ftB, gtA, gtB; - uint32_t mFp, mGp; - - ftA = fx[(v << 1) + 0]; - ftB = fx[(v << 1) + 1]; - gtA = gx[(v << 1) + 0]; - gtB = gx[(v << 1) + 1]; - mFp = modp_montymul(Fp[v], R2, p, p0i); - mGp = modp_montymul(Gp[v], R2, p, p0i); - x[0] = modp_montymul(gtB, mFp, p, p0i); - x[llen] = modp_montymul(gtA, mFp, p, p0i); - y[0] = modp_montymul(ftB, mGp, p, p0i); - y[llen] = modp_montymul(ftA, mGp, p, p0i); - } - modp_iNTT2_ext(Ft + u, llen, igm, logn, p, p0i); - modp_iNTT2_ext(Gt + u, llen, igm, logn, p, p0i); - - /* - * Also save ft and gt (only up to size slen). 
- */ - if (u < slen) { - modp_iNTT2(fx, igm, logn, p, p0i); - modp_iNTT2(gx, igm, logn, p, p0i); - for (v = 0, x = ft + u, y = gt + u; - v < n; v ++, x += slen, y += slen) - { - *x = fx[v]; - *y = gx[v]; - } - } - } - - /* - * Rebuild f, g, F and G with the CRT. Note that the elements of F - * and G are consecutive, and thus can be rebuilt in a single - * loop; similarly, the elements of f and g are consecutive. - */ - zint_rebuild_CRT(Ft, llen, llen, n << 1, PRIMES, 1, t1); - zint_rebuild_CRT(ft, slen, slen, n << 1, PRIMES, 1, t1); - - /* - * Here starts the Babai reduction, specialized for depth = 1. - * - * Candidates F and G (from Ft and Gt), and base f and g (ft and gt), - * are converted to floating point. There is no scaling, and a - * single pass is sufficient. - */ - - /* - * Convert F and G into floating point (rt1 and rt2). - */ - rt1 = align_fpr(tmp, gt + slen * n); - rt2 = rt1 + n; - poly_big_to_fp(rt1, Ft, llen, llen, logn); - poly_big_to_fp(rt2, Gt, llen, llen, logn); - - /* - * Integer representation of F and G is no longer needed, we - * can remove it. - */ - memmove(tmp, ft, 2 * slen * n * sizeof *ft); - ft = tmp; - gt = ft + slen * n; - rt3 = align_fpr(tmp, gt + slen * n); - memmove(rt3, rt1, 2 * n * sizeof *rt1); - rt1 = rt3; - rt2 = rt1 + n; - rt3 = rt2 + n; - rt4 = rt3 + n; - - /* - * Convert f and g into floating point (rt3 and rt4). - */ - poly_big_to_fp(rt3, ft, slen, slen, logn); - poly_big_to_fp(rt4, gt, slen, slen, logn); - - /* - * Remove unneeded ft and gt. - */ - memmove(tmp, rt1, 4 * n * sizeof *rt1); - rt1 = (fpr *)tmp; - rt2 = rt1 + n; - rt3 = rt2 + n; - rt4 = rt3 + n; - - /* - * We now have: - * rt1 = F - * rt2 = G - * rt3 = f - * rt4 = g - * in that order in RAM. We convert all of them to FFT. - */ - Zf(FFT)(rt1, logn); - Zf(FFT)(rt2, logn); - Zf(FFT)(rt3, logn); - Zf(FFT)(rt4, logn); - - /* - * Compute: - * rt5 = F*adj(f) + G*adj(g) - * rt6 = 1 / (f*adj(f) + g*adj(g)) - * (Note that rt6 is half-length.) 
- */ - rt5 = rt4 + n; - rt6 = rt5 + n; - Zf(poly_add_muladj_fft)(rt5, rt1, rt2, rt3, rt4, logn); - Zf(poly_invnorm2_fft)(rt6, rt3, rt4, logn); - - /* - * Compute: - * rt5 = (F*adj(f)+G*adj(g)) / (f*adj(f)+g*adj(g)) - */ - Zf(poly_mul_autoadj_fft)(rt5, rt6, logn); - - /* - * Compute k as the rounded version of rt5. Check that none of - * the values is larger than 2^63-1 (in absolute value) - * because that would make the fpr_rint() do something undefined; - * note that any out-of-bounds value here implies a failure and - * (f,g) will be discarded, so we can make a simple test. - */ - Zf(iFFT)(rt5, logn); - for (u = 0; u < n; u ++) { - fpr z; - - z = rt5[u]; - if (!fpr_lt(z, fpr_ptwo63m1) || !fpr_lt(fpr_mtwo63m1, z)) { - return 0; - } - rt5[u] = fpr_of(fpr_rint(z)); - } - Zf(FFT)(rt5, logn); - - /* - * Subtract k*f from F, and k*g from G. - */ - Zf(poly_mul_fft)(rt3, rt5, logn); - Zf(poly_mul_fft)(rt4, rt5, logn); - Zf(poly_sub)(rt1, rt3, logn); - Zf(poly_sub)(rt2, rt4, logn); - Zf(iFFT)(rt1, logn); - Zf(iFFT)(rt2, logn); - - /* - * Convert back F and G to integers, and return. - */ - Ft = tmp; - Gt = Ft + n; - rt3 = align_fpr(tmp, Gt + n); - memmove(rt3, rt1, 2 * n * sizeof *rt1); - rt1 = rt3; - rt2 = rt1 + n; - for (u = 0; u < n; u ++) { - Ft[u] = (uint32_t)fpr_rint(rt1[u]); - Gt[u] = (uint32_t)fpr_rint(rt2[u]); - } - - return 1; -} - -/* - * Solving the NTRU equation, top level. Upon entry, the F and G - * from the previous level should be in the tmp[] array. - * - * Returned value: 1 on success, 0 on error. 
- */ -static int -solve_NTRU_binary_depth0(unsigned logn, - const int8_t *f, const int8_t *g, uint32_t *tmp) -{ - size_t n, hn, u; - uint32_t p, p0i, R2; - uint32_t *Fp, *Gp, *t1, *t2, *t3, *t4, *t5; - uint32_t *gm, *igm, *ft, *gt; - fpr *rt2, *rt3; - - n = (size_t)1 << logn; - hn = n >> 1; - - /* - * Equations are: - * - * f' = f0^2 - X^2*f1^2 - * g' = g0^2 - X^2*g1^2 - * F' and G' are a solution to f'G' - g'F' = q (from deeper levels) - * F = F'*(g0 - X*g1) - * G = G'*(f0 - X*f1) - * - * f0, f1, g0, g1, f', g', F' and G' are all "compressed" to - * degree N/2 (their odd-indexed coefficients are all zero). - * - * Everything should fit in 31-bit integers, hence we can just use - * the first small prime p = 2147473409. - */ - p = PRIMES[0].p; - p0i = modp_ninv31(p); - R2 = modp_R2(p, p0i); - - Fp = tmp; - Gp = Fp + hn; - ft = Gp + hn; - gt = ft + n; - gm = gt + n; - igm = gm + n; - - modp_mkgm2(gm, igm, logn, PRIMES[0].g, p, p0i); - - /* - * Convert F' anf G' in NTT representation. - */ - for (u = 0; u < hn; u ++) { - Fp[u] = modp_set(zint_one_to_plain(Fp + u), p); - Gp[u] = modp_set(zint_one_to_plain(Gp + u), p); - } - modp_NTT2(Fp, gm, logn - 1, p, p0i); - modp_NTT2(Gp, gm, logn - 1, p, p0i); - - /* - * Load f and g and convert them to NTT representation. - */ - for (u = 0; u < n; u ++) { - ft[u] = modp_set(f[u], p); - gt[u] = modp_set(g[u], p); - } - modp_NTT2(ft, gm, logn, p, p0i); - modp_NTT2(gt, gm, logn, p, p0i); - - /* - * Build the unreduced F,G in ft and gt. 
- */ - for (u = 0; u < n; u += 2) { - uint32_t ftA, ftB, gtA, gtB; - uint32_t mFp, mGp; - - ftA = ft[u + 0]; - ftB = ft[u + 1]; - gtA = gt[u + 0]; - gtB = gt[u + 1]; - mFp = modp_montymul(Fp[u >> 1], R2, p, p0i); - mGp = modp_montymul(Gp[u >> 1], R2, p, p0i); - ft[u + 0] = modp_montymul(gtB, mFp, p, p0i); - ft[u + 1] = modp_montymul(gtA, mFp, p, p0i); - gt[u + 0] = modp_montymul(ftB, mGp, p, p0i); - gt[u + 1] = modp_montymul(ftA, mGp, p, p0i); - } - modp_iNTT2(ft, igm, logn, p, p0i); - modp_iNTT2(gt, igm, logn, p, p0i); - - Gp = Fp + n; - t1 = Gp + n; - memmove(Fp, ft, 2 * n * sizeof *ft); - - /* - * We now need to apply the Babai reduction. At that point, - * we have F and G in two n-word arrays. - * - * We can compute F*adj(f)+G*adj(g) and f*adj(f)+g*adj(g) - * modulo p, using the NTT. We still move memory around in - * order to save RAM. - */ - t2 = t1 + n; - t3 = t2 + n; - t4 = t3 + n; - t5 = t4 + n; - - /* - * Compute the NTT tables in t1 and t2. We do not keep t2 - * (we'll recompute it later on). - */ - modp_mkgm2(t1, t2, logn, PRIMES[0].g, p, p0i); - - /* - * Convert F and G to NTT. - */ - modp_NTT2(Fp, t1, logn, p, p0i); - modp_NTT2(Gp, t1, logn, p, p0i); - - /* - * Load f and adj(f) in t4 and t5, and convert them to NTT - * representation. - */ - t4[0] = t5[0] = modp_set(f[0], p); - for (u = 1; u < n; u ++) { - t4[u] = modp_set(f[u], p); - t5[n - u] = modp_set(-f[u], p); - } - modp_NTT2(t4, t1, logn, p, p0i); - modp_NTT2(t5, t1, logn, p, p0i); - - /* - * Compute F*adj(f) in t2, and f*adj(f) in t3. - */ - for (u = 0; u < n; u ++) { - uint32_t w; - - w = modp_montymul(t5[u], R2, p, p0i); - t2[u] = modp_montymul(w, Fp[u], p, p0i); - t3[u] = modp_montymul(w, t4[u], p, p0i); - } - - /* - * Load g and adj(g) in t4 and t5, and convert them to NTT - * representation. 
- */ - t4[0] = t5[0] = modp_set(g[0], p); - for (u = 1; u < n; u ++) { - t4[u] = modp_set(g[u], p); - t5[n - u] = modp_set(-g[u], p); - } - modp_NTT2(t4, t1, logn, p, p0i); - modp_NTT2(t5, t1, logn, p, p0i); - - /* - * Add G*adj(g) to t2, and g*adj(g) to t3. - */ - for (u = 0; u < n; u ++) { - uint32_t w; - - w = modp_montymul(t5[u], R2, p, p0i); - t2[u] = modp_add(t2[u], - modp_montymul(w, Gp[u], p, p0i), p); - t3[u] = modp_add(t3[u], - modp_montymul(w, t4[u], p, p0i), p); - } - - /* - * Convert back t2 and t3 to normal representation (normalized - * around 0), and then - * move them to t1 and t2. We first need to recompute the - * inverse table for NTT. - */ - modp_mkgm2(t1, t4, logn, PRIMES[0].g, p, p0i); - modp_iNTT2(t2, t4, logn, p, p0i); - modp_iNTT2(t3, t4, logn, p, p0i); - for (u = 0; u < n; u ++) { - t1[u] = (uint32_t)modp_norm(t2[u], p); - t2[u] = (uint32_t)modp_norm(t3[u], p); - } - - /* - * At that point, array contents are: - * - * F (NTT representation) (Fp) - * G (NTT representation) (Gp) - * F*adj(f)+G*adj(g) (t1) - * f*adj(f)+g*adj(g) (t2) - * - * We want to divide t1 by t2. The result is not integral; it - * must be rounded. We thus need to use the FFT. - */ - - /* - * Get f*adj(f)+g*adj(g) in FFT representation. Since this - * polynomial is auto-adjoint, all its coordinates in FFT - * representation are actually real, so we can truncate off - * the imaginary parts. - */ - rt3 = align_fpr(tmp, t3); - for (u = 0; u < n; u ++) { - rt3[u] = fpr_of(((int32_t *)t2)[u]); - } - Zf(FFT)(rt3, logn); - rt2 = align_fpr(tmp, t2); - memmove(rt2, rt3, hn * sizeof *rt3); - - /* - * Convert F*adj(f)+G*adj(g) in FFT representation. - */ - rt3 = rt2 + hn; - for (u = 0; u < n; u ++) { - rt3[u] = fpr_of(((int32_t *)t1)[u]); - } - Zf(FFT)(rt3, logn); - - /* - * Compute (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g)) and get - * its rounded normal representation in t1. 
- */ - Zf(poly_div_autoadj_fft)(rt3, rt2, logn); - Zf(iFFT)(rt3, logn); - for (u = 0; u < n; u ++) { - t1[u] = modp_set((int32_t)fpr_rint(rt3[u]), p); - } - - /* - * RAM contents are now: - * - * F (NTT representation) (Fp) - * G (NTT representation) (Gp) - * k (t1) - * - * We want to compute F-k*f, and G-k*g. - */ - t2 = t1 + n; - t3 = t2 + n; - t4 = t3 + n; - t5 = t4 + n; - modp_mkgm2(t2, t3, logn, PRIMES[0].g, p, p0i); - for (u = 0; u < n; u ++) { - t4[u] = modp_set(f[u], p); - t5[u] = modp_set(g[u], p); - } - modp_NTT2(t1, t2, logn, p, p0i); - modp_NTT2(t4, t2, logn, p, p0i); - modp_NTT2(t5, t2, logn, p, p0i); - for (u = 0; u < n; u ++) { - uint32_t kw; - - kw = modp_montymul(t1[u], R2, p, p0i); - Fp[u] = modp_sub(Fp[u], - modp_montymul(kw, t4[u], p, p0i), p); - Gp[u] = modp_sub(Gp[u], - modp_montymul(kw, t5[u], p, p0i), p); - } - modp_iNTT2(Fp, t3, logn, p, p0i); - modp_iNTT2(Gp, t3, logn, p, p0i); - for (u = 0; u < n; u ++) { - Fp[u] = (uint32_t)modp_norm(Fp[u], p); - Gp[u] = (uint32_t)modp_norm(Gp[u], p); - } - - return 1; -} - -/* - * Solve the NTRU equation. Returned value is 1 on success, 0 on error. - * G can be NULL, in which case that value is computed but not returned. - * If any of the coefficients of F and G exceeds lim (in absolute value), - * then 0 is returned. - */ -static int -solve_NTRU(unsigned logn, int8_t *F, int8_t *G, - const int8_t *f, const int8_t *g, int lim, uint32_t *tmp) -{ - size_t n, u; - uint32_t *ft, *gt, *Ft, *Gt, *gm; - uint32_t p, p0i, r; - const small_prime *primes; - - n = MKN(logn); - - if (!solve_NTRU_deepest(logn, f, g, tmp)) { - return 0; - } - - /* - * For logn <= 2, we need to use solve_NTRU_intermediate() - * directly, because coefficients are a bit too large and - * do not fit the hypotheses in solve_NTRU_binary_depth0(). 
- */ - if (logn <= 2) { - unsigned depth; - - depth = logn; - while (depth -- > 0) { - if (!solve_NTRU_intermediate(logn, f, g, depth, tmp)) { - return 0; - } - } - } else { - unsigned depth; - - depth = logn; - while (depth -- > 2) { - if (!solve_NTRU_intermediate(logn, f, g, depth, tmp)) { - return 0; - } - } - if (!solve_NTRU_binary_depth1(logn, f, g, tmp)) { - return 0; - } - if (!solve_NTRU_binary_depth0(logn, f, g, tmp)) { - return 0; - } - } - - /* - * If no buffer has been provided for G, use a temporary one. - */ - if (G == NULL) { - G = (int8_t *)(tmp + 2 * n); - } - - /* - * Final F and G are in fk->tmp, one word per coefficient - * (signed value over 31 bits). - */ - if (!poly_big_to_small(F, tmp, lim, logn) - || !poly_big_to_small(G, tmp + n, lim, logn)) - { - return 0; - } - - /* - * Verify that the NTRU equation is fulfilled. Since all elements - * have short lengths, verifying modulo a small prime p works, and - * allows using the NTT. - * - * We put Gt[] first in tmp[], and process it first, so that it does - * not overlap with G[] in case we allocated it ourselves. - */ - Gt = tmp; - ft = Gt + n; - gt = ft + n; - Ft = gt + n; - gm = Ft + n; - - primes = PRIMES; - p = primes[0].p; - p0i = modp_ninv31(p); - modp_mkgm2(gm, tmp, logn, primes[0].g, p, p0i); - for (u = 0; u < n; u ++) { - Gt[u] = modp_set(G[u], p); - } - for (u = 0; u < n; u ++) { - ft[u] = modp_set(f[u], p); - gt[u] = modp_set(g[u], p); - Ft[u] = modp_set(F[u], p); - } - modp_NTT2(ft, gm, logn, p, p0i); - modp_NTT2(gt, gm, logn, p, p0i); - modp_NTT2(Ft, gm, logn, p, p0i); - modp_NTT2(Gt, gm, logn, p, p0i); - r = modp_montymul(12289, 1, p, p0i); - for (u = 0; u < n; u ++) { - uint32_t z; - - z = modp_sub(modp_montymul(ft[u], Gt[u], p, p0i), - modp_montymul(gt[u], Ft[u], p, p0i), p); - if (z != r) { - return 0; - } - } - - return 1; -} - -/* - * Generate a random polynomial with a Gaussian distribution. 
This function - * also makes sure that the resultant of the polynomial with phi is odd. - */ -static void -poly_small_mkgauss(RNG_CONTEXT *rng, int8_t *f, unsigned logn) -{ - size_t n, u; - unsigned mod2; - - n = MKN(logn); - mod2 = 0; - for (u = 0; u < n; u ++) { - int s; - - restart: - s = mkgauss(rng, logn); - - /* - * We need the coefficient to fit within -127..+127; - * realistically, this is always the case except for - * the very low degrees (N = 2 or 4), for which there - * is no real security anyway. - */ - if (s < -127 || s > 127) { - goto restart; - } - - /* - * We need the sum of all coefficients to be 1; otherwise, - * the resultant of the polynomial with X^N+1 will be even, - * and the binary GCD will fail. - */ - if (u == n - 1) { - if ((mod2 ^ (unsigned)(s & 1)) == 0) { - goto restart; - } - } else { - mod2 ^= (unsigned)(s & 1); - } - f[u] = (int8_t)s; - } -} - -/* see falcon.h */ -void -Zf(keygen)(inner_shake256_context *rng, - int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h, - unsigned logn, uint8_t *tmp) -{ - /* - * Algorithm is the following: - * - * - Generate f and g with the Gaussian distribution. - * - * - If either Res(f,phi) or Res(g,phi) is even, try again. - * - * - If ||(f,g)|| is too large, try again. - * - * - If ||B~_{f,g}|| is too large, try again. - * - * - If f is not invertible mod phi mod q, try again. - * - * - Compute h = g/f mod phi mod q. - * - * - Solve the NTRU equation fG - gF = q; if the solving fails, - * try again. Usual failure condition is when Res(f,phi) - * and Res(g,phi) are not prime to each other. 
- */ - size_t n, u; - uint16_t *h2, *tmp2; - RNG_CONTEXT *rc; -#if FALCON_KG_CHACHA20 // yyyKG_CHACHA20+1 - prng p; -#endif // yyyKG_CHACHA20- - - n = MKN(logn); -#if FALCON_KG_CHACHA20 // yyyKG_CHACHA20+1 - Zf(prng_init)(&p, rng); - rc = &p; -#else // yyyKG_CHACHA20+0 - rc = rng; -#endif // yyyKG_CHACHA20- - - /* - * We need to generate f and g randomly, until we find values - * such that the norm of (g,-f), and of the orthogonalized - * vector, are satisfying. The orthogonalized vector is: - * (q*adj(f)/(f*adj(f)+g*adj(g)), q*adj(g)/(f*adj(f)+g*adj(g))) - * (it is actually the (N+1)-th row of the Gram-Schmidt basis). - * - * In the binary case, coefficients of f and g are generated - * independently of each other, with a discrete Gaussian - * distribution of standard deviation 1.17*sqrt(q/(2*N)). Then, - * the two vectors have expected norm 1.17*sqrt(q), which is - * also our acceptance bound: we require both vectors to be no - * larger than that (this will be satisfied about 1/4th of the - * time, thus we expect sampling new (f,g) about 4 times for that - * step). - * - * We require that Res(f,phi) and Res(g,phi) are both odd (the - * NTRU equation solver requires it). - */ - for (;;) { - fpr *rt1, *rt2, *rt3; - fpr bnorm; - uint32_t normf, normg, norm; - int lim; - - /* - * The poly_small_mkgauss() function makes sure - * that the sum of coefficients is 1 modulo 2 - * (i.e. the resultant of the polynomial with phi - * will be odd). - */ - poly_small_mkgauss(rc, f, logn); - poly_small_mkgauss(rc, g, logn); - - /* - * Verify that all coefficients are within the bounds - * defined in max_fg_bits. This is the case with - * overwhelming probability; this guarantees that the - * key will be encodable with FALCON_COMP_TRIM. - */ - lim = 1 << (Zf(max_fg_bits)[logn] - 1); - for (u = 0; u < n; u ++) { - /* - * We can use non-CT tests since on any failure - * we will discard f and g. 
- */ - if (f[u] >= lim || f[u] <= -lim - || g[u] >= lim || g[u] <= -lim) - { - lim = -1; - break; - } - } - if (lim < 0) { - continue; - } - - /* - * Bound is 1.17*sqrt(q). We compute the squared - * norms. With q = 12289, the squared bound is: - * (1.17^2)* 12289 = 16822.4121 - * Since f and g are integral, the squared norm - * of (g,-f) is an integer. - */ - normf = poly_small_sqnorm(f, logn); - normg = poly_small_sqnorm(g, logn); - norm = (normf + normg) | -((normf | normg) >> 31); - if (norm >= 16823) { - continue; - } - - /* - * We compute the orthogonalized vector norm. - */ - rt1 = (fpr *)tmp; - rt2 = rt1 + n; - rt3 = rt2 + n; - poly_small_to_fp(rt1, f, logn); - poly_small_to_fp(rt2, g, logn); - Zf(FFT)(rt1, logn); - Zf(FFT)(rt2, logn); - Zf(poly_invnorm2_fft)(rt3, rt1, rt2, logn); - Zf(poly_adj_fft)(rt1, logn); - Zf(poly_adj_fft)(rt2, logn); - Zf(poly_mulconst)(rt1, fpr_q, logn); - Zf(poly_mulconst)(rt2, fpr_q, logn); - Zf(poly_mul_autoadj_fft)(rt1, rt3, logn); - Zf(poly_mul_autoadj_fft)(rt2, rt3, logn); - Zf(iFFT)(rt1, logn); - Zf(iFFT)(rt2, logn); - bnorm = fpr_zero; - for (u = 0; u < n; u ++) { - bnorm = fpr_add(bnorm, fpr_sqr(rt1[u])); - bnorm = fpr_add(bnorm, fpr_sqr(rt2[u])); - } - if (!fpr_lt(bnorm, fpr_bnorm_max)) { - continue; - } - - /* - * Compute public key h = g/f mod X^N+1 mod q. If this - * fails, we must restart. - */ - if (h == NULL) { - h2 = (uint16_t *)tmp; - tmp2 = h2 + n; - } else { - h2 = h; - tmp2 = (uint16_t *)tmp; - } - if (!Zf(compute_public)(h2, f, g, logn, (uint8_t *)tmp2)) { - continue; - } - - /* - * Solve the NTRU equation to get F and G. - */ - lim = (1 << (Zf(max_FG_bits)[logn] - 1)) - 1; - if (!solve_NTRU(logn, F, G, f, g, lim, (uint32_t *)tmp)) { - continue; - } - - /* - * Key pair is generated. 
- */ - break; - } -} diff --git a/crypto_sign/falcon-512-tree/m4-ct/pqm4.c b/crypto_sign/falcon-512-tree/m4-ct/pqm4.c deleted file mode 100644 index 74b83a8b..00000000 --- a/crypto_sign/falcon-512-tree/m4-ct/pqm4.c +++ /dev/null @@ -1,347 +0,0 @@ -#include -#include - -#include "api.h" -#include "inner.h" -#include "randombytes.h" - -/* ==================================================================== */ - -/* - * Falcon degree is N = 2^LOGN, where LOGN=9 (for Falcon-512) or 10 - * (for Falcon-1024). We use the advertised public key size to know - * which degree is used. - */ -#if CRYPTO_PUBLICKEYBYTES == 897 -#define LOGN 9 -#elif CRYPTO_PUBLICKEYBYTES == 1793 -#define LOGN 10 -#else -#error Unknown Falcon degree (unexpected public key size) -#endif - -#define N ((size_t)1 << LOGN) -#define NONCELEN 40 -#define SEEDLEN 48 - -/* - * If the private key length is larger than 10000, then this is the - * variant with precomputed expanded keys. - */ -#if CRYPTO_SECRETKEYBYTES > 10000 -#define KG_EXPAND 1 -#else -#define KG_EXPAND 0 -#endif - -/* - * Common buffer, to avoid bulky stack allocation. The buffer sizes are - * all expressed in bytes, but the buffer must be suitably aligned for - * 64-bit integers and floating-point values. 
- * - * Required size (in bytes): - * - * With expanded key: - * keygen: 48*N + 6*N = 54*N - * sign: 48*N + 2*N = 50*N - * vrfy: 8*N - * - * Without expanded key: - * keygen: 28*N + 5*N = 33*N - * sign: 72*N + 6*N = 78*N - * vrfy: 8*N - */ -static union { -#if KG_EXPAND - uint8_t b[54 * N]; -#else - uint8_t b[78 * N]; -#endif - uint64_t dummy_u64; - fpr dummy_fp; -} tmp; - -int -crypto_sign_keypair(unsigned char *pk, unsigned char *sk) -{ - int8_t *f, *g, *F, *G; - uint16_t *h; - inner_shake256_context rng; - unsigned char seed[SEEDLEN]; -#if KG_EXPAND - size_t v; -#else - size_t u, v; -#endif - unsigned sav_cw; - -#if KG_EXPAND - f = (int8_t *)&tmp.b[48 * N]; - g = f + N; - F = g + N; - G = F + N; - h = (uint16_t *)(G + N); -#else - f = (int8_t *)&tmp.b[28 * N]; - g = f + N; - F = g + N; - G = NULL; - h = (uint16_t *)(F + N); -#endif - - randombytes(seed, SEEDLEN); - inner_shake256_init(&rng); - inner_shake256_inject(&rng, seed, SEEDLEN); - inner_shake256_flip(&rng); - sav_cw = set_fpu_cw(2); - Zf(keygen)(&rng, f, g, F, G, h, LOGN, tmp.b); - -#if KG_EXPAND - /* - * Expand private key. - */ - Zf(expand_privkey)((fpr *)sk, f, g, F, G, LOGN, tmp.b); - set_fpu_cw(sav_cw); -#else - set_fpu_cw(sav_cw); - - /* - * Encode private key. - */ - sk[0] = 0x50 + LOGN; - u = 1; - v = Zf(trim_i8_encode)(sk + u, CRYPTO_SECRETKEYBYTES - u, - f, LOGN, Zf(max_fg_bits)[LOGN]); - if (v == 0) { - return -1; - } - u += v; - v = Zf(trim_i8_encode)(sk + u, CRYPTO_SECRETKEYBYTES - u, - g, LOGN, Zf(max_fg_bits)[LOGN]); - if (v == 0) { - return -1; - } - u += v; - v = Zf(trim_i8_encode)(sk + u, CRYPTO_SECRETKEYBYTES - u, - F, LOGN, Zf(max_FG_bits)[LOGN]); - if (v == 0) { - return -1; - } - u += v; - if (u != CRYPTO_SECRETKEYBYTES) { - return -1; - } -#endif - - /* - * Encode public key. 
- */ - pk[0] = 0x00 + LOGN; - v = Zf(modq_encode)(pk + 1, CRYPTO_PUBLICKEYBYTES - 1, h, LOGN); - if (v != CRYPTO_PUBLICKEYBYTES - 1) { - return -1; - } - - return 0; -} - -int -crypto_sign(unsigned char *sm, size_t *smlen, - const unsigned char *m, size_t mlen, - const unsigned char *sk) -{ -#if KG_EXPAND - const fpr *expanded_key; -#else - int8_t *f, *g, *F, *G; - size_t u, v; -#endif - int16_t *sig; - uint16_t *hm; - unsigned char seed[SEEDLEN], nonce[NONCELEN]; - unsigned char *esig; - inner_shake256_context sc; - size_t sig_len; - unsigned sav_cw; - -#if KG_EXPAND - sig = (int16_t *)&tmp.b[48 * N]; -#else - f = (int8_t *)&tmp.b[72 * N]; - g = f + N; - F = g + N; - G = F + N; - sig = (int16_t *)(G + N); -#endif - hm = (uint16_t *)sig; /* hm[] is shared with sig[] */ - esig = (unsigned char *)tmp.b; - -#if KG_EXPAND - /* - * Expanded key is provided "as is". - */ - expanded_key = (const fpr *)sk; -#else - /* - * Decode the private key. - */ - if (sk[0] != 0x50 + LOGN) { - return -1; - } - u = 1; - v = Zf(trim_i8_decode)(f, LOGN, Zf(max_fg_bits)[LOGN], - sk + u, CRYPTO_SECRETKEYBYTES - u); - if (v == 0) { - return -1; - } - u += v; - v = Zf(trim_i8_decode)(g, LOGN, Zf(max_fg_bits)[LOGN], - sk + u, CRYPTO_SECRETKEYBYTES - u); - if (v == 0) { - return -1; - } - u += v; - v = Zf(trim_i8_decode)(F, LOGN, Zf(max_FG_bits)[LOGN], - sk + u, CRYPTO_SECRETKEYBYTES - u); - if (v == 0) { - return -1; - } - u += v; - if (u != CRYPTO_SECRETKEYBYTES) { - return -1; - } - if (!Zf(complete_private)(G, f, g, F, LOGN, tmp.b)) { - return -1; - } -#endif - - /* - * Create a random nonce (40 bytes). - */ - randombytes(nonce, NONCELEN); - - /* - * Hash message nonce + message into a vector. - */ - inner_shake256_init(&sc); - inner_shake256_inject(&sc, nonce, NONCELEN); - inner_shake256_inject(&sc, m, mlen); - inner_shake256_flip(&sc); - Zf(hash_to_point_vartime)(&sc, hm, LOGN); - - /* - * Initialize a RNG. 
- */ - randombytes(seed, SEEDLEN); - inner_shake256_init(&sc); - inner_shake256_inject(&sc, seed, SEEDLEN); - inner_shake256_flip(&sc); - - /* - * Compute the signature. - */ - sav_cw = set_fpu_cw(2); -#if KG_EXPAND - Zf(sign_tree)(sig, &sc, expanded_key, hm, LOGN, tmp.b); -#else - Zf(sign_dyn)(sig, &sc, f, g, F, G, hm, LOGN, tmp.b); -#endif - set_fpu_cw(sav_cw); - - /* - * Encode the signature and bundle it with the message. Format is: - * signature length 2 bytes, big-endian - * nonce 40 bytes - * message mlen bytes - * signature slen bytes - */ - esig[0] = 0x20 + LOGN; - sig_len = Zf(comp_encode)(esig + 1, CRYPTO_BYTES - 1, sig, LOGN); - if (sig_len == 0) { - return -1; - } - sig_len ++; - memmove(sm + 2 + NONCELEN, m, mlen); - sm[0] = (unsigned char)(sig_len >> 8); - sm[1] = (unsigned char)sig_len; - memcpy(sm + 2, nonce, NONCELEN); - memcpy(sm + 2 + NONCELEN + mlen, esig, sig_len); - *smlen = 2 + NONCELEN + mlen + sig_len; - return 0; -} - -int -crypto_sign_open(unsigned char *m, size_t *mlen, - const unsigned char *sm, size_t smlen, - const unsigned char *pk) -{ - uint16_t *h, *hm; - int16_t *sig; - const unsigned char *esig; - inner_shake256_context sc; - size_t sig_len, msg_len; - - h = (uint16_t *)&tmp.b[2 * N]; - hm = h + N; - sig = (int16_t *)(hm + N); - - /* - * Decode public key. - */ - if (pk[0] != 0x00 + LOGN) { - return -1; - } - if (Zf(modq_decode)(h, LOGN, pk + 1, CRYPTO_PUBLICKEYBYTES - 1) - != CRYPTO_PUBLICKEYBYTES - 1) - { - return -1; - } - Zf(to_ntt_monty)(h, LOGN); - - /* - * Find nonce, signature, message length. - */ - if (smlen < 2 + NONCELEN) { - return -1; - } - sig_len = ((size_t)sm[0] << 8) | (size_t)sm[1]; - if (sig_len > (smlen - 2 - NONCELEN)) { - return -1; - } - msg_len = smlen - 2 - NONCELEN - sig_len; - - /* - * Decode signature. 
- */ - esig = sm + 2 + NONCELEN + msg_len; - if (sig_len < 1 || esig[0] != 0x20 + LOGN) { - return -1; - } - if (Zf(comp_decode)(sig, LOGN, - esig + 1, sig_len - 1) != sig_len - 1) - { - return -1; - } - - /* - * Hash nonce + message into a vector. - */ - inner_shake256_init(&sc); - inner_shake256_inject(&sc, sm + 2, NONCELEN + msg_len); - inner_shake256_flip(&sc); - Zf(hash_to_point_vartime)(&sc, hm, LOGN); - - /* - * Verify signature. - */ - if (!Zf(verify_raw)(hm, sig, h, LOGN, tmp.b)) { - return -1; - } - - /* - * Return plaintext. - */ - memmove(m, sm + 2 + NONCELEN, msg_len); - *mlen = msg_len; - return 0; -} diff --git a/crypto_sign/falcon-512-tree/m4-ct/rng.c b/crypto_sign/falcon-512-tree/m4-ct/rng.c deleted file mode 100644 index d2ecb7af..00000000 --- a/crypto_sign/falcon-512-tree/m4-ct/rng.c +++ /dev/null @@ -1,379 +0,0 @@ -/* - * PRNG and interface to the system RNG. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include - -#include "inner.h" - -// yyyNIST+0 yyyPQCLEAN+0 -/* - * Include relevant system header files. For Win32, this will also need - * linking with advapi32.dll, which we trigger with an appropriate #pragma. - */ -#if FALCON_RAND_GETENTROPY -#include -#endif -#if FALCON_RAND_URANDOM -#include -#if !FALCON_RAND_GETENTROPY -#include -#endif -#include -#include -#endif -#if FALCON_RAND_WIN32 -#include -#include -#pragma comment(lib, "advapi32") -#endif - -/* see inner.h */ -int -Zf(get_seed)(void *seed, size_t len) -{ - (void)seed; - if (len == 0) { - return 1; - } -#if FALCON_RAND_GETENTROPY - if (getentropy(seed, len) == 0) { - return 1; - } -#endif -#if FALCON_RAND_URANDOM - { - int f; - - f = open("/dev/urandom", O_RDONLY); - if (f >= 0) { - while (len > 0) { - ssize_t rlen; - - rlen = read(f, seed, len); - if (rlen < 0) { - if (errno == EINTR) { - continue; - } - break; - } - seed = (uint8_t *)seed + rlen; - len -= (size_t)rlen; - } - close(f); - if (len == 0) { - return 1; - } - } - } -#endif -#if FALCON_RAND_WIN32 - { - HCRYPTPROV hp; - - if (CryptAcquireContext(&hp, 0, 0, PROV_RSA_FULL, - CRYPT_VERIFYCONTEXT | CRYPT_SILENT)) - { - BOOL r; - - r = CryptGenRandom(hp, (DWORD)len, seed); - CryptReleaseContext(hp, 0); - if (r) { - return 1; - } - } - } -#endif - return 0; -} -// yyyNIST- yyyPQCLEAN- - -/* see inner.h */ -void -Zf(prng_init)(prng *p, inner_shake256_context *src) -{ -#if FALCON_LE // yyyLE+1 - inner_shake256_extract(src, p->state.d, 56); -#else // yyyLE+0 - /* - * To ensure reproducibility for a given seed, we - * must enforce little-endian interpretation of - 
* the state words. - */ - uint8_t tmp[56]; - uint64_t th, tl; - int i; - - inner_shake256_extract(src, tmp, 56); - for (i = 0; i < 14; i ++) { - uint32_t w; - - w = (uint32_t)tmp[(i << 2) + 0] - | ((uint32_t)tmp[(i << 2) + 1] << 8) - | ((uint32_t)tmp[(i << 2) + 2] << 16) - | ((uint32_t)tmp[(i << 2) + 3] << 24); - *(uint32_t *)(p->state.d + (i << 2)) = w; - } - tl = *(uint32_t *)(p->state.d + 48); - th = *(uint32_t *)(p->state.d + 52); - *(uint64_t *)(p->state.d + 48) = tl + (th << 32); -#endif // yyyLE- - Zf(prng_refill)(p); -} - -/* - * PRNG based on ChaCha20. - * - * State consists in key (32 bytes) then IV (16 bytes) and block counter - * (8 bytes). Normally, we should not care about local endianness (this - * is for a PRNG), but for the NIST competition we need reproducible KAT - * vectors that work across architectures, so we enforce little-endian - * interpretation where applicable. Moreover, output words are "spread - * out" over the output buffer with the interleaving pattern that is - * naturally obtained from the AVX2 implementation that runs eight - * ChaCha20 instances in parallel. - * - * The block counter is XORed into the first 8 bytes of the IV. - */ -TARGET_AVX2 -void -Zf(prng_refill)(prng *p) -{ -#if FALCON_AVX2 // yyyAVX2+1 - - static const uint32_t CW[] = { - 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 - }; - - uint64_t cc; - size_t u; - int i; - uint32_t *sw; - union { - uint32_t w[16]; - __m256i y[2]; /* for alignment */ - } t; - __m256i state[16], init[16]; - - sw = (uint32_t *)p->state.d; - - /* - * XOR next counter values into state. - */ - cc = *(uint64_t *)(p->state.d + 48); - for (u = 0; u < 8; u ++) { - t.w[u] = (uint32_t)(cc + u); - t.w[u + 8] = (uint32_t)((cc + u) >> 32); - } - *(uint64_t *)(p->state.d + 48) = cc + 8; - - /* - * Load state. 
- */ - for (u = 0; u < 4; u ++) { - state[u] = init[u] = - _mm256_broadcastd_epi32(_mm_cvtsi32_si128(CW[u])); - } - for (u = 0; u < 10; u ++) { - state[u + 4] = init[u + 4] = - _mm256_broadcastd_epi32(_mm_cvtsi32_si128(sw[u])); - } - state[14] = init[14] = _mm256_xor_si256( - _mm256_broadcastd_epi32(_mm_cvtsi32_si128(sw[10])), - _mm256_loadu_si256((__m256i *)&t.w[0])); - state[15] = init[15] = _mm256_xor_si256( - _mm256_broadcastd_epi32(_mm_cvtsi32_si128(sw[11])), - _mm256_loadu_si256((__m256i *)&t.w[8])); - - /* - * Do all rounds. - */ - for (i = 0; i < 10; i ++) { - -#define QROUND(a, b, c, d) do { \ - state[a] = _mm256_add_epi32(state[a], state[b]); \ - state[d] = _mm256_xor_si256(state[d], state[a]); \ - state[d] = _mm256_or_si256( \ - _mm256_slli_epi32(state[d], 16), \ - _mm256_srli_epi32(state[d], 16)); \ - state[c] = _mm256_add_epi32(state[c], state[d]); \ - state[b] = _mm256_xor_si256(state[b], state[c]); \ - state[b] = _mm256_or_si256( \ - _mm256_slli_epi32(state[b], 12), \ - _mm256_srli_epi32(state[b], 20)); \ - state[a] = _mm256_add_epi32(state[a], state[b]); \ - state[d] = _mm256_xor_si256(state[d], state[a]); \ - state[d] = _mm256_or_si256( \ - _mm256_slli_epi32(state[d], 8), \ - _mm256_srli_epi32(state[d], 24)); \ - state[c] = _mm256_add_epi32(state[c], state[d]); \ - state[b] = _mm256_xor_si256(state[b], state[c]); \ - state[b] = _mm256_or_si256( \ - _mm256_slli_epi32(state[b], 7), \ - _mm256_srli_epi32(state[b], 25)); \ - } while (0) - - QROUND( 0, 4, 8, 12); - QROUND( 1, 5, 9, 13); - QROUND( 2, 6, 10, 14); - QROUND( 3, 7, 11, 15); - QROUND( 0, 5, 10, 15); - QROUND( 1, 6, 11, 12); - QROUND( 2, 7, 8, 13); - QROUND( 3, 4, 9, 14); - -#undef QROUND - - } - - /* - * Add initial state back and encode the result in the destination - * buffer. We can dump the AVX2 values "as is" because the non-AVX2 - * code uses a compatible order of values. 
- */ - for (u = 0; u < 16; u ++) { - _mm256_storeu_si256((__m256i *)&p->buf.d[u << 5], - _mm256_add_epi32(state[u], init[u])); - } - -#else // yyyAVX2+0 - - static const uint32_t CW[] = { - 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 - }; - - uint64_t cc; - size_t u; - - /* - * State uses local endianness. Only the output bytes must be - * converted to little endian (if used on a big-endian machine). - */ - cc = *(uint64_t *)(p->state.d + 48); - for (u = 0; u < 8; u ++) { - uint32_t state[16]; - size_t v; - int i; - - memcpy(&state[0], CW, sizeof CW); - memcpy(&state[4], p->state.d, 48); - state[14] ^= (uint32_t)cc; - state[15] ^= (uint32_t)(cc >> 32); - for (i = 0; i < 10; i ++) { - -#define QROUND(a, b, c, d) do { \ - state[a] += state[b]; \ - state[d] ^= state[a]; \ - state[d] = (state[d] << 16) | (state[d] >> 16); \ - state[c] += state[d]; \ - state[b] ^= state[c]; \ - state[b] = (state[b] << 12) | (state[b] >> 20); \ - state[a] += state[b]; \ - state[d] ^= state[a]; \ - state[d] = (state[d] << 8) | (state[d] >> 24); \ - state[c] += state[d]; \ - state[b] ^= state[c]; \ - state[b] = (state[b] << 7) | (state[b] >> 25); \ - } while (0) - - QROUND( 0, 4, 8, 12); - QROUND( 1, 5, 9, 13); - QROUND( 2, 6, 10, 14); - QROUND( 3, 7, 11, 15); - QROUND( 0, 5, 10, 15); - QROUND( 1, 6, 11, 12); - QROUND( 2, 7, 8, 13); - QROUND( 3, 4, 9, 14); - -#undef QROUND - - } - - for (v = 0; v < 4; v ++) { - state[v] += CW[v]; - } - for (v = 4; v < 14; v ++) { - state[v] += ((uint32_t *)p->state.d)[v - 4]; - } - state[14] += ((uint32_t *)p->state.d)[10] - ^ (uint32_t)cc; - state[15] += ((uint32_t *)p->state.d)[11] - ^ (uint32_t)(cc >> 32); - cc ++; - - /* - * We mimic the interleaving that is used in the AVX2 - * implementation. 
- */ - for (v = 0; v < 16; v ++) { -#if FALCON_LE // yyyLE+1 - ((uint32_t *)p->buf.d)[u + (v << 3)] = state[v]; -#else // yyyLE+0 - p->buf.d[(u << 2) + (v << 5) + 0] = - (uint8_t)state[v]; - p->buf.d[(u << 2) + (v << 5) + 1] = - (uint8_t)(state[v] >> 8); - p->buf.d[(u << 2) + (v << 5) + 2] = - (uint8_t)(state[v] >> 16); - p->buf.d[(u << 2) + (v << 5) + 3] = - (uint8_t)(state[v] >> 24); -#endif // yyyLE- - } - } - *(uint64_t *)(p->state.d + 48) = cc; - -#endif // yyyAVX2- - - p->ptr = 0; -} - -/* see inner.h */ -void -Zf(prng_get_bytes)(prng *p, void *dst, size_t len) -{ - uint8_t *buf; - - buf = dst; - while (len > 0) { - size_t clen; - - clen = (sizeof p->buf.d) - p->ptr; - if (clen > len) { - clen = len; - } - memcpy(buf, p->buf.d, clen); - buf += clen; - len -= clen; - p->ptr += clen; - if (p->ptr == sizeof p->buf.d) { - Zf(prng_refill)(p); - } - } -} diff --git a/crypto_sign/falcon-512-tree/m4-ct/sign.c b/crypto_sign/falcon-512-tree/m4-ct/sign.c deleted file mode 100644 index 752fb8ba..00000000 --- a/crypto_sign/falcon-512-tree/m4-ct/sign.c +++ /dev/null @@ -1,1532 +0,0 @@ -/* - * Falcon signature generation. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include "inner.h" - -/* =================================================================== */ - -/* - * Compute degree N from logarithm 'logn'. - */ -#define MKN(logn) ((size_t)1 << (logn)) - -/* =================================================================== */ -/* - * Binary case: - * N = 2^logn - * phi = X^N+1 - */ - -/* - * Get the size of the LDL tree for an input with polynomials of size - * 2^logn. The size is expressed in the number of elements. - */ -static inline unsigned -ffLDL_treesize(unsigned logn) -{ - /* - * For logn = 0 (polynomials are constant), the "tree" is a - * single element. Otherwise, the tree node has size 2^logn, and - * has two child trees for size logn-1 each. Thus, treesize s() - * must fulfill these two relations: - * - * s(0) = 1 - * s(logn) = (2^logn) + 2*s(logn-1) - */ - return (logn + 1) << logn; -} - -/* - * Inner function for ffLDL_fft(). It expects the matrix to be both - * auto-adjoint and quasicyclic; also, it uses the source operands - * as modifiable temporaries. - * - * tmp[] must have room for at least one polynomial. 
- */ -static void -ffLDL_fft_inner(fpr *restrict tree, - fpr *restrict g0, fpr *restrict g1, unsigned logn, fpr *restrict tmp) -{ - size_t n, hn; - - n = MKN(logn); - if (n == 1) { - tree[0] = g0[0]; - return; - } - hn = n >> 1; - - /* - * The LDL decomposition yields L (which is written in the tree) - * and the diagonal of D. Since d00 = g0, we just write d11 - * into tmp. - */ - Zf(poly_LDLmv_fft)(tmp, tree, g0, g1, g0, logn); - - /* - * Split d00 (currently in g0) and d11 (currently in tmp). We - * reuse g0 and g1 as temporary storage spaces: - * d00 splits into g1, g1+hn - * d11 splits into g0, g0+hn - */ - Zf(poly_split_fft)(g1, g1 + hn, g0, logn); - Zf(poly_split_fft)(g0, g0 + hn, tmp, logn); - - /* - * Each split result is the first row of a new auto-adjoint - * quasicyclic matrix for the next recursive step. - */ - ffLDL_fft_inner(tree + n, - g1, g1 + hn, logn - 1, tmp); - ffLDL_fft_inner(tree + n + ffLDL_treesize(logn - 1), - g0, g0 + hn, logn - 1, tmp); -} - -/* - * Compute the ffLDL tree of an auto-adjoint matrix G. The matrix - * is provided as three polynomials (FFT representation). - * - * The "tree" array is filled with the computed tree, of size - * (logn+1)*(2^logn) elements (see ffLDL_treesize()). - * - * Input arrays MUST NOT overlap, except possibly the three unmodified - * arrays g00, g01 and g11. tmp[] should have room for at least three - * polynomials of 2^logn elements each. 
- */ -static void -ffLDL_fft(fpr *restrict tree, const fpr *restrict g00, - const fpr *restrict g01, const fpr *restrict g11, - unsigned logn, fpr *restrict tmp) -{ - size_t n, hn; - fpr *d00, *d11; - - n = MKN(logn); - if (n == 1) { - tree[0] = g00[0]; - return; - } - hn = n >> 1; - d00 = tmp; - d11 = tmp + n; - tmp += n << 1; - - memcpy(d00, g00, n * sizeof *g00); - Zf(poly_LDLmv_fft)(d11, tree, g00, g01, g11, logn); - - Zf(poly_split_fft)(tmp, tmp + hn, d00, logn); - Zf(poly_split_fft)(d00, d00 + hn, d11, logn); - memcpy(d11, tmp, n * sizeof *tmp); - ffLDL_fft_inner(tree + n, - d11, d11 + hn, logn - 1, tmp); - ffLDL_fft_inner(tree + n + ffLDL_treesize(logn - 1), - d00, d00 + hn, logn - 1, tmp); -} - -/* - * Normalize an ffLDL tree: each leaf of value x is replaced with - * sigma / sqrt(x). - */ -static void -ffLDL_binary_normalize(fpr *tree, unsigned logn) -{ - /* - * TODO: make an iterative version. - */ - size_t n; - - n = MKN(logn); - if (n == 1) { - /* - * We actually store in the tree leaf the inverse of - * the value mandated by the specification: this - * saves a division both here and in the sampler. - */ - tree[0] = fpr_mul(fpr_sqrt(tree[0]), fpr_inv_sigma); - } else { - ffLDL_binary_normalize(tree + n, logn - 1); - ffLDL_binary_normalize(tree + n + ffLDL_treesize(logn - 1), - logn - 1); - } -} - -/* =================================================================== */ - -/* - * Convert an integer polynomial (with small values) into the - * representation with complex numbers. 
- */ -static void -smallints_to_fpr(fpr *r, const int8_t *t, unsigned logn) -{ - size_t n, u; - - n = MKN(logn); - for (u = 0; u < n; u ++) { - r[u] = fpr_of(t[u]); - } -} - -/* - * The expanded private key contains: - * - The B0 matrix (four elements) - * - The ffLDL tree - */ - -static inline size_t -skoff_b00(unsigned logn) -{ - (void)logn; - return 0; -} - -static inline size_t -skoff_b01(unsigned logn) -{ - return MKN(logn); -} - -static inline size_t -skoff_b10(unsigned logn) -{ - return 2 * MKN(logn); -} - -static inline size_t -skoff_b11(unsigned logn) -{ - return 3 * MKN(logn); -} - -static inline size_t -skoff_tree(unsigned logn) -{ - return 4 * MKN(logn); -} - -/* see inner.h */ -void -Zf(expand_privkey)(fpr *restrict expanded_key, - const int8_t *f, const int8_t *g, - const int8_t *F, const int8_t *G, - unsigned logn, uint8_t *restrict tmp) -{ - size_t n; - fpr *rf, *rg, *rF, *rG; - fpr *b00, *b01, *b10, *b11; - fpr *g00, *g01, *g11, *gxx; - fpr *tree; - - n = MKN(logn); - b00 = expanded_key + skoff_b00(logn); - b01 = expanded_key + skoff_b01(logn); - b10 = expanded_key + skoff_b10(logn); - b11 = expanded_key + skoff_b11(logn); - tree = expanded_key + skoff_tree(logn); - - /* - * We load the private key elements directly into the B0 matrix, - * since B0 = [[g, -f], [G, -F]]. - */ - rf = b01; - rg = b00; - rF = b11; - rG = b10; - - smallints_to_fpr(rf, f, logn); - smallints_to_fpr(rg, g, logn); - smallints_to_fpr(rF, F, logn); - smallints_to_fpr(rG, G, logn); - - /* - * Compute the FFT for the key elements, and negate f and F. - */ - Zf(FFT)(rf, logn); - Zf(FFT)(rg, logn); - Zf(FFT)(rF, logn); - Zf(FFT)(rG, logn); - Zf(poly_neg)(rf, logn); - Zf(poly_neg)(rF, logn); - - /* - * The Gram matrix is G = B·B*. 
Formulas are: - * g00 = b00*adj(b00) + b01*adj(b01) - * g01 = b00*adj(b10) + b01*adj(b11) - * g10 = b10*adj(b00) + b11*adj(b01) - * g11 = b10*adj(b10) + b11*adj(b11) - * - * For historical reasons, this implementation uses - * g00, g01 and g11 (upper triangle). - */ - g00 = (fpr *)tmp; - g01 = g00 + n; - g11 = g01 + n; - gxx = g11 + n; - - memcpy(g00, b00, n * sizeof *b00); - Zf(poly_mulselfadj_fft)(g00, logn); - memcpy(gxx, b01, n * sizeof *b01); - Zf(poly_mulselfadj_fft)(gxx, logn); - Zf(poly_add)(g00, gxx, logn); - - memcpy(g01, b00, n * sizeof *b00); - Zf(poly_muladj_fft)(g01, b10, logn); - memcpy(gxx, b01, n * sizeof *b01); - Zf(poly_muladj_fft)(gxx, b11, logn); - Zf(poly_add)(g01, gxx, logn); - - memcpy(g11, b10, n * sizeof *b10); - Zf(poly_mulselfadj_fft)(g11, logn); - memcpy(gxx, b11, n * sizeof *b11); - Zf(poly_mulselfadj_fft)(gxx, logn); - Zf(poly_add)(g11, gxx, logn); - - /* - * Compute the Falcon tree. - */ - ffLDL_fft(tree, g00, g01, g11, logn, gxx); - - /* - * Normalize tree. - */ - ffLDL_binary_normalize(tree, logn); -} - -typedef int (*samplerZ)(void *ctx, fpr mu, fpr sigma); - -/* - * Perform Fast Fourier Sampling for target vector t. The Gram matrix - * is provided (G = [[g00, g01], [adj(g01), g11]]). The sampled vector - * is written over (t0,t1). The Gram matrix is modified as well. The - * tmp[] buffer must have room for four polynomials. - */ -TARGET_AVX2 -static void -ffSampling_fft_dyntree(samplerZ samp, void *samp_ctx, - fpr *restrict t0, fpr *restrict t1, - fpr *restrict g00, fpr *restrict g01, fpr *restrict g11, - unsigned logn, fpr *restrict tmp) -{ - size_t n, hn; - fpr *z0, *z1; - - /* - * Deepest level: the LDL tree leaf value is just g00 (the - * array has length only 1 at this point); we normalize it - * with regards to sigma, then use it for sampling. 
- */ - if (logn == 0) { - fpr leaf; - - leaf = g00[0]; - leaf = fpr_mul(fpr_sqrt(leaf), fpr_inv_sigma); - t0[0] = fpr_of(samp(samp_ctx, t0[0], leaf)); - t1[0] = fpr_of(samp(samp_ctx, t1[0], leaf)); - return; - } - - n = (size_t)1 << logn; - hn = n >> 1; - - /* - * Decompose G into LDL. We only need d00 (identical to g00), - * d11, and l10; we do that in place. - */ - Zf(poly_LDL_fft)(g00, g01, g11, logn); - - /* - * Split d00 and d11 and expand them into half-size quasi-cyclic - * Gram matrices. We also save l10 in tmp[]. - */ - Zf(poly_split_fft)(tmp, tmp + hn, g00, logn); - memcpy(g00, tmp, n * sizeof *tmp); - Zf(poly_split_fft)(tmp, tmp + hn, g11, logn); - memcpy(g11, tmp, n * sizeof *tmp); - memcpy(tmp, g01, n * sizeof *g01); - memcpy(g01, g00, hn * sizeof *g00); - memcpy(g01 + hn, g11, hn * sizeof *g00); - - /* - * The half-size Gram matrices for the recursive LDL tree - * building are now: - * - left sub-tree: g00, g00+hn, g01 - * - right sub-tree: g11, g11+hn, g01+hn - * l10 is in tmp[]. - */ - - /* - * We split t1 and use the first recursive call on the two - * halves, using the right sub-tree. The result is merged - * back into tmp + 2*n. - */ - z1 = tmp + n; - Zf(poly_split_fft)(z1, z1 + hn, t1, logn); - ffSampling_fft_dyntree(samp, samp_ctx, z1, z1 + hn, - g11, g11 + hn, g01 + hn, logn - 1, z1 + n); - Zf(poly_merge_fft)(tmp + (n << 1), z1, z1 + hn, logn); - - /* - * Compute tb0 = t0 + (t1 - z1) * l10. - * At that point, l10 is in tmp, t1 is unmodified, and z1 is - * in tmp + (n << 1). The buffer in z1 is free. - * - * In the end, z1 is written over t1, and tb0 is in t0. - */ - memcpy(z1, t1, n * sizeof *t1); - Zf(poly_sub)(z1, tmp + (n << 1), logn); - memcpy(t1, tmp + (n << 1), n * sizeof *tmp); - Zf(poly_mul_fft)(tmp, z1, logn); - Zf(poly_add)(t0, tmp, logn); - - /* - * Second recursive invocation, on the split tb0 (currently in t0) - * and the left sub-tree. 
- */ - z0 = tmp; - Zf(poly_split_fft)(z0, z0 + hn, t0, logn); - ffSampling_fft_dyntree(samp, samp_ctx, z0, z0 + hn, - g00, g00 + hn, g01, logn - 1, z0 + n); - Zf(poly_merge_fft)(t0, z0, z0 + hn, logn); -} - -/* - * Perform Fast Fourier Sampling for target vector t and LDL tree T. - * tmp[] must have size for at least two polynomials of size 2^logn. - */ -TARGET_AVX2 -static void -ffSampling_fft(samplerZ samp, void *samp_ctx, - fpr *restrict z0, fpr *restrict z1, - const fpr *restrict tree, - const fpr *restrict t0, const fpr *restrict t1, unsigned logn, - fpr *restrict tmp) -{ - size_t n, hn; - const fpr *tree0, *tree1; - - /* - * When logn == 2, we inline the last two recursion levels. - */ - if (logn == 2) { -#if FALCON_AVX2 // yyyAVX2+1 - fpr w0, w1, w2, w3, sigma; - __m128d ww0, ww1, wa, wb, wc, wd; - __m128d wy0, wy1, wz0, wz1; - __m128d half, invsqrt8, invsqrt2, neghi, neglo; - int si0, si1, si2, si3; - - tree0 = tree + 4; - tree1 = tree + 8; - - half = _mm_set1_pd(0.5); - invsqrt8 = _mm_set1_pd(0.353553390593273762200422181052); - invsqrt2 = _mm_set1_pd(0.707106781186547524400844362105); - neghi = _mm_set_pd(-0.0, 0.0); - neglo = _mm_set_pd(0.0, -0.0); - - /* - * We split t1 into w*, then do the recursive invocation, - * with output in w*. We finally merge back into z1. 
- */ - ww0 = _mm_loadu_pd(&t1[0].v); - ww1 = _mm_loadu_pd(&t1[2].v); - wa = _mm_unpacklo_pd(ww0, ww1); - wb = _mm_unpackhi_pd(ww0, ww1); - wc = _mm_add_pd(wa, wb); - ww0 = _mm_mul_pd(wc, half); - wc = _mm_sub_pd(wa, wb); - wd = _mm_xor_pd(_mm_permute_pd(wc, 1), neghi); - ww1 = _mm_mul_pd(_mm_add_pd(wc, wd), invsqrt8); - - w2.v = _mm_cvtsd_f64(ww1); - w3.v = _mm_cvtsd_f64(_mm_permute_pd(ww1, 1)); - wa = ww1; - sigma = tree1[3]; - si2 = samp(samp_ctx, w2, sigma); - si3 = samp(samp_ctx, w3, sigma); - ww1 = _mm_set_pd((double)si3, (double)si2); - wa = _mm_sub_pd(wa, ww1); - wb = _mm_loadu_pd(&tree1[0].v); - wc = _mm_mul_pd(wa, wb); - wd = _mm_mul_pd(wa, _mm_permute_pd(wb, 1)); - wa = _mm_unpacklo_pd(wc, wd); - wb = _mm_unpackhi_pd(wc, wd); - ww0 = _mm_add_pd(ww0, _mm_add_pd(wa, _mm_xor_pd(wb, neglo))); - w0.v = _mm_cvtsd_f64(ww0); - w1.v = _mm_cvtsd_f64(_mm_permute_pd(ww0, 1)); - sigma = tree1[2]; - si0 = samp(samp_ctx, w0, sigma); - si1 = samp(samp_ctx, w1, sigma); - ww0 = _mm_set_pd((double)si1, (double)si0); - - wc = _mm_mul_pd( - _mm_set_pd((double)(si2 + si3), (double)(si2 - si3)), - invsqrt2); - wa = _mm_add_pd(ww0, wc); - wb = _mm_sub_pd(ww0, wc); - ww0 = _mm_unpacklo_pd(wa, wb); - ww1 = _mm_unpackhi_pd(wa, wb); - _mm_storeu_pd(&z1[0].v, ww0); - _mm_storeu_pd(&z1[2].v, ww1); - - /* - * Compute tb0 = t0 + (t1 - z1) * L. Value tb0 ends up in w*. - */ - wy0 = _mm_sub_pd(_mm_loadu_pd(&t1[0].v), ww0); - wy1 = _mm_sub_pd(_mm_loadu_pd(&t1[2].v), ww1); - wz0 = _mm_loadu_pd(&tree[0].v); - wz1 = _mm_loadu_pd(&tree[2].v); - ww0 = _mm_sub_pd(_mm_mul_pd(wy0, wz0), _mm_mul_pd(wy1, wz1)); - ww1 = _mm_add_pd(_mm_mul_pd(wy0, wz1), _mm_mul_pd(wy1, wz0)); - ww0 = _mm_add_pd(ww0, _mm_loadu_pd(&t0[0].v)); - ww1 = _mm_add_pd(ww1, _mm_loadu_pd(&t0[2].v)); - - /* - * Second recursive invocation. 
- */ - wa = _mm_unpacklo_pd(ww0, ww1); - wb = _mm_unpackhi_pd(ww0, ww1); - wc = _mm_add_pd(wa, wb); - ww0 = _mm_mul_pd(wc, half); - wc = _mm_sub_pd(wa, wb); - wd = _mm_xor_pd(_mm_permute_pd(wc, 1), neghi); - ww1 = _mm_mul_pd(_mm_add_pd(wc, wd), invsqrt8); - - w2.v = _mm_cvtsd_f64(ww1); - w3.v = _mm_cvtsd_f64(_mm_permute_pd(ww1, 1)); - wa = ww1; - sigma = tree0[3]; - si2 = samp(samp_ctx, w2, sigma); - si3 = samp(samp_ctx, w3, sigma); - ww1 = _mm_set_pd((double)si3, (double)si2); - wa = _mm_sub_pd(wa, ww1); - wb = _mm_loadu_pd(&tree0[0].v); - wc = _mm_mul_pd(wa, wb); - wd = _mm_mul_pd(wa, _mm_permute_pd(wb, 1)); - wa = _mm_unpacklo_pd(wc, wd); - wb = _mm_unpackhi_pd(wc, wd); - ww0 = _mm_add_pd(ww0, _mm_add_pd(wa, _mm_xor_pd(wb, neglo))); - w0.v = _mm_cvtsd_f64(ww0); - w1.v = _mm_cvtsd_f64(_mm_permute_pd(ww0, 1)); - sigma = tree0[2]; - si0 = samp(samp_ctx, w0, sigma); - si1 = samp(samp_ctx, w1, sigma); - ww0 = _mm_set_pd((double)si1, (double)si0); - - wc = _mm_mul_pd( - _mm_set_pd((double)(si2 + si3), (double)(si2 - si3)), - invsqrt2); - wa = _mm_add_pd(ww0, wc); - wb = _mm_sub_pd(ww0, wc); - ww0 = _mm_unpacklo_pd(wa, wb); - ww1 = _mm_unpackhi_pd(wa, wb); - _mm_storeu_pd(&z0[0].v, ww0); - _mm_storeu_pd(&z0[2].v, ww1); - - return; -#else // yyyAVX2+0 - fpr x0, x1, y0, y1, w0, w1, w2, w3, sigma; - fpr a_re, a_im, b_re, b_im, c_re, c_im; - - tree0 = tree + 4; - tree1 = tree + 8; - - /* - * We split t1 into w*, then do the recursive invocation, - * with output in w*. We finally merge back into z1. 
- */ - a_re = t1[0]; - a_im = t1[2]; - b_re = t1[1]; - b_im = t1[3]; - c_re = fpr_add(a_re, b_re); - c_im = fpr_add(a_im, b_im); - w0 = fpr_half(c_re); - w1 = fpr_half(c_im); - c_re = fpr_sub(a_re, b_re); - c_im = fpr_sub(a_im, b_im); - w2 = fpr_mul(fpr_add(c_re, c_im), fpr_invsqrt8); - w3 = fpr_mul(fpr_sub(c_im, c_re), fpr_invsqrt8); - - x0 = w2; - x1 = w3; - sigma = tree1[3]; - w2 = fpr_of(samp(samp_ctx, x0, sigma)); - w3 = fpr_of(samp(samp_ctx, x1, sigma)); - a_re = fpr_sub(x0, w2); - a_im = fpr_sub(x1, w3); - b_re = tree1[0]; - b_im = tree1[1]; - c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im)); - c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re)); - x0 = fpr_add(c_re, w0); - x1 = fpr_add(c_im, w1); - sigma = tree1[2]; - w0 = fpr_of(samp(samp_ctx, x0, sigma)); - w1 = fpr_of(samp(samp_ctx, x1, sigma)); - - a_re = w0; - a_im = w1; - b_re = w2; - b_im = w3; - c_re = fpr_mul(fpr_sub(b_re, b_im), fpr_invsqrt2); - c_im = fpr_mul(fpr_add(b_re, b_im), fpr_invsqrt2); - z1[0] = w0 = fpr_add(a_re, c_re); - z1[2] = w2 = fpr_add(a_im, c_im); - z1[1] = w1 = fpr_sub(a_re, c_re); - z1[3] = w3 = fpr_sub(a_im, c_im); - - /* - * Compute tb0 = t0 + (t1 - z1) * L. Value tb0 ends up in w*. - */ - w0 = fpr_sub(t1[0], w0); - w1 = fpr_sub(t1[1], w1); - w2 = fpr_sub(t1[2], w2); - w3 = fpr_sub(t1[3], w3); - - a_re = w0; - a_im = w2; - b_re = tree[0]; - b_im = tree[2]; - w0 = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im)); - w2 = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re)); - a_re = w1; - a_im = w3; - b_re = tree[1]; - b_im = tree[3]; - w1 = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im)); - w3 = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re)); - - w0 = fpr_add(w0, t0[0]); - w1 = fpr_add(w1, t0[1]); - w2 = fpr_add(w2, t0[2]); - w3 = fpr_add(w3, t0[3]); - - /* - * Second recursive invocation. 
- */ - a_re = w0; - a_im = w2; - b_re = w1; - b_im = w3; - c_re = fpr_add(a_re, b_re); - c_im = fpr_add(a_im, b_im); - w0 = fpr_half(c_re); - w1 = fpr_half(c_im); - c_re = fpr_sub(a_re, b_re); - c_im = fpr_sub(a_im, b_im); - w2 = fpr_mul(fpr_add(c_re, c_im), fpr_invsqrt8); - w3 = fpr_mul(fpr_sub(c_im, c_re), fpr_invsqrt8); - - x0 = w2; - x1 = w3; - sigma = tree0[3]; - w2 = y0 = fpr_of(samp(samp_ctx, x0, sigma)); - w3 = y1 = fpr_of(samp(samp_ctx, x1, sigma)); - a_re = fpr_sub(x0, y0); - a_im = fpr_sub(x1, y1); - b_re = tree0[0]; - b_im = tree0[1]; - c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im)); - c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re)); - x0 = fpr_add(c_re, w0); - x1 = fpr_add(c_im, w1); - sigma = tree0[2]; - w0 = fpr_of(samp(samp_ctx, x0, sigma)); - w1 = fpr_of(samp(samp_ctx, x1, sigma)); - - a_re = w0; - a_im = w1; - b_re = w2; - b_im = w3; - c_re = fpr_mul(fpr_sub(b_re, b_im), fpr_invsqrt2); - c_im = fpr_mul(fpr_add(b_re, b_im), fpr_invsqrt2); - z0[0] = fpr_add(a_re, c_re); - z0[2] = fpr_add(a_im, c_im); - z0[1] = fpr_sub(a_re, c_re); - z0[3] = fpr_sub(a_im, c_im); - - return; -#endif // yyyAVX2- - } - - /* - * Case logn == 1 is reachable only when using Falcon-2 (the - * smallest size for which Falcon is mathematically defined, but - * of course way too insecure to be of any use). 
- */ - if (logn == 1) { - fpr x0, x1, y0, y1, sigma; - fpr a_re, a_im, b_re, b_im, c_re, c_im; - - x0 = t1[0]; - x1 = t1[1]; - sigma = tree[3]; - z1[0] = y0 = fpr_of(samp(samp_ctx, x0, sigma)); - z1[1] = y1 = fpr_of(samp(samp_ctx, x1, sigma)); - a_re = fpr_sub(x0, y0); - a_im = fpr_sub(x1, y1); - b_re = tree[0]; - b_im = tree[1]; - c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im)); - c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re)); - x0 = fpr_add(c_re, t0[0]); - x1 = fpr_add(c_im, t0[1]); - sigma = tree[2]; - z0[0] = fpr_of(samp(samp_ctx, x0, sigma)); - z0[1] = fpr_of(samp(samp_ctx, x1, sigma)); - - return; - } - - /* - * Normal end of recursion is for logn == 0. Since the last - * steps of the recursions were inlined in the blocks above - * (when logn == 1 or 2), this case is not reachable, and is - * retained here only for documentation purposes. - - if (logn == 0) { - fpr x0, x1, sigma; - - x0 = t0[0]; - x1 = t1[0]; - sigma = tree[0]; - z0[0] = fpr_of(samp(samp_ctx, x0, sigma)); - z1[0] = fpr_of(samp(samp_ctx, x1, sigma)); - return; - } - - */ - - /* - * General recursive case (logn >= 3). - */ - - n = (size_t)1 << logn; - hn = n >> 1; - tree0 = tree + n; - tree1 = tree + n + ffLDL_treesize(logn - 1); - - /* - * We split t1 into z1 (reused as temporary storage), then do - * the recursive invocation, with output in tmp. We finally - * merge back into z1. - */ - Zf(poly_split_fft)(z1, z1 + hn, t1, logn); - ffSampling_fft(samp, samp_ctx, tmp, tmp + hn, - tree1, z1, z1 + hn, logn - 1, tmp + n); - Zf(poly_merge_fft)(z1, tmp, tmp + hn, logn); - - /* - * Compute tb0 = t0 + (t1 - z1) * L. Value tb0 ends up in tmp[]. - */ - memcpy(tmp, t1, n * sizeof *t1); - Zf(poly_sub)(tmp, z1, logn); - Zf(poly_mul_fft)(tmp, tree, logn); - Zf(poly_add)(tmp, t0, logn); - - /* - * Second recursive invocation. 
- */ - Zf(poly_split_fft)(z0, z0 + hn, tmp, logn); - ffSampling_fft(samp, samp_ctx, tmp, tmp + hn, - tree0, z0, z0 + hn, logn - 1, tmp + n); - Zf(poly_merge_fft)(z0, tmp, tmp + hn, logn); -} - -/* - * Compute a signature: the signature contains two vectors, s1 and s2. - * The s1 vector is not returned. The squared norm of (s1,s2) is - * computed, and if it is short enough, then s2 is returned into the - * s2[] buffer, and 1 is returned; otherwise, s2[] is untouched and 0 is - * returned; the caller should then try again. This function uses an - * expanded key. - * - * tmp[] must have room for at least six polynomials. - */ -static int -do_sign_tree(samplerZ samp, void *samp_ctx, int16_t *s2, - const fpr *restrict expanded_key, - const uint16_t *hm, - unsigned logn, fpr *restrict tmp) -{ - size_t n, u; - fpr *t0, *t1, *tx, *ty; - const fpr *b00, *b01, *b10, *b11, *tree; - fpr ni; - uint32_t sqn, ng; - int16_t *s1tmp, *s2tmp; - - n = MKN(logn); - t0 = tmp; - t1 = t0 + n; - b00 = expanded_key + skoff_b00(logn); - b01 = expanded_key + skoff_b01(logn); - b10 = expanded_key + skoff_b10(logn); - b11 = expanded_key + skoff_b11(logn); - tree = expanded_key + skoff_tree(logn); - - /* - * Set the target vector to [hm, 0] (hm is the hashed message). - */ - for (u = 0; u < n; u ++) { - t0[u] = fpr_of(hm[u]); - /* This is implicit. - t1[u] = fpr_zero; - */ - } - - /* - * Apply the lattice basis to obtain the real target - * vector (after normalization with regards to modulus). - */ - Zf(FFT)(t0, logn); - ni = fpr_inverse_of_q; - memcpy(t1, t0, n * sizeof *t0); - Zf(poly_mul_fft)(t1, b01, logn); - Zf(poly_mulconst)(t1, fpr_neg(ni), logn); - Zf(poly_mul_fft)(t0, b11, logn); - Zf(poly_mulconst)(t0, ni, logn); - - tx = t1 + n; - ty = tx + n; - - /* - * Apply sampling. Output is written back in [tx, ty]. - */ - ffSampling_fft(samp, samp_ctx, tx, ty, tree, t0, t1, logn, ty + n); - - /* - * Get the lattice point corresponding to that tiny vector. 
- */ - memcpy(t0, tx, n * sizeof *tx); - memcpy(t1, ty, n * sizeof *ty); - Zf(poly_mul_fft)(tx, b00, logn); - Zf(poly_mul_fft)(ty, b10, logn); - Zf(poly_add)(tx, ty, logn); - memcpy(ty, t0, n * sizeof *t0); - Zf(poly_mul_fft)(ty, b01, logn); - - memcpy(t0, tx, n * sizeof *tx); - Zf(poly_mul_fft)(t1, b11, logn); - Zf(poly_add)(t1, ty, logn); - - Zf(iFFT)(t0, logn); - Zf(iFFT)(t1, logn); - - /* - * Compute the signature. - */ - s1tmp = (int16_t *)tx; - sqn = 0; - ng = 0; - for (u = 0; u < n; u ++) { - int32_t z; - - z = (int32_t)hm[u] - (int32_t)fpr_rint(t0[u]); - sqn += (uint32_t)(z * z); - ng |= sqn; - s1tmp[u] = (int16_t)z; - } - sqn |= -(ng >> 31); - - /* - * With "normal" degrees (e.g. 512 or 1024), it is very - * improbable that the computed vector is not short enough; - * however, it may happen in practice for the very reduced - * versions (e.g. degree 16 or below). In that case, the caller - * will loop, and we must not write anything into s2[] because - * s2[] may overlap with the hashed message hm[] and we need - * hm[] for the next iteration. - */ - s2tmp = (int16_t *)tmp; - for (u = 0; u < n; u ++) { - s2tmp[u] = (int16_t)-fpr_rint(t1[u]); - } - if (Zf(is_short_half)(sqn, s2tmp, logn)) { - memcpy(s2, s2tmp, n * sizeof *s2); - memcpy(tmp, s1tmp, n * sizeof *s1tmp); - return 1; - } - return 0; -} - -/* - * Compute a signature: the signature contains two vectors, s1 and s2. - * The s1 vector is not returned. The squared norm of (s1,s2) is - * computed, and if it is short enough, then s2 is returned into the - * s2[] buffer, and 1 is returned; otherwise, s2[] is untouched and 0 is - * returned; the caller should then try again. - * - * tmp[] must have room for at least nine polynomials. 
- */ -static int -do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2, - const int8_t *restrict f, const int8_t *restrict g, - const int8_t *restrict F, const int8_t *restrict G, - const uint16_t *hm, unsigned logn, fpr *restrict tmp) -{ - size_t n, u; - fpr *t0, *t1, *tx, *ty; - fpr *b00, *b01, *b10, *b11, *g00, *g01, *g11; - fpr ni; - uint32_t sqn, ng; - int16_t *s1tmp, *s2tmp; - - n = MKN(logn); - - /* - * Lattice basis is B = [[g, -f], [G, -F]]. We convert it to FFT. - */ - b00 = tmp; - b01 = b00 + n; - b10 = b01 + n; - b11 = b10 + n; - smallints_to_fpr(b01, f, logn); - smallints_to_fpr(b00, g, logn); - smallints_to_fpr(b11, F, logn); - smallints_to_fpr(b10, G, logn); - Zf(FFT)(b01, logn); - Zf(FFT)(b00, logn); - Zf(FFT)(b11, logn); - Zf(FFT)(b10, logn); - Zf(poly_neg)(b01, logn); - Zf(poly_neg)(b11, logn); - - /* - * Compute the Gram matrix G = B·B*. Formulas are: - * g00 = b00*adj(b00) + b01*adj(b01) - * g01 = b00*adj(b10) + b01*adj(b11) - * g10 = b10*adj(b00) + b11*adj(b01) - * g11 = b10*adj(b10) + b11*adj(b11) - * - * For historical reasons, this implementation uses - * g00, g01 and g11 (upper triangle). g10 is not kept - * since it is equal to adj(g01). - * - * We _replace_ the matrix B with the Gram matrix, but we - * must keep b01 and b11 for computing the target vector. 
- */ - t0 = b11 + n; - t1 = t0 + n; - - memcpy(t0, b01, n * sizeof *b01); - Zf(poly_mulselfadj_fft)(t0, logn); // t0 <- b01*adj(b01) - - memcpy(t1, b00, n * sizeof *b00); - Zf(poly_muladj_fft)(t1, b10, logn); // t1 <- b00*adj(b10) - Zf(poly_mulselfadj_fft)(b00, logn); // b00 <- b00*adj(b00) - Zf(poly_add)(b00, t0, logn); // b00 <- g00 - memcpy(t0, b01, n * sizeof *b01); - Zf(poly_muladj_fft)(b01, b11, logn); // b01 <- b01*adj(b11) - Zf(poly_add)(b01, t1, logn); // b01 <- g01 - - Zf(poly_mulselfadj_fft)(b10, logn); // b10 <- b10*adj(b10) - memcpy(t1, b11, n * sizeof *b11); - Zf(poly_mulselfadj_fft)(t1, logn); // t1 <- b11*adj(b11) - Zf(poly_add)(b10, t1, logn); // b10 <- g11 - - /* - * We rename variables to make things clearer. The three elements - * of the Gram matrix uses the first 3*n slots of tmp[], followed - * by b11 and b01 (in that order). - */ - g00 = b00; - g01 = b01; - g11 = b10; - b01 = t0; - t0 = b01 + n; - t1 = t0 + n; - - /* - * Memory layout at that point: - * g00 g01 g11 b11 b01 t0 t1 - */ - - /* - * Set the target vector to [hm, 0] (hm is the hashed message). - */ - for (u = 0; u < n; u ++) { - t0[u] = fpr_of(hm[u]); - /* This is implicit. - t1[u] = fpr_zero; - */ - } - - /* - * Apply the lattice basis to obtain the real target - * vector (after normalization with regards to modulus). - */ - Zf(FFT)(t0, logn); - ni = fpr_inverse_of_q; - memcpy(t1, t0, n * sizeof *t0); - Zf(poly_mul_fft)(t1, b01, logn); - Zf(poly_mulconst)(t1, fpr_neg(ni), logn); - Zf(poly_mul_fft)(t0, b11, logn); - Zf(poly_mulconst)(t0, ni, logn); - - /* - * b01 and b11 can be discarded, so we move back (t0,t1). - * Memory layout is now: - * g00 g01 g11 t0 t1 - */ - memcpy(b11, t0, n * 2 * sizeof *t0); - t0 = g11 + n; - t1 = t0 + n; - - /* - * Apply sampling; result is written over (t0,t1). 
- */ - ffSampling_fft_dyntree(samp, samp_ctx, - t0, t1, g00, g01, g11, logn, t1 + n); - - /* - * We arrange the layout back to: - * b00 b01 b10 b11 t0 t1 - * - * We did not conserve the matrix basis, so we must recompute - * it now. - */ - b00 = tmp; - b01 = b00 + n; - b10 = b01 + n; - b11 = b10 + n; - memmove(b11 + n, t0, n * 2 * sizeof *t0); - t0 = b11 + n; - t1 = t0 + n; - smallints_to_fpr(b01, f, logn); - smallints_to_fpr(b00, g, logn); - smallints_to_fpr(b11, F, logn); - smallints_to_fpr(b10, G, logn); - Zf(FFT)(b01, logn); - Zf(FFT)(b00, logn); - Zf(FFT)(b11, logn); - Zf(FFT)(b10, logn); - Zf(poly_neg)(b01, logn); - Zf(poly_neg)(b11, logn); - tx = t1 + n; - ty = tx + n; - - /* - * Get the lattice point corresponding to that tiny vector. - */ - memcpy(tx, t0, n * sizeof *t0); - memcpy(ty, t1, n * sizeof *t1); - Zf(poly_mul_fft)(tx, b00, logn); - Zf(poly_mul_fft)(ty, b10, logn); - Zf(poly_add)(tx, ty, logn); - memcpy(ty, t0, n * sizeof *t0); - Zf(poly_mul_fft)(ty, b01, logn); - - memcpy(t0, tx, n * sizeof *tx); - Zf(poly_mul_fft)(t1, b11, logn); - Zf(poly_add)(t1, ty, logn); - Zf(iFFT)(t0, logn); - Zf(iFFT)(t1, logn); - - s1tmp = (int16_t *)tx; - sqn = 0; - ng = 0; - for (u = 0; u < n; u ++) { - int32_t z; - - z = (int32_t)hm[u] - (int32_t)fpr_rint(t0[u]); - sqn += (uint32_t)(z * z); - ng |= sqn; - s1tmp[u] = (int16_t)z; - } - sqn |= -(ng >> 31); - - /* - * With "normal" degrees (e.g. 512 or 1024), it is very - * improbable that the computed vector is not short enough; - * however, it may happen in practice for the very reduced - * versions (e.g. degree 16 or below). In that case, the caller - * will loop, and we must not write anything into s2[] because - * s2[] may overlap with the hashed message hm[] and we need - * hm[] for the next iteration. 
- */ - s2tmp = (int16_t *)tmp; - for (u = 0; u < n; u ++) { - s2tmp[u] = (int16_t)-fpr_rint(t1[u]); - } - if (Zf(is_short_half)(sqn, s2tmp, logn)) { - memcpy(s2, s2tmp, n * sizeof *s2); - memcpy(tmp, s1tmp, n * sizeof *s1tmp); - return 1; - } - return 0; -} - -/* - * Sample an integer value along a half-gaussian distribution centered - * on zero and standard deviation 1.8205, with a precision of 72 bits. - */ -TARGET_AVX2 -int -Zf(gaussian0_sampler)(prng *p) -{ -#if FALCON_AVX2 // yyyAVX2+1 - - /* - * High words. - */ - static const union { - uint16_t u16[16]; - __m256i ymm[1]; - } rhi15 = { - { - 0x51FB, 0x2A69, 0x113E, 0x0568, - 0x014A, 0x003B, 0x0008, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000 - } - }; - - static const union { - uint64_t u64[20]; - __m256i ymm[5]; - } rlo57 = { - { - 0x1F42ED3AC391802, 0x12B181F3F7DDB82, - 0x1CDD0934829C1FF, 0x1754377C7994AE4, - 0x1846CAEF33F1F6F, 0x14AC754ED74BD5F, - 0x024DD542B776AE4, 0x1A1FFDC65AD63DA, - 0x01F80D88A7B6428, 0x001C3FDB2040C69, - 0x00012CF24D031FB, 0x00000949F8B091F, - 0x0000003665DA998, 0x00000000EBF6EBB, - 0x0000000002F5D7E, 0x000000000007098, - 0x0000000000000C6, 0x000000000000001, - 0x000000000000000, 0x000000000000000 - } - }; - - uint64_t lo; - unsigned hi; - __m256i xhi, rhi, gthi, eqhi, eqm; - __m256i xlo, gtlo0, gtlo1, gtlo2, gtlo3, gtlo4; - __m128i t, zt; - int r; - - /* - * Get a 72-bit random value and split it into a low part - * (57 bits) and a high part (15 bits) - */ - lo = prng_get_u64(p); - hi = prng_get_u8(p); - hi = (hi << 7) | (unsigned)(lo >> 57); - lo &= 0x1FFFFFFFFFFFFFF; - - /* - * Broadcast the high part and compare it with the relevant - * values. We need both a "greater than" and an "equal" - * comparisons. 
- */ - xhi = _mm256_broadcastw_epi16(_mm_cvtsi32_si128(hi)); - rhi = _mm256_loadu_si256(&rhi15.ymm[0]); - gthi = _mm256_cmpgt_epi16(rhi, xhi); - eqhi = _mm256_cmpeq_epi16(rhi, xhi); - - /* - * The result is the number of 72-bit values (among the list of 19) - * which are greater than the 72-bit random value. We first count - * all non-zero 16-bit elements in the first eight of gthi. Such - * elements have value -1 or 0, so we first negate them. - */ - t = _mm_srli_epi16(_mm256_castsi256_si128(gthi), 15); - zt = _mm_setzero_si128(); - t = _mm_hadd_epi16(t, zt); - t = _mm_hadd_epi16(t, zt); - t = _mm_hadd_epi16(t, zt); - r = _mm_cvtsi128_si32(t); - - /* - * We must look at the low bits for all values for which the - * high bits are an "equal" match; values 8-18 all have the - * same high bits (0). - * On 32-bit systems, 'lo' really is two registers, requiring - * some extra code. - */ -#if defined(__x86_64__) || defined(_M_X64) - xlo = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(*(int64_t *)&lo)); -#else - { - uint32_t e0, e1; - int32_t f0, f1; - - e0 = (uint32_t)lo; - e1 = (uint32_t)(lo >> 32); - f0 = *(int32_t *)&e0; - f1 = *(int32_t *)&e1; - xlo = _mm256_set_epi32(f1, f0, f1, f0, f1, f0, f1, f0); - } -#endif - gtlo0 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[0]), xlo); - gtlo1 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[1]), xlo); - gtlo2 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[2]), xlo); - gtlo3 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[3]), xlo); - gtlo4 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[4]), xlo); - - /* - * Keep only comparison results that correspond to the non-zero - * elements in eqhi. 
- */ - gtlo0 = _mm256_and_si256(gtlo0, _mm256_cvtepi16_epi64( - _mm256_castsi256_si128(eqhi))); - gtlo1 = _mm256_and_si256(gtlo1, _mm256_cvtepi16_epi64( - _mm256_castsi256_si128(_mm256_bsrli_epi128(eqhi, 8)))); - eqm = _mm256_permute4x64_epi64(eqhi, 0xFF); - gtlo2 = _mm256_and_si256(gtlo2, eqm); - gtlo3 = _mm256_and_si256(gtlo3, eqm); - gtlo4 = _mm256_and_si256(gtlo4, eqm); - - /* - * Add all values to count the total number of "-1" elements. - * Since the first eight "high" words are all different, only - * one element (at most) in gtlo0:gtlo1 can be non-zero; however, - * if the high word of the random value is zero, then many - * elements of gtlo2:gtlo3:gtlo4 can be non-zero. - */ - gtlo0 = _mm256_or_si256(gtlo0, gtlo1); - gtlo0 = _mm256_add_epi64( - _mm256_add_epi64(gtlo0, gtlo2), - _mm256_add_epi64(gtlo3, gtlo4)); - t = _mm_add_epi64( - _mm256_castsi256_si128(gtlo0), - _mm256_extracti128_si256(gtlo0, 1)); - t = _mm_add_epi64(t, _mm_srli_si128(t, 8)); - r -= _mm_cvtsi128_si32(t); - - return r; - -#else // yyyAVX2+0 - - static const uint32_t dist[] = { - 10745844u, 3068844u, 3741698u, - 5559083u, 1580863u, 8248194u, - 2260429u, 13669192u, 2736639u, - 708981u, 4421575u, 10046180u, - 169348u, 7122675u, 4136815u, - 30538u, 13063405u, 7650655u, - 4132u, 14505003u, 7826148u, - 417u, 16768101u, 11363290u, - 31u, 8444042u, 8086568u, - 1u, 12844466u, 265321u, - 0u, 1232676u, 13644283u, - 0u, 38047u, 9111839u, - 0u, 870u, 6138264u, - 0u, 14u, 12545723u, - 0u, 0u, 3104126u, - 0u, 0u, 28824u, - 0u, 0u, 198u, - 0u, 0u, 1u - }; - - uint32_t v0, v1, v2, hi; - uint64_t lo; - size_t u; - int z; - - /* - * Get a random 72-bit value, into three 24-bit limbs v0..v2. - */ - lo = prng_get_u64(p); - hi = prng_get_u8(p); - v0 = (uint32_t)lo & 0xFFFFFF; - v1 = (uint32_t)(lo >> 24) & 0xFFFFFF; - v2 = (uint32_t)(lo >> 48) | (hi << 16); - - /* - * Sampled value is z, such that v0..v2 is lower than the first - * z elements of the table. 
- */ - z = 0; - for (u = 0; u < (sizeof dist) / sizeof(dist[0]); u += 3) { - uint32_t w0, w1, w2, cc; - - w0 = dist[u + 2]; - w1 = dist[u + 1]; - w2 = dist[u + 0]; - cc = (v0 - w0) >> 31; - cc = (v1 - w1 - cc) >> 31; - cc = (v2 - w2 - cc) >> 31; - z += (int)cc; - } - return z; - -#endif // yyyAVX2- -} - -/* - * Sample a bit with probability exp(-x) for some x >= 0. - */ -TARGET_AVX2 -static int -BerExp(prng *p, fpr x, fpr ccs) -{ - int s, i; - fpr r; - uint32_t sw, w; - uint64_t z; - - /* - * Reduce x modulo log(2): x = s*log(2) + r, with s an integer, - * and 0 <= r < log(2). Since x >= 0, we can use fpr_trunc(). - */ - s = (int)fpr_trunc(fpr_mul(x, fpr_inv_log2)); - r = fpr_sub(x, fpr_mul(fpr_of(s), fpr_log2)); - - /* - * It may happen (quite rarely) that s >= 64; if sigma = 1.2 - * (the minimum value for sigma), r = 0 and b = 1, then we get - * s >= 64 if the half-Gaussian produced a z >= 13, which happens - * with probability about 0.000000000230383991, which is - * approximatively equal to 2^(-32). In any case, if s >= 64, - * then BerExp will be non-zero with probability less than - * 2^(-64), so we can simply saturate s at 63. - */ - sw = (uint32_t)s; - sw ^= (sw ^ 63) & -((63 - sw) >> 31); - s = (int)sw; - - /* - * Compute exp(-r); we know that 0 <= r < log(2) at this point, so - * we can use fpr_expm_p63(), which yields a result scaled to 2^63. - * We scale it up to 2^64, then right-shift it by s bits because - * we really want exp(-x) = 2^(-s)*exp(-r). - * - * The "-1" operation makes sure that the value fits on 64 bits - * (i.e. if r = 0, we may get 2^64, and we prefer 2^64-1 in that - * case). The bias is negligible since fpr_expm_p63() only computes - * with 51 bits of precision or so. - */ - z = ((fpr_expm_p63(r, ccs) << 1) - 1) >> s; - - /* - * Sample a bit with probability exp(-x). 
Since x = s*log(2) + r, - * exp(-x) = 2^-s * exp(-r), we compare lazily exp(-x) with the - * PRNG output to limit its consumption, the sign of the difference - * yields the expected result. - */ - i = 64; - do { - i -= 8; - w = prng_get_u8(p) - ((uint32_t)(z >> i) & 0xFF); - } while (!w && i > 0); - return (int)(w >> 31); -} - -/* - * The sampler produces a random integer that follows a discrete Gaussian - * distribution, centered on mu, and with standard deviation sigma. The - * provided parameter isigma is equal to 1/sigma. - * - * The value of sigma MUST lie between 1 and 2 (i.e. isigma lies between - * 0.5 and 1); in Falcon, sigma should always be between 1.2 and 1.9. - */ -TARGET_AVX2 -int -Zf(sampler)(void *ctx, fpr mu, fpr isigma) -{ - sampler_context *spc; - int s; - fpr r, dss, ccs; - - spc = ctx; - - /* - * Center is mu. We compute mu = s + r where s is an integer - * and 0 <= r < 1. - */ - s = (int)fpr_floor(mu); - r = fpr_sub(mu, fpr_of(s)); - - /* - * dss = 1/(2*sigma^2) = 0.5*(isigma^2). - */ - dss = fpr_half(fpr_sqr(isigma)); - - /* - * ccs = sigma_min / sigma = sigma_min * isigma. - */ - ccs = fpr_mul(isigma, spc->sigma_min); - - /* - * We now need to sample on center r. - */ - for (;;) { - int z0, z, b; - fpr x; - - /* - * Sample z for a Gaussian distribution. Then get a - * random bit b to turn the sampling into a bimodal - * distribution: if b = 1, we use z+1, otherwise we - * use -z. We thus have two situations: - * - * - b = 1: z >= 1 and sampled against a Gaussian - * centered on 1. - * - b = 0: z <= 0 and sampled against a Gaussian - * centered on 0. - */ - z0 = Zf(gaussian0_sampler)(&spc->p); - b = prng_get_u8(&spc->p) & 1; - z = b + ((b << 1) - 1) * z0; - - /* - * Rejection sampling. We want a Gaussian centered on r; - * but we sampled against a Gaussian centered on b (0 or - * 1). But we know that z is always in the range where - * our sampling distribution is greater than the Gaussian - * distribution, so rejection works. 
- * - * We got z with distribution: - * G(z) = exp(-((z-b)^2)/(2*sigma0^2)) - * We target distribution: - * S(z) = exp(-((z-r)^2)/(2*sigma^2)) - * Rejection sampling works by keeping the value z with - * probability S(z)/G(z), and starting again otherwise. - * This requires S(z) <= G(z), which is the case here. - * Thus, we simply need to keep our z with probability: - * P = exp(-x) - * where: - * x = ((z-r)^2)/(2*sigma^2) - ((z-b)^2)/(2*sigma0^2) - * - * Here, we scale up the Bernouilli distribution, which - * makes rejection more probable, but makes rejection - * rate sufficiently decorrelated from the Gaussian - * center and standard deviation that the whole sampler - * can be said to be constant-time. - */ - x = fpr_mul(fpr_sqr(fpr_sub(fpr_of(z), r)), dss); - x = fpr_sub(x, fpr_mul(fpr_of(z0 * z0), fpr_inv_2sqrsigma0)); - if (BerExp(&spc->p, x, ccs)) { - /* - * Rejection sampling was centered on r, but the - * actual center is mu = s + r. - */ - return s + z; - } - } -} - -/* see inner.h */ -void -Zf(sign_tree)(int16_t *sig, inner_shake256_context *rng, - const fpr *restrict expanded_key, - const uint16_t *hm, unsigned logn, uint8_t *tmp) -{ - fpr *ftmp; - - ftmp = (fpr *)tmp; - for (;;) { - /* - * Signature produces short vectors s1 and s2. The - * signature is acceptable only if the aggregate vector - * s1,s2 is short; we must use the same bound as the - * verifier. - * - * If the signature is acceptable, then we return only s2 - * (the verifier recomputes s1 from s2, the hashed message, - * and the public key). - */ - sampler_context spc; - samplerZ samp; - void *samp_ctx; - - /* - * Normal sampling. We use a fast PRNG seeded from our - * SHAKE context ('rng'). - */ - spc.sigma_min = (logn == 10) - ? fpr_sigma_min_10 - : fpr_sigma_min_9; - Zf(prng_init)(&spc.p, rng); - samp = Zf(sampler); - samp_ctx = &spc; - - /* - * Do the actual signature. 
- */ - if (do_sign_tree(samp, samp_ctx, sig, - expanded_key, hm, logn, ftmp)) - { - break; - } - } -} - -/* see inner.h */ -void -Zf(sign_dyn)(int16_t *sig, inner_shake256_context *rng, - const int8_t *restrict f, const int8_t *restrict g, - const int8_t *restrict F, const int8_t *restrict G, - const uint16_t *hm, unsigned logn, uint8_t *tmp) -{ - fpr *ftmp; - - ftmp = (fpr *)tmp; - for (;;) { - /* - * Signature produces short vectors s1 and s2. The - * signature is acceptable only if the aggregate vector - * s1,s2 is short; we must use the same bound as the - * verifier. - * - * If the signature is acceptable, then we return only s2 - * (the verifier recomputes s1 from s2, the hashed message, - * and the public key). - */ - sampler_context spc; - samplerZ samp; - void *samp_ctx; - - /* - * Normal sampling. We use a fast PRNG seeded from our - * SHAKE context ('rng'). - */ - spc.sigma_min = (logn == 10) - ? fpr_sigma_min_10 - : fpr_sigma_min_9; - Zf(prng_init)(&spc.p, rng); - samp = Zf(sampler); - samp_ctx = &spc; - - /* - * Do the actual signature. - */ - if (do_sign_dyn(samp, samp_ctx, sig, - f, g, F, G, hm, logn, ftmp)) - { - break; - } - } -} diff --git a/crypto_sign/falcon-512-tree/m4-ct/vrfy.c b/crypto_sign/falcon-512-tree/m4-ct/vrfy.c deleted file mode 100644 index c74a3dd3..00000000 --- a/crypto_sign/falcon-512-tree/m4-ct/vrfy.c +++ /dev/null @@ -1,871 +0,0 @@ -/* - * Falcon signature verification. 
- * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include "inner.h" - -/* ===================================================================== */ -/* - * Constants for NTT. - * - * n = 2^logn (2 <= n <= 1024) - * phi = X^n + 1 - * q = 12289 - * q0i = -1/q mod 2^16 - * R = 2^16 mod q - * R2 = 2^32 mod q - */ - -#define Q 12289 -#define Q0I 12287 -#define R 4091 -#define R2 10952 - -/* - * Table for NTT, binary case: - * GMb[x] = R*(g^rev(x)) mod q - * where g = 7 (it is a 2048-th primitive root of 1 modulo q) - * and rev() is the bit-reversal function over 10 bits. 
- */ -static const uint16_t GMb[] = { - 4091, 7888, 11060, 11208, 6960, 4342, 6275, 9759, - 1591, 6399, 9477, 5266, 586, 5825, 7538, 9710, - 1134, 6407, 1711, 965, 7099, 7674, 3743, 6442, - 10414, 8100, 1885, 1688, 1364, 10329, 10164, 9180, - 12210, 6240, 997, 117, 4783, 4407, 1549, 7072, - 2829, 6458, 4431, 8877, 7144, 2564, 5664, 4042, - 12189, 432, 10751, 1237, 7610, 1534, 3983, 7863, - 2181, 6308, 8720, 6570, 4843, 1690, 14, 3872, - 5569, 9368, 12163, 2019, 7543, 2315, 4673, 7340, - 1553, 1156, 8401, 11389, 1020, 2967, 10772, 7045, - 3316, 11236, 5285, 11578, 10637, 10086, 9493, 6180, - 9277, 6130, 3323, 883, 10469, 489, 1502, 2851, - 11061, 9729, 2742, 12241, 4970, 10481, 10078, 1195, - 730, 1762, 3854, 2030, 5892, 10922, 9020, 5274, - 9179, 3604, 3782, 10206, 3180, 3467, 4668, 2446, - 7613, 9386, 834, 7703, 6836, 3403, 5351, 12276, - 3580, 1739, 10820, 9787, 10209, 4070, 12250, 8525, - 10401, 2749, 7338, 10574, 6040, 943, 9330, 1477, - 6865, 9668, 3585, 6633, 12145, 4063, 3684, 7680, - 8188, 6902, 3533, 9807, 6090, 727, 10099, 7003, - 6945, 1949, 9731, 10559, 6057, 378, 7871, 8763, - 8901, 9229, 8846, 4551, 9589, 11664, 7630, 8821, - 5680, 4956, 6251, 8388, 10156, 8723, 2341, 3159, - 1467, 5460, 8553, 7783, 2649, 2320, 9036, 6188, - 737, 3698, 4699, 5753, 9046, 3687, 16, 914, - 5186, 10531, 4552, 1964, 3509, 8436, 7516, 5381, - 10733, 3281, 7037, 1060, 2895, 7156, 8887, 5357, - 6409, 8197, 2962, 6375, 5064, 6634, 5625, 278, - 932, 10229, 8927, 7642, 351, 9298, 237, 5858, - 7692, 3146, 12126, 7586, 2053, 11285, 3802, 5204, - 4602, 1748, 11300, 340, 3711, 4614, 300, 10993, - 5070, 10049, 11616, 12247, 7421, 10707, 5746, 5654, - 3835, 5553, 1224, 8476, 9237, 3845, 250, 11209, - 4225, 6326, 9680, 12254, 4136, 2778, 692, 8808, - 6410, 6718, 10105, 10418, 3759, 7356, 11361, 8433, - 6437, 3652, 6342, 8978, 5391, 2272, 6476, 7416, - 8418, 10824, 11986, 5733, 876, 7030, 2167, 2436, - 3442, 9217, 8206, 4858, 5964, 2746, 7178, 1434, - 7389, 8879, 10661, 11457, 4220, 
1432, 10832, 4328, - 8557, 1867, 9454, 2416, 3816, 9076, 686, 5393, - 2523, 4339, 6115, 619, 937, 2834, 7775, 3279, - 2363, 7488, 6112, 5056, 824, 10204, 11690, 1113, - 2727, 9848, 896, 2028, 5075, 2654, 10464, 7884, - 12169, 5434, 3070, 6400, 9132, 11672, 12153, 4520, - 1273, 9739, 11468, 9937, 10039, 9720, 2262, 9399, - 11192, 315, 4511, 1158, 6061, 6751, 11865, 357, - 7367, 4550, 983, 8534, 8352, 10126, 7530, 9253, - 4367, 5221, 3999, 8777, 3161, 6990, 4130, 11652, - 3374, 11477, 1753, 292, 8681, 2806, 10378, 12188, - 5800, 11811, 3181, 1988, 1024, 9340, 2477, 10928, - 4582, 6750, 3619, 5503, 5233, 2463, 8470, 7650, - 7964, 6395, 1071, 1272, 3474, 11045, 3291, 11344, - 8502, 9478, 9837, 1253, 1857, 6233, 4720, 11561, - 6034, 9817, 3339, 1797, 2879, 6242, 5200, 2114, - 7962, 9353, 11363, 5475, 6084, 9601, 4108, 7323, - 10438, 9471, 1271, 408, 6911, 3079, 360, 8276, - 11535, 9156, 9049, 11539, 850, 8617, 784, 7919, - 8334, 12170, 1846, 10213, 12184, 7827, 11903, 5600, - 9779, 1012, 721, 2784, 6676, 6552, 5348, 4424, - 6816, 8405, 9959, 5150, 2356, 5552, 5267, 1333, - 8801, 9661, 7308, 5788, 4910, 909, 11613, 4395, - 8238, 6686, 4302, 3044, 2285, 12249, 1963, 9216, - 4296, 11918, 695, 4371, 9793, 4884, 2411, 10230, - 2650, 841, 3890, 10231, 7248, 8505, 11196, 6688, - 4059, 6060, 3686, 4722, 11853, 5816, 7058, 6868, - 11137, 7926, 4894, 12284, 4102, 3908, 3610, 6525, - 7938, 7982, 11977, 6755, 537, 4562, 1623, 8227, - 11453, 7544, 906, 11816, 9548, 10858, 9703, 2815, - 11736, 6813, 6979, 819, 8903, 6271, 10843, 348, - 7514, 8339, 6439, 694, 852, 5659, 2781, 3716, - 11589, 3024, 1523, 8659, 4114, 10738, 3303, 5885, - 2978, 7289, 11884, 9123, 9323, 11830, 98, 2526, - 2116, 4131, 11407, 1844, 3645, 3916, 8133, 2224, - 10871, 8092, 9651, 5989, 7140, 8480, 1670, 159, - 10923, 4918, 128, 7312, 725, 9157, 5006, 6393, - 3494, 6043, 10972, 6181, 11838, 3423, 10514, 7668, - 3693, 6658, 6905, 11953, 10212, 11922, 9101, 8365, - 5110, 45, 2400, 1921, 4377, 2720, 1695, 51, - 
2808, 650, 1896, 9997, 9971, 11980, 8098, 4833, - 4135, 4257, 5838, 4765, 10985, 11532, 590, 12198, - 482, 12173, 2006, 7064, 10018, 3912, 12016, 10519, - 11362, 6954, 2210, 284, 5413, 6601, 3865, 10339, - 11188, 6231, 517, 9564, 11281, 3863, 1210, 4604, - 8160, 11447, 153, 7204, 5763, 5089, 9248, 12154, - 11748, 1354, 6672, 179, 5532, 2646, 5941, 12185, - 862, 3158, 477, 7279, 5678, 7914, 4254, 302, - 2893, 10114, 6890, 9560, 9647, 11905, 4098, 9824, - 10269, 1353, 10715, 5325, 6254, 3951, 1807, 6449, - 5159, 1308, 8315, 3404, 1877, 1231, 112, 6398, - 11724, 12272, 7286, 1459, 12274, 9896, 3456, 800, - 1397, 10678, 103, 7420, 7976, 936, 764, 632, - 7996, 8223, 8445, 7758, 10870, 9571, 2508, 1946, - 6524, 10158, 1044, 4338, 2457, 3641, 1659, 4139, - 4688, 9733, 11148, 3946, 2082, 5261, 2036, 11850, - 7636, 12236, 5366, 2380, 1399, 7720, 2100, 3217, - 10912, 8898, 7578, 11995, 2791, 1215, 3355, 2711, - 2267, 2004, 8568, 10176, 3214, 2337, 1750, 4729, - 4997, 7415, 6315, 12044, 4374, 7157, 4844, 211, - 8003, 10159, 9290, 11481, 1735, 2336, 5793, 9875, - 8192, 986, 7527, 1401, 870, 3615, 8465, 2756, - 9770, 2034, 10168, 3264, 6132, 54, 2880, 4763, - 11805, 3074, 8286, 9428, 4881, 6933, 1090, 10038, - 2567, 708, 893, 6465, 4962, 10024, 2090, 5718, - 10743, 780, 4733, 4623, 2134, 2087, 4802, 884, - 5372, 5795, 5938, 4333, 6559, 7549, 5269, 10664, - 4252, 3260, 5917, 10814, 5768, 9983, 8096, 7791, - 6800, 7491, 6272, 1907, 10947, 6289, 11803, 6032, - 11449, 1171, 9201, 7933, 2479, 7970, 11337, 7062, - 8911, 6728, 6542, 8114, 8828, 6595, 3545, 4348, - 4610, 2205, 6999, 8106, 5560, 10390, 9321, 2499, - 2413, 7272, 6881, 10582, 9308, 9437, 3554, 3326, - 5991, 11969, 3415, 12283, 9838, 12063, 4332, 7830, - 11329, 6605, 12271, 2044, 11611, 7353, 11201, 11582, - 3733, 8943, 9978, 1627, 7168, 3935, 5050, 2762, - 7496, 10383, 755, 1654, 12053, 4952, 10134, 4394, - 6592, 7898, 7497, 8904, 12029, 3581, 10748, 5674, - 10358, 4901, 7414, 8771, 710, 6764, 8462, 7193, - 5371, 7274, 
11084, 290, 7864, 6827, 11822, 2509, - 6578, 4026, 5807, 1458, 5721, 5762, 4178, 2105, - 11621, 4852, 8897, 2856, 11510, 9264, 2520, 8776, - 7011, 2647, 1898, 7039, 5950, 11163, 5488, 6277, - 9182, 11456, 633, 10046, 11554, 5633, 9587, 2333, - 7008, 7084, 5047, 7199, 9865, 8997, 569, 6390, - 10845, 9679, 8268, 11472, 4203, 1997, 2, 9331, - 162, 6182, 2000, 3649, 9792, 6363, 7557, 6187, - 8510, 9935, 5536, 9019, 3706, 12009, 1452, 3067, - 5494, 9692, 4865, 6019, 7106, 9610, 4588, 10165, - 6261, 5887, 2652, 10172, 1580, 10379, 4638, 9949 -}; - -/* - * Table for inverse NTT, binary case: - * iGMb[x] = R*((1/g)^rev(x)) mod q - * Since g = 7, 1/g = 8778 mod 12289. - */ -static const uint16_t iGMb[] = { - 4091, 4401, 1081, 1229, 2530, 6014, 7947, 5329, - 2579, 4751, 6464, 11703, 7023, 2812, 5890, 10698, - 3109, 2125, 1960, 10925, 10601, 10404, 4189, 1875, - 5847, 8546, 4615, 5190, 11324, 10578, 5882, 11155, - 8417, 12275, 10599, 7446, 5719, 3569, 5981, 10108, - 4426, 8306, 10755, 4679, 11052, 1538, 11857, 100, - 8247, 6625, 9725, 5145, 3412, 7858, 5831, 9460, - 5217, 10740, 7882, 7506, 12172, 11292, 6049, 79, - 13, 6938, 8886, 5453, 4586, 11455, 2903, 4676, - 9843, 7621, 8822, 9109, 2083, 8507, 8685, 3110, - 7015, 3269, 1367, 6397, 10259, 8435, 10527, 11559, - 11094, 2211, 1808, 7319, 48, 9547, 2560, 1228, - 9438, 10787, 11800, 1820, 11406, 8966, 6159, 3012, - 6109, 2796, 2203, 1652, 711, 7004, 1053, 8973, - 5244, 1517, 9322, 11269, 900, 3888, 11133, 10736, - 4949, 7616, 9974, 4746, 10270, 126, 2921, 6720, - 6635, 6543, 1582, 4868, 42, 673, 2240, 7219, - 1296, 11989, 7675, 8578, 11949, 989, 10541, 7687, - 7085, 8487, 1004, 10236, 4703, 163, 9143, 4597, - 6431, 12052, 2991, 11938, 4647, 3362, 2060, 11357, - 12011, 6664, 5655, 7225, 5914, 9327, 4092, 5880, - 6932, 3402, 5133, 9394, 11229, 5252, 9008, 1556, - 6908, 4773, 3853, 8780, 10325, 7737, 1758, 7103, - 11375, 12273, 8602, 3243, 6536, 7590, 8591, 11552, - 6101, 3253, 9969, 9640, 4506, 3736, 6829, 10822, - 9130, 9948, 
3566, 2133, 3901, 6038, 7333, 6609, - 3468, 4659, 625, 2700, 7738, 3443, 3060, 3388, - 3526, 4418, 11911, 6232, 1730, 2558, 10340, 5344, - 5286, 2190, 11562, 6199, 2482, 8756, 5387, 4101, - 4609, 8605, 8226, 144, 5656, 8704, 2621, 5424, - 10812, 2959, 11346, 6249, 1715, 4951, 9540, 1888, - 3764, 39, 8219, 2080, 2502, 1469, 10550, 8709, - 5601, 1093, 3784, 5041, 2058, 8399, 11448, 9639, - 2059, 9878, 7405, 2496, 7918, 11594, 371, 7993, - 3073, 10326, 40, 10004, 9245, 7987, 5603, 4051, - 7894, 676, 11380, 7379, 6501, 4981, 2628, 3488, - 10956, 7022, 6737, 9933, 7139, 2330, 3884, 5473, - 7865, 6941, 5737, 5613, 9505, 11568, 11277, 2510, - 6689, 386, 4462, 105, 2076, 10443, 119, 3955, - 4370, 11505, 3672, 11439, 750, 3240, 3133, 754, - 4013, 11929, 9210, 5378, 11881, 11018, 2818, 1851, - 4966, 8181, 2688, 6205, 6814, 926, 2936, 4327, - 10175, 7089, 6047, 9410, 10492, 8950, 2472, 6255, - 728, 7569, 6056, 10432, 11036, 2452, 2811, 3787, - 945, 8998, 1244, 8815, 11017, 11218, 5894, 4325, - 4639, 3819, 9826, 7056, 6786, 8670, 5539, 7707, - 1361, 9812, 2949, 11265, 10301, 9108, 478, 6489, - 101, 1911, 9483, 3608, 11997, 10536, 812, 8915, - 637, 8159, 5299, 9128, 3512, 8290, 7068, 7922, - 3036, 4759, 2163, 3937, 3755, 11306, 7739, 4922, - 11932, 424, 5538, 6228, 11131, 7778, 11974, 1097, - 2890, 10027, 2569, 2250, 2352, 821, 2550, 11016, - 7769, 136, 617, 3157, 5889, 9219, 6855, 120, - 4405, 1825, 9635, 7214, 10261, 11393, 2441, 9562, - 11176, 599, 2085, 11465, 7233, 6177, 4801, 9926, - 9010, 4514, 9455, 11352, 11670, 6174, 7950, 9766, - 6896, 11603, 3213, 8473, 9873, 2835, 10422, 3732, - 7961, 1457, 10857, 8069, 832, 1628, 3410, 4900, - 10855, 5111, 9543, 6325, 7431, 4083, 3072, 8847, - 9853, 10122, 5259, 11413, 6556, 303, 1465, 3871, - 4873, 5813, 10017, 6898, 3311, 5947, 8637, 5852, - 3856, 928, 4933, 8530, 1871, 2184, 5571, 5879, - 3481, 11597, 9511, 8153, 35, 2609, 5963, 8064, - 1080, 12039, 8444, 3052, 3813, 11065, 6736, 8454, - 2340, 7651, 1910, 10709, 2117, 9637, 
6402, 6028, - 2124, 7701, 2679, 5183, 6270, 7424, 2597, 6795, - 9222, 10837, 280, 8583, 3270, 6753, 2354, 3779, - 6102, 4732, 5926, 2497, 8640, 10289, 6107, 12127, - 2958, 12287, 10292, 8086, 817, 4021, 2610, 1444, - 5899, 11720, 3292, 2424, 5090, 7242, 5205, 5281, - 9956, 2702, 6656, 735, 2243, 11656, 833, 3107, - 6012, 6801, 1126, 6339, 5250, 10391, 9642, 5278, - 3513, 9769, 3025, 779, 9433, 3392, 7437, 668, - 10184, 8111, 6527, 6568, 10831, 6482, 8263, 5711, - 9780, 467, 5462, 4425, 11999, 1205, 5015, 6918, - 5096, 3827, 5525, 11579, 3518, 4875, 7388, 1931, - 6615, 1541, 8708, 260, 3385, 4792, 4391, 5697, - 7895, 2155, 7337, 236, 10635, 11534, 1906, 4793, - 9527, 7239, 8354, 5121, 10662, 2311, 3346, 8556, - 707, 1088, 4936, 678, 10245, 18, 5684, 960, - 4459, 7957, 226, 2451, 6, 8874, 320, 6298, - 8963, 8735, 2852, 2981, 1707, 5408, 5017, 9876, - 9790, 2968, 1899, 6729, 4183, 5290, 10084, 7679, - 7941, 8744, 5694, 3461, 4175, 5747, 5561, 3378, - 5227, 952, 4319, 9810, 4356, 3088, 11118, 840, - 6257, 486, 6000, 1342, 10382, 6017, 4798, 5489, - 4498, 4193, 2306, 6521, 1475, 6372, 9029, 8037, - 1625, 7020, 4740, 5730, 7956, 6351, 6494, 6917, - 11405, 7487, 10202, 10155, 7666, 7556, 11509, 1546, - 6571, 10199, 2265, 7327, 5824, 11396, 11581, 9722, - 2251, 11199, 5356, 7408, 2861, 4003, 9215, 484, - 7526, 9409, 12235, 6157, 9025, 2121, 10255, 2519, - 9533, 3824, 8674, 11419, 10888, 4762, 11303, 4097, - 2414, 6496, 9953, 10554, 808, 2999, 2130, 4286, - 12078, 7445, 5132, 7915, 245, 5974, 4874, 7292, - 7560, 10539, 9952, 9075, 2113, 3721, 10285, 10022, - 9578, 8934, 11074, 9498, 294, 4711, 3391, 1377, - 9072, 10189, 4569, 10890, 9909, 6923, 53, 4653, - 439, 10253, 7028, 10207, 8343, 1141, 2556, 7601, - 8150, 10630, 8648, 9832, 7951, 11245, 2131, 5765, - 10343, 9781, 2718, 1419, 4531, 3844, 4066, 4293, - 11657, 11525, 11353, 4313, 4869, 12186, 1611, 10892, - 11489, 8833, 2393, 15, 10830, 5003, 17, 565, - 5891, 12177, 11058, 10412, 8885, 3974, 10981, 7130, - 5840, 10482, 
8338, 6035, 6964, 1574, 10936, 2020, - 2465, 8191, 384, 2642, 2729, 5399, 2175, 9396, - 11987, 8035, 4375, 6611, 5010, 11812, 9131, 11427, - 104, 6348, 9643, 6757, 12110, 5617, 10935, 541, - 135, 3041, 7200, 6526, 5085, 12136, 842, 4129, - 7685, 11079, 8426, 1008, 2725, 11772, 6058, 1101, - 1950, 8424, 5688, 6876, 12005, 10079, 5335, 927, - 1770, 273, 8377, 2271, 5225, 10283, 116, 11807, - 91, 11699, 757, 1304, 7524, 6451, 8032, 8154, - 7456, 4191, 309, 2318, 2292, 10393, 11639, 9481, - 12238, 10594, 9569, 7912, 10368, 9889, 12244, 7179, - 3924, 3188, 367, 2077, 336, 5384, 5631, 8596, - 4621, 1775, 8866, 451, 6108, 1317, 6246, 8795, - 5896, 7283, 3132, 11564, 4977, 12161, 7371, 1366, - 12130, 10619, 3809, 5149, 6300, 2638, 4197, 1418, - 10065, 4156, 8373, 8644, 10445, 882, 8158, 10173, - 9763, 12191, 459, 2966, 3166, 405, 5000, 9311, - 6404, 8986, 1551, 8175, 3630, 10766, 9265, 700, - 8573, 9508, 6630, 11437, 11595, 5850, 3950, 4775, - 11941, 1446, 6018, 3386, 11470, 5310, 5476, 553, - 9474, 2586, 1431, 2741, 473, 11383, 4745, 836, - 4062, 10666, 7727, 11752, 5534, 312, 4307, 4351, - 5764, 8679, 8381, 8187, 5, 7395, 4363, 1152, - 5421, 5231, 6473, 436, 7567, 8603, 6229, 8230 -}; - -/* - * Reduce a small signed integer modulo q. The source integer MUST - * be between -q/2 and +q/2. - */ -static inline uint32_t -mq_conv_small(int x) -{ - /* - * If x < 0, the cast to uint32_t will set the high bit to 1. - */ - uint32_t y; - - y = (uint32_t)x; - y += Q & -(y >> 31); - return y; -} - -/* - * Addition modulo q. Operands must be in the 0..q-1 range. - */ -static inline uint32_t -mq_add(uint32_t x, uint32_t y) -{ - /* - * We compute x + y - q. If the result is negative, then the - * high bit will be set, and 'd >> 31' will be equal to 1; - * thus '-(d >> 31)' will be an all-one pattern. Otherwise, - * it will be an all-zero pattern. In other words, this - * implements a conditional addition of q. 
- */ - uint32_t d; - - d = x + y - Q; - d += Q & -(d >> 31); - return d; -} - -/* - * Subtraction modulo q. Operands must be in the 0..q-1 range. - */ -static inline uint32_t -mq_sub(uint32_t x, uint32_t y) -{ - /* - * As in mq_add(), we use a conditional addition to ensure the - * result is in the 0..q-1 range. - */ - uint32_t d; - - d = x - y; - d += Q & -(d >> 31); - return d; -} - -/* - * Division by 2 modulo q. Operand must be in the 0..q-1 range. - */ -static inline uint32_t -mq_rshift1(uint32_t x) -{ - x += Q & -(x & 1); - return (x >> 1); -} - -/* - * Montgomery multiplication modulo q. If we set R = 2^16 mod q, then - * this function computes: x * y / R mod q - * Operands must be in the 0..q-1 range. - */ -static inline uint32_t -mq_montymul(uint32_t x, uint32_t y) -{ - uint32_t z, w; - - /* - * We compute x*y + k*q with a value of k chosen so that the 16 - * low bits of the result are 0. We can then shift the value. - * After the shift, result may still be larger than q, but it - * will be lower than 2*q, so a conditional subtraction works. - */ - - z = x * y; - w = ((z * Q0I) & 0xFFFF) * Q; - - /* - * When adding z and w, the result will have its low 16 bits - * equal to 0. Since x, y and z are lower than q, the sum will - * be no more than (2^15 - 1) * q + (q - 1)^2, which will - * fit on 29 bits. - */ - z = (z + w) >> 16; - - /* - * After the shift, analysis shows that the value will be less - * than 2q. We do a subtraction then conditional subtraction to - * ensure the result is in the expected range. - */ - z -= Q; - z += Q & -(z >> 31); - return z; -} - -/* - * Montgomery squaring (computes (x^2)/R). - */ -static inline uint32_t -mq_montysqr(uint32_t x) -{ - return mq_montymul(x, x); -} - -/* - * Divide x by y modulo q = 12289. - */ -static inline uint32_t -mq_div_12289(uint32_t x, uint32_t y) -{ - /* - * We invert y by computing y^(q-2) mod q. 
- * - * We use the following addition chain for exponent e = 12287: - * - * e0 = 1 - * e1 = 2 * e0 = 2 - * e2 = e1 + e0 = 3 - * e3 = e2 + e1 = 5 - * e4 = 2 * e3 = 10 - * e5 = 2 * e4 = 20 - * e6 = 2 * e5 = 40 - * e7 = 2 * e6 = 80 - * e8 = 2 * e7 = 160 - * e9 = e8 + e2 = 163 - * e10 = e9 + e8 = 323 - * e11 = 2 * e10 = 646 - * e12 = 2 * e11 = 1292 - * e13 = e12 + e9 = 1455 - * e14 = 2 * e13 = 2910 - * e15 = 2 * e14 = 5820 - * e16 = e15 + e10 = 6143 - * e17 = 2 * e16 = 12286 - * e18 = e17 + e0 = 12287 - * - * Additions on exponents are converted to Montgomery - * multiplications. We define all intermediate results as so - * many local variables, and let the C compiler work out which - * must be kept around. - */ - uint32_t y0, y1, y2, y3, y4, y5, y6, y7, y8, y9; - uint32_t y10, y11, y12, y13, y14, y15, y16, y17, y18; - - y0 = mq_montymul(y, R2); - y1 = mq_montysqr(y0); - y2 = mq_montymul(y1, y0); - y3 = mq_montymul(y2, y1); - y4 = mq_montysqr(y3); - y5 = mq_montysqr(y4); - y6 = mq_montysqr(y5); - y7 = mq_montysqr(y6); - y8 = mq_montysqr(y7); - y9 = mq_montymul(y8, y2); - y10 = mq_montymul(y9, y8); - y11 = mq_montysqr(y10); - y12 = mq_montysqr(y11); - y13 = mq_montymul(y12, y9); - y14 = mq_montysqr(y13); - y15 = mq_montysqr(y14); - y16 = mq_montymul(y15, y10); - y17 = mq_montysqr(y16); - y18 = mq_montymul(y17, y0); - - /* - * Final multiplication with x, which is not in Montgomery - * representation, computes the correct division result. - */ - return mq_montymul(y18, x); -} - -/* - * Compute NTT on a ring element. 
- */ -static void -mq_NTT(uint16_t *a, unsigned logn) -{ - size_t n, t, m; - - n = (size_t)1 << logn; - t = n; - for (m = 1; m < n; m <<= 1) { - size_t ht, i, j1; - - ht = t >> 1; - for (i = 0, j1 = 0; i < m; i ++, j1 += t) { - size_t j, j2; - uint32_t s; - - s = GMb[m + i]; - j2 = j1 + ht; - for (j = j1; j < j2; j ++) { - uint32_t u, v; - - u = a[j]; - v = mq_montymul(a[j + ht], s); - a[j] = (uint16_t)mq_add(u, v); - a[j + ht] = (uint16_t)mq_sub(u, v); - } - } - t = ht; - } -} - -/* - * Compute the inverse NTT on a ring element, binary case. - */ -static void -mq_iNTT(uint16_t *a, unsigned logn) -{ - size_t n, t, m; - uint32_t ni; - - n = (size_t)1 << logn; - t = 1; - m = n; - while (m > 1) { - size_t hm, dt, i, j1; - - hm = m >> 1; - dt = t << 1; - for (i = 0, j1 = 0; i < hm; i ++, j1 += dt) { - size_t j, j2; - uint32_t s; - - j2 = j1 + t; - s = iGMb[hm + i]; - for (j = j1; j < j2; j ++) { - uint32_t u, v, w; - - u = a[j]; - v = a[j + t]; - a[j] = (uint16_t)mq_add(u, v); - w = mq_sub(u, v); - a[j + t] = (uint16_t) - mq_montymul(w, s); - } - } - t = dt; - m = hm; - } - - /* - * To complete the inverse NTT, we must now divide all values by - * n (the vector size). We thus need the inverse of n, i.e. we - * need to divide 1 by 2 logn times. But we also want it in - * Montgomery representation, i.e. we also want to multiply it - * by R = 2^16. In the common case, this should be a simple right - * shift. The loop below is generic and works also in corner cases; - * its computation time is negligible. - */ - ni = R; - for (m = n; m > 1; m >>= 1) { - ni = mq_rshift1(ni); - } - for (m = 0; m < n; m ++) { - a[m] = (uint16_t)mq_montymul(a[m], ni); - } -} - -/* - * Convert a polynomial (mod q) to Montgomery representation. 
- */ -static void -mq_poly_tomonty(uint16_t *f, unsigned logn) -{ - size_t u, n; - - n = (size_t)1 << logn; - for (u = 0; u < n; u ++) { - f[u] = (uint16_t)mq_montymul(f[u], R2); - } -} - -/* - * Multiply two polynomials together (NTT representation, and using - * a Montgomery multiplication). Result f*g is written over f. - */ -static void -mq_poly_montymul_ntt(uint16_t *f, const uint16_t *g, unsigned logn) -{ - size_t u, n; - - n = (size_t)1 << logn; - for (u = 0; u < n; u ++) { - f[u] = (uint16_t)mq_montymul(f[u], g[u]); - } -} - -/* - * Subtract polynomial g from polynomial f. - */ -static void -mq_poly_sub(uint16_t *f, const uint16_t *g, unsigned logn) -{ - size_t u, n; - - n = (size_t)1 << logn; - for (u = 0; u < n; u ++) { - f[u] = (uint16_t)mq_sub(f[u], g[u]); - } -} - -/* ===================================================================== */ - -/* see inner.h */ -void -Zf(to_ntt_monty)(uint16_t *h, unsigned logn) -{ - mq_NTT(h, logn); - mq_poly_tomonty(h, logn); -} - -/* see inner.h */ -int -Zf(verify_raw)(const uint16_t *c0, const int16_t *s2, - const uint16_t *h, unsigned logn, uint8_t *tmp) -{ - size_t u, n; - uint16_t *tt; - - n = (size_t)1 << logn; - tt = (uint16_t *)tmp; - - /* - * Reduce s2 elements modulo q ([0..q-1] range). - */ - for (u = 0; u < n; u ++) { - uint32_t w; - - w = (uint32_t)s2[u]; - w += Q & -(w >> 31); - tt[u] = (uint16_t)w; - } - - /* - * Compute -s1 = s2*h - c0 mod phi mod q (in tt[]). - */ - mq_NTT(tt, logn); - mq_poly_montymul_ntt(tt, h, logn); - mq_iNTT(tt, logn); - mq_poly_sub(tt, c0, logn); - - /* - * Normalize -s1 elements into the [-q/2..q/2] range. - */ - for (u = 0; u < n; u ++) { - int32_t w; - - w = (int32_t)tt[u]; - w -= (int32_t)(Q & -(((Q >> 1) - (uint32_t)w) >> 31)); - ((int16_t *)tt)[u] = (int16_t)w; - } - - /* - * Signature is valid if and only if the aggregate (-s1,s2) vector - * is short enough. 
- */ - return Zf(is_short)((int16_t *)tt, s2, logn); -} - -/* see inner.h */ -int -Zf(compute_public)(uint16_t *h, - const int8_t *f, const int8_t *g, unsigned logn, uint8_t *tmp) -{ - size_t u, n; - uint16_t *tt; - - n = (size_t)1 << logn; - tt = (uint16_t *)tmp; - for (u = 0; u < n; u ++) { - tt[u] = (uint16_t)mq_conv_small(f[u]); - h[u] = (uint16_t)mq_conv_small(g[u]); - } - mq_NTT(h, logn); - mq_NTT(tt, logn); - for (u = 0; u < n; u ++) { - if (tt[u] == 0) { - return 0; - } - h[u] = (uint16_t)mq_div_12289(h[u], tt[u]); - } - mq_iNTT(h, logn); - return 1; -} - -/* see inner.h */ -int -Zf(complete_private)(int8_t *G, - const int8_t *f, const int8_t *g, const int8_t *F, - unsigned logn, uint8_t *tmp) -{ - size_t u, n; - uint16_t *t1, *t2; - - n = (size_t)1 << logn; - t1 = (uint16_t *)tmp; - t2 = t1 + n; - for (u = 0; u < n; u ++) { - t1[u] = (uint16_t)mq_conv_small(g[u]); - t2[u] = (uint16_t)mq_conv_small(F[u]); - } - mq_NTT(t1, logn); - mq_NTT(t2, logn); - mq_poly_tomonty(t1, logn); - mq_poly_montymul_ntt(t1, t2, logn); - for (u = 0; u < n; u ++) { - t2[u] = (uint16_t)mq_conv_small(f[u]); - } - mq_NTT(t2, logn); - for (u = 0; u < n; u ++) { - if (t2[u] == 0) { - return 0; - } - t1[u] = (uint16_t)mq_div_12289(t1[u], t2[u]); - } - mq_iNTT(t1, logn); - for (u = 0; u < n; u ++) { - uint32_t w; - int32_t gi; - - w = t1[u]; - w -= (Q & ~-((w - (Q >> 1)) >> 31)); - gi = *(int32_t *)&w; - if (gi < -127 || gi > +127) { - return 0; - } - G[u] = (int8_t)gi; - } - return 1; -} - -/* see inner.h */ -int -Zf(is_invertible)( - const int16_t *s2, unsigned logn, uint8_t *tmp) -{ - size_t u, n; - uint16_t *tt; - uint32_t r; - - n = (size_t)1 << logn; - tt = (uint16_t *)tmp; - for (u = 0; u < n; u ++) { - uint32_t w; - - w = (uint32_t)s2[u]; - w += Q & -(w >> 31); - tt[u] = (uint16_t)w; - } - mq_NTT(tt, logn); - r = 0; - for (u = 0; u < n; u ++) { - r |= (uint32_t)(tt[u] - 1); - } - return (int)(1u - (r >> 31)); -} - -/* see inner.h */ -int -Zf(verify_recover)(uint16_t *h, - const 
uint16_t *c0, const int16_t *s1, const int16_t *s2, - unsigned logn, uint8_t *tmp) -{ - size_t u, n; - uint16_t *tt; - uint32_t r; - - n = (size_t)1 << logn; - - /* - * Reduce elements of s1 and s2 modulo q; then write s2 into tt[] - * and c0 - s1 into h[]. - */ - tt = (uint16_t *)tmp; - for (u = 0; u < n; u ++) { - uint32_t w; - - w = (uint32_t)s2[u]; - w += Q & -(w >> 31); - tt[u] = (uint16_t)w; - - w = (uint32_t)s1[u]; - w += Q & -(w >> 31); - w = mq_sub(c0[u], w); - h[u] = (uint16_t)w; - } - - /* - * Compute h = (c0 - s1) / s2. If one of the coefficients of s2 - * is zero (in NTT representation) then the operation fails. We - * keep that information into a flag so that we do not deviate - * from strict constant-time processing; if all coefficients of - * s2 are non-zero, then the high bit of r will be zero. - */ - mq_NTT(tt, logn); - mq_NTT(h, logn); - r = 0; - for (u = 0; u < n; u ++) { - r |= (uint32_t)(tt[u] - 1); - h[u] = (uint16_t)mq_div_12289(h[u], tt[u]); - } - mq_iNTT(h, logn); - - /* - * Signature is acceptable if and only if it is short enough, - * and s2 was invertible mod phi mod q. The caller must still - * check that the rebuilt public key matches the expected - * value (e.g. through a hash). 
- */ - r = ~r & (uint32_t)-Zf(is_short)(s1, s2, logn); - return (int)(r >> 31); -} - -/* see inner.h */ -int -Zf(count_nttzero)(const int16_t *sig, unsigned logn, uint8_t *tmp) -{ - uint16_t *s2; - size_t u, n; - uint32_t r; - - n = (size_t)1 << logn; - s2 = (uint16_t *)tmp; - for (u = 0; u < n; u ++) { - uint32_t w; - - w = (uint32_t)sig[u]; - w += Q & -(w >> 31); - s2[u] = (uint16_t)w; - } - mq_NTT(s2, logn); - r = 0; - for (u = 0; u < n; u ++) { - uint32_t w; - - w = (uint32_t)s2[u] - 1u; - r += (w >> 31); - } - return (int)r; -} diff --git a/crypto_sign/falcon-512/m4-ct/README.txt b/crypto_sign/falcon-512/m4-ct/README.txt deleted file mode 100644 index 7bedf7f1..00000000 --- a/crypto_sign/falcon-512/m4-ct/README.txt +++ /dev/null @@ -1,137 +0,0 @@ -Falcon implementation for PQM4 (or even mupq in general). - - -There are multiple variants. Each variant is selected with the choice of -api.h (four choices: api512dyn.h, api512tree.h, api1024dyn.h, -api1024tree.h), and additional compile-time macro that are documented in -config.h and can be set either in config.h, or through command-line -flags passed to the C compiler. - -Choice of api.h: - - api512dyn.h - "Normal" Falcon-512. Private key is reasonably compact. The - Falcon LDL tree is internally recomputed for each signature. - - api512tree.h - Falcon-512 is key expansion. The Falcon LDL tree is computed - as part of the keygen, and returned as private key. This - speeds up signature generation, but also greatly enlarges - the private key size. - - api1024dyn.h - "Normal" Falcon-1024. - - api1024tree.h - Falcon-1024 with key expansion. - -Compile-time options (config.h): - - FALCON_FPEMU - Set to 1 to enable use of the internal constant-time emulation - of floating-point operations. - - FALCON_FPNATIVE - Set to 1 to use the native 'double' type and floating-point - operations. 
On architectures that lack a FPU, this will use the - compiler-provided floating-point emulation routines, which are - usually not constant-time (and sometimes return values which - do not follow IEEE-754 rounding rules). - - FALCON_ASM_CORTEXM4 - Set to 1 to use the M4 assembly routine for the constant-time - emulation of floating-point operations. These are faster than - the generic routines in C activated by FALCON_FPEMU. - -There is some internal autodetection that tries to select the right -values automatically, but it's safer to explicitly select things: - - To use the native 'double' type: - -DFALCON_FPNATIVE=1 - - To use the generic FP emulation code: - -DFALCON_FPEMU=1 -DFALCON_ASM_CORTEXM4=0 - - To use the M4 assembly code for FP emulation: - -DFALCON_FPEMU=1 -DFALCON_ASM_CORTEXM4=1 - -The code relying on the native 'double' type requires an implementation -that follows IEEE-754 rules with a 64-bit type. It works on 64-bit x86 -and PowerPC / POWER systems. On 32-bit x86, it tends to fail because the -80387 FPU is used with more precision; on such a system, use -'-msse2 -mfpmath=sse' to force use of the SSE2 unit (this might be the -default on some systems, e.g. Darwin / macOS). - - -IMPORTANT NOTES -=============== - - * The PQM4 API is implemented in pqm4.c. Since the M4 stack is usually - small (usual default is 4 kB), temporary buffers are statically - allocated. This implies that the crypto_sign_keypair(), crypto_sign() - and crypto_sign_open() functions are not thread-safe or reentrant. - Also, the static allocation is "forever". - - See the comments for the 'tmp' variable in pqm4.c; this gives the - relevant sizes. - - * When using expanded keys, the private key contains 64-bit values - (floating-point, i.e. 'double' or 'uint64_t' depending on the kind - of floating-point emulation that is used). On many systems, this - implies some alignment requirements. I.e. 
crypto_sign_keypair() and - crypto_sign() then require the 'sk' pointer to be suitably aligned. - On an ARM Cortex M4, 32-bit alignment is required (while the basic - RAM access opcodes tolerate unaligned accesses, the 'ldm' and 'stm' - opcodes need 32-bit aligned pointers). - - * When using the native 'double' type, the code has a dependency on - the sqrt() function. On x86, the relevant SSE2 opcode is inlined, - but the library function is still (potentially) invoked in case the - operand is negative, so that proper error management is performed. - This case does not happen in Falcon, but the library function is - still referenced, and explicitly linking with '-lm' may be - necessary. - - * When using the native 'double' type, do _NOT_ enable -ffast-math. - The internal rounding function relies on the usual trick: - when x >= 0, round(x) = (x + 2**52) - 2**52 - - This trick works only as long as each addition is rounded as per - the IEEE-754 rules to the exact precision of the 64-bit type. - When -ffast-math is enabled, the compiler may assume commutativity - and "optimize" that expression into 'round(x) = x', which does not - work at all. - - -TESTS -===== - -In the 'tests/' directory is a generator for known-answer tests, and the -expected file. The code comes from the NIST, but was modified to avoid a -dependency on OpenSSL. When compiling the C source file against the -selected Falcon implementation, an executable is produced, that, when -executed, generates an '*.req' and an '*.rsp' files. The .req file is -redundant (the .rsp file contains all the information, and some more). - -The expected .rsp files are provided as: - KAT512dyn.rsp Falcon-512, no expanded key - KAT512tree.rsp Falcon-512, with expanded key - KAT1024dyn.rsp Falcon-1024, no expanded key - KAT1024tree.rsp Falcon-1024, with expanded key - - -Normally, all computations are exact and the files are exactly -reproducible. 
However, some discrepancies may occur with the '*tree' -files in the following cases: - - - On big-endian architectures, the bytes in sk[] will be in a - different order. This is a side effect of putting the raw bytes - of the expanded key in sk[] (this could be fixed with some - reencoding pass, but this was not implemented yet). - - - If a non-exact IEEE-754 implementation is used, some of the - low bits of the values may be changed. This may happen if the - underlying implementation is not strictly faithful to rounding. - -As long as only the 'sk' lines are changed, then the public keys -and signature values are unimpacted. diff --git a/crypto_sign/falcon-512/m4-ct/api.h b/crypto_sign/falcon-512/m4-ct/api.h deleted file mode 100644 index 9275eaf9..00000000 --- a/crypto_sign/falcon-512/m4-ct/api.h +++ /dev/null @@ -1,17 +0,0 @@ -#include - -#define CRYPTO_SECRETKEYBYTES 1281 -#define CRYPTO_PUBLICKEYBYTES 897 -#define CRYPTO_BYTES 690 - -#define CRYPTO_ALGNAME "Falcon-512" - -int crypto_sign_keypair(unsigned char *pk, unsigned char *sk); - -int crypto_sign(unsigned char *sm, size_t *smlen, - const unsigned char *m, size_t mlen, - const unsigned char *sk); - -int crypto_sign_open(unsigned char *m, size_t *mlen, - const unsigned char *sm, size_t smlen, - const unsigned char *pk); diff --git a/crypto_sign/falcon-512/m4-ct/codec.c b/crypto_sign/falcon-512/m4-ct/codec.c deleted file mode 100644 index 5bd61424..00000000 --- a/crypto_sign/falcon-512/m4-ct/codec.c +++ /dev/null @@ -1,559 +0,0 @@ -/* - * Encoding/decoding of keys and signatures. 
- * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include "inner.h" - -/* see inner.h */ -size_t -Zf(modq_encode)( - void *out, size_t max_out_len, - const uint16_t *x, unsigned logn) -{ - size_t n, out_len, u; - uint8_t *buf; - uint32_t acc; - int acc_len; - - n = (size_t)1 << logn; - for (u = 0; u < n; u ++) { - if (x[u] >= 12289) { - return 0; - } - } - out_len = ((n * 14) + 7) >> 3; - if (out == NULL) { - return out_len; - } - if (out_len > max_out_len) { - return 0; - } - buf = out; - acc = 0; - acc_len = 0; - for (u = 0; u < n; u ++) { - acc = (acc << 14) | x[u]; - acc_len += 14; - while (acc_len >= 8) { - acc_len -= 8; - *buf ++ = (uint8_t)(acc >> acc_len); - } - } - if (acc_len > 0) { - *buf = (uint8_t)(acc << (8 - acc_len)); - } - return out_len; -} - -/* see inner.h */ -size_t -Zf(modq_decode)( - uint16_t *x, unsigned logn, - const void *in, size_t max_in_len) -{ - size_t n, in_len, u; - const uint8_t *buf; - uint32_t acc; - int acc_len; - - n = (size_t)1 << logn; - in_len = ((n * 14) + 7) >> 3; - if (in_len > max_in_len) { - return 0; - } - buf = in; - acc = 0; - acc_len = 0; - u = 0; - while (u < n) { - acc = (acc << 8) | (*buf ++); - acc_len += 8; - if (acc_len >= 14) { - unsigned w; - - acc_len -= 14; - w = (acc >> acc_len) & 0x3FFF; - if (w >= 12289) { - return 0; - } - x[u ++] = (uint16_t)w; - } - } - if ((acc & (((uint32_t)1 << acc_len) - 1)) != 0) { - return 0; - } - return in_len; -} - -/* see inner.h */ -size_t -Zf(trim_i16_encode)( - void *out, size_t max_out_len, - const int16_t *x, unsigned logn, unsigned bits) -{ - size_t n, u, out_len; - int minv, maxv; - uint8_t *buf; - uint32_t acc, mask; - unsigned acc_len; - - n = (size_t)1 << logn; - maxv = (1 << (bits - 1)) - 1; - minv = -maxv; - for (u = 0; u < n; u ++) { - if (x[u] < minv || x[u] > maxv) { - return 0; - } - } - out_len = ((n * bits) + 7) >> 3; - if (out == NULL) { - return out_len; - } - if (out_len > max_out_len) { 
- return 0; - } - buf = out; - acc = 0; - acc_len = 0; - mask = ((uint32_t)1 << bits) - 1; - for (u = 0; u < n; u ++) { - acc = (acc << bits) | ((uint16_t)x[u] & mask); - acc_len += bits; - while (acc_len >= 8) { - acc_len -= 8; - *buf ++ = (uint8_t)(acc >> acc_len); - } - } - if (acc_len > 0) { - *buf ++ = (uint8_t)(acc << (8 - acc_len)); - } - return out_len; -} - -/* see inner.h */ -size_t -Zf(trim_i16_decode)( - int16_t *x, unsigned logn, unsigned bits, - const void *in, size_t max_in_len) -{ - size_t n, in_len; - const uint8_t *buf; - size_t u; - uint32_t acc, mask1, mask2; - unsigned acc_len; - - n = (size_t)1 << logn; - in_len = ((n * bits) + 7) >> 3; - if (in_len > max_in_len) { - return 0; - } - buf = in; - u = 0; - acc = 0; - acc_len = 0; - mask1 = ((uint32_t)1 << bits) - 1; - mask2 = (uint32_t)1 << (bits - 1); - while (u < n) { - acc = (acc << 8) | *buf ++; - acc_len += 8; - while (acc_len >= bits && u < n) { - uint32_t w; - - acc_len -= bits; - w = (acc >> acc_len) & mask1; - w |= -(w & mask2); - if (w == -mask2) { - /* - * The -2^(bits-1) value is forbidden. - */ - return 0; - } - w |= -(w & mask2); - x[u ++] = (int16_t)*(int32_t *)&w; - } - } - if ((acc & (((uint32_t)1 << acc_len) - 1)) != 0) { - /* - * Extra bits in the last byte must be zero. 
- */ - return 0; - } - return in_len; -} - -/* see inner.h */ -size_t -Zf(trim_i8_encode)( - void *out, size_t max_out_len, - const int8_t *x, unsigned logn, unsigned bits) -{ - size_t n, u, out_len; - int minv, maxv; - uint8_t *buf; - uint32_t acc, mask; - unsigned acc_len; - - n = (size_t)1 << logn; - maxv = (1 << (bits - 1)) - 1; - minv = -maxv; - for (u = 0; u < n; u ++) { - if (x[u] < minv || x[u] > maxv) { - return 0; - } - } - out_len = ((n * bits) + 7) >> 3; - if (out == NULL) { - return out_len; - } - if (out_len > max_out_len) { - return 0; - } - buf = out; - acc = 0; - acc_len = 0; - mask = ((uint32_t)1 << bits) - 1; - for (u = 0; u < n; u ++) { - acc = (acc << bits) | ((uint8_t)x[u] & mask); - acc_len += bits; - while (acc_len >= 8) { - acc_len -= 8; - *buf ++ = (uint8_t)(acc >> acc_len); - } - } - if (acc_len > 0) { - *buf ++ = (uint8_t)(acc << (8 - acc_len)); - } - return out_len; -} - -/* see inner.h */ -size_t -Zf(trim_i8_decode)( - int8_t *x, unsigned logn, unsigned bits, - const void *in, size_t max_in_len) -{ - size_t n, in_len; - const uint8_t *buf; - size_t u; - uint32_t acc, mask1, mask2; - unsigned acc_len; - - n = (size_t)1 << logn; - in_len = ((n * bits) + 7) >> 3; - if (in_len > max_in_len) { - return 0; - } - buf = in; - u = 0; - acc = 0; - acc_len = 0; - mask1 = ((uint32_t)1 << bits) - 1; - mask2 = (uint32_t)1 << (bits - 1); - while (u < n) { - acc = (acc << 8) | *buf ++; - acc_len += 8; - while (acc_len >= bits && u < n) { - uint32_t w; - - acc_len -= bits; - w = (acc >> acc_len) & mask1; - w |= -(w & mask2); - if (w == -mask2) { - /* - * The -2^(bits-1) value is forbidden. - */ - return 0; - } - x[u ++] = (int8_t)*(int32_t *)&w; - } - } - if ((acc & (((uint32_t)1 << acc_len) - 1)) != 0) { - /* - * Extra bits in the last byte must be zero. 
- */ - return 0; - } - return in_len; -} - -/* see inner.h */ -size_t -Zf(comp_encode)( - void *out, size_t max_out_len, - const int16_t *x, unsigned logn) -{ - uint8_t *buf; - size_t n, u, v; - uint32_t acc; - unsigned acc_len; - - n = (size_t)1 << logn; - buf = out; - - /* - * Make sure that all values are within the -2047..+2047 range. - */ - for (u = 0; u < n; u ++) { - if (x[u] < -2047 || x[u] > +2047) { - return 0; - } - } - - acc = 0; - acc_len = 0; - v = 0; - for (u = 0; u < n; u ++) { - int t; - unsigned w; - - /* - * Get sign and absolute value of next integer; push the - * sign bit. - */ - acc <<= 1; - t = x[u]; - if (t < 0) { - t = -t; - acc |= 1; - } - w = (unsigned)t; - - /* - * Push the low 7 bits of the absolute value. - */ - acc <<= 7; - acc |= w & 127u; - w >>= 7; - - /* - * We pushed exactly 8 bits. - */ - acc_len += 8; - - /* - * Push as many zeros as necessary, then a one. Since the - * absolute value is at most 2047, w can only range up to - * 15 at this point, thus we will add at most 16 bits - * here. With the 8 bits above and possibly up to 7 bits - * from previous iterations, we may go up to 31 bits, which - * will fit in the accumulator, which is an uint32_t. - */ - acc <<= (w + 1); - acc |= 1; - acc_len += w + 1; - - /* - * Produce all full bytes. - */ - while (acc_len >= 8) { - acc_len -= 8; - if (buf != NULL) { - if (v >= max_out_len) { - return 0; - } - buf[v] = (uint8_t)(acc >> acc_len); - } - v ++; - } - } - - /* - * Flush remaining bits (if any). 
- */ - if (acc_len > 0) { - if (buf != NULL) { - if (v >= max_out_len) { - return 0; - } - buf[v] = (uint8_t)(acc << (8 - acc_len)); - } - v ++; - } - - return v; -} - -/* see inner.h */ -size_t -Zf(comp_decode)( - int16_t *x, unsigned logn, - const void *in, size_t max_in_len) -{ - const uint8_t *buf; - size_t n, u, v; - uint32_t acc; - unsigned acc_len; - - n = (size_t)1 << logn; - buf = in; - acc = 0; - acc_len = 0; - v = 0; - for (u = 0; u < n; u ++) { - unsigned b, s, m; - - /* - * Get next eight bits: sign and low seven bits of the - * absolute value. - */ - if (v >= max_in_len) { - return 0; - } - acc = (acc << 8) | (uint32_t)buf[v ++]; - b = acc >> acc_len; - s = b & 128; - m = b & 127; - - /* - * Get next bits until a 1 is reached. - */ - for (;;) { - if (acc_len == 0) { - if (v >= max_in_len) { - return 0; - } - acc = (acc << 8) | (uint32_t)buf[v ++]; - acc_len = 8; - } - acc_len --; - if (((acc >> acc_len) & 1) != 0) { - break; - } - m += 128; - if (m > 2047) { - return 0; - } - } - x[u] = (int16_t)(s ? -(int)m : (int)m); - } - return v; -} - -/* - * Key elements and signatures are polynomials with small integer - * coefficients. Here are some statistics gathered over many - * generated key pairs (10000 or more for each degree): - * - * log(n) n max(f,g) std(f,g) max(F,G) std(F,G) - * 1 2 129 56.31 143 60.02 - * 2 4 123 40.93 160 46.52 - * 3 8 97 28.97 159 38.01 - * 4 16 100 21.48 154 32.50 - * 5 32 71 15.41 151 29.36 - * 6 64 59 11.07 138 27.77 - * 7 128 39 7.91 144 27.00 - * 8 256 32 5.63 148 26.61 - * 9 512 22 4.00 137 26.46 - * 10 1024 15 2.84 146 26.41 - * - * We want a compact storage format for private key, and, as part of - * key generation, we are allowed to reject some keys which would - * otherwise be fine (this does not induce any noticeable vulnerability - * as long as we reject only a small proportion of possible keys). 
- * Hence, we enforce at key generation time maximum values for the - * elements of f, g, F and G, so that their encoding can be expressed - * in fixed-width values. Limits have been chosen so that generated - * keys are almost always within bounds, thus not impacting neither - * security or performance. - * - * IMPORTANT: the code assumes that all coefficients of f, g, F and G - * ultimately fit in the -127..+127 range. Thus, none of the elements - * of max_fg_bits[] and max_FG_bits[] shall be greater than 8. - */ - -const uint8_t Zf(max_fg_bits)[] = { - 0, /* unused */ - 8, - 8, - 8, - 8, - 8, - 7, - 7, - 6, - 6, - 5 -}; - -const uint8_t Zf(max_FG_bits)[] = { - 0, /* unused */ - 8, - 8, - 8, - 8, - 8, - 8, - 8, - 8, - 8, - 8 -}; - -/* - * When generating a new key pair, we can always reject keys which - * feature an abnormally large coefficient. This can also be done for - * signatures, albeit with some care: in case the signature process is - * used in a derandomized setup (explicitly seeded with the message and - * private key), we have to follow the specification faithfully, and the - * specification only enforces a limit on the L2 norm of the signature - * vector. The limit on the L2 norm implies that the absolute value of - * a coefficient of the signature cannot be more than the following: - * - * log(n) n max sig coeff (theoretical) - * 1 2 412 - * 2 4 583 - * 3 8 824 - * 4 16 1166 - * 5 32 1649 - * 6 64 2332 - * 7 128 3299 - * 8 256 4665 - * 9 512 6598 - * 10 1024 9331 - * - * However, the largest observed signature coefficients during our - * experiments was 1077 (in absolute value), hence we can assume that, - * with overwhelming probability, signature coefficients will fit - * in -2047..2047, i.e. 12 bits. 
- */ - -const uint8_t Zf(max_sig_bits)[] = { - 0, /* unused */ - 10, - 11, - 11, - 12, - 12, - 12, - 12, - 12, - 12, - 12 -}; diff --git a/crypto_sign/falcon-512/m4-ct/common.c b/crypto_sign/falcon-512/m4-ct/common.c deleted file mode 100644 index ef30028b..00000000 --- a/crypto_sign/falcon-512/m4-ct/common.c +++ /dev/null @@ -1,297 +0,0 @@ -/* - * Support functions for signatures (hash-to-point, norm). - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include "inner.h" - -/* see inner.h */ -void -Zf(hash_to_point_vartime)( - inner_shake256_context *sc, - uint16_t *x, unsigned logn) -{ - /* - * This is the straightforward per-the-spec implementation. 
It - * is not constant-time, thus it might reveal information on the - * plaintext (at least, enough to check the plaintext against a - * list of potential plaintexts) in a scenario where the - * attacker does not have access to the signature value or to - * the public key, but knows the nonce (without knowledge of the - * nonce, the hashed output cannot be matched against potential - * plaintexts). - */ - size_t n; - - n = (size_t)1 << logn; - while (n > 0) { - uint8_t buf[2]; - uint32_t w; - - inner_shake256_extract(sc, (void *)buf, sizeof buf); - w = ((unsigned)buf[0] << 8) | (unsigned)buf[1]; - if (w < 61445) { - while (w >= 12289) { - w -= 12289; - } - *x ++ = (uint16_t)w; - n --; - } - } -} - -/* see inner.h */ -void -Zf(hash_to_point_ct)( - inner_shake256_context *sc, - uint16_t *x, unsigned logn, uint8_t *tmp) -{ - /* - * Each 16-bit sample is a value in 0..65535. The value is - * kept if it falls in 0..61444 (because 61445 = 5*12289) - * and rejected otherwise; thus, each sample has probability - * about 0.93758 of being selected. - * - * We want to oversample enough to be sure that we will - * have enough values with probability at least 1 - 2^(-256). - * Depending on degree N, this leads to the following - * required oversampling: - * - * logn n oversampling - * 1 2 65 - * 2 4 67 - * 3 8 71 - * 4 16 77 - * 5 32 86 - * 6 64 100 - * 7 128 122 - * 8 256 154 - * 9 512 205 - * 10 1024 287 - * - * If logn >= 7, then the provided temporary buffer is large - * enough. Otherwise, we use a stack buffer of 63 entries - * (i.e. 126 bytes) for the values that do not fit in tmp[]. - */ - - static const uint16_t overtab[] = { - 0, /* unused */ - 65, - 67, - 71, - 77, - 86, - 100, - 122, - 154, - 205, - 287 - }; - - unsigned n, n2, u, m, p, over; - uint16_t *tt1, tt2[63]; - - /* - * We first generate m 16-bit value. Values 0..n-1 go to x[]. - * Values n..2*n-1 go to tt1[]. Values 2*n and later go to tt2[]. 
- * We also reduce modulo q the values; rejected values are set - * to 0xFFFF. - */ - n = 1U << logn; - n2 = n << 1; - over = overtab[logn]; - m = n + over; - tt1 = (uint16_t *)tmp; - for (u = 0; u < m; u ++) { - uint8_t buf[2]; - uint32_t w, wr; - - inner_shake256_extract(sc, buf, sizeof buf); - w = ((uint32_t)buf[0] << 8) | (uint32_t)buf[1]; - wr = w - ((uint32_t)24578 & (((w - 24578) >> 31) - 1)); - wr = wr - ((uint32_t)24578 & (((wr - 24578) >> 31) - 1)); - wr = wr - ((uint32_t)12289 & (((wr - 12289) >> 31) - 1)); - wr |= ((w - 61445) >> 31) - 1; - if (u < n) { - x[u] = (uint16_t)wr; - } else if (u < n2) { - tt1[u - n] = (uint16_t)wr; - } else { - tt2[u - n2] = (uint16_t)wr; - } - } - - /* - * Now we must "squeeze out" the invalid values. We do this in - * a logarithmic sequence of passes; each pass computes where a - * value should go, and moves it down by 'p' slots if necessary, - * where 'p' uses an increasing powers-of-two scale. It can be - * shown that in all cases where the loop decides that a value - * has to be moved down by p slots, the destination slot is - * "free" (i.e. contains an invalid value). - */ - for (p = 1; p <= over; p <<= 1) { - unsigned v; - - /* - * In the loop below: - * - * - v contains the index of the final destination of - * the value; it is recomputed dynamically based on - * whether values are valid or not. - * - * - u is the index of the value we consider ("source"); - * its address is s. - * - * - The loop may swap the value with the one at index - * u-p. The address of the swap destination is d. - */ - v = 0; - for (u = 0; u < m; u ++) { - uint16_t *s, *d; - unsigned j, sv, dv, mk; - - if (u < n) { - s = &x[u]; - } else if (u < n2) { - s = &tt1[u - n]; - } else { - s = &tt2[u - n2]; - } - sv = *s; - - /* - * The value in sv should ultimately go to - * address v, i.e. jump back by u-v slots. - */ - j = u - v; - - /* - * We increment v for the next iteration, but - * only if the source value is valid. 
The mask - * 'mk' is -1 if the value is valid, 0 otherwise, - * so we _subtract_ mk. - */ - mk = (sv >> 15) - 1U; - v -= mk; - - /* - * In this loop we consider jumps by p slots; if - * u < p then there is nothing more to do. - */ - if (u < p) { - continue; - } - - /* - * Destination for the swap: value at address u-p. - */ - if ((u - p) < n) { - d = &x[u - p]; - } else if ((u - p) < n2) { - d = &tt1[(u - p) - n]; - } else { - d = &tt2[(u - p) - n2]; - } - dv = *d; - - /* - * The swap should be performed only if the source - * is valid AND the jump j has its 'p' bit set. - */ - mk &= -(((j & p) + 0x1FF) >> 9); - - *s = (uint16_t)(sv ^ (mk & (sv ^ dv))); - *d = (uint16_t)(dv ^ (mk & (sv ^ dv))); - } - } -} - -/* see inner.h */ -int -Zf(is_short)( - const int16_t *s1, const int16_t *s2, unsigned logn) -{ - /* - * We use the l2-norm. Code below uses only 32-bit operations to - * compute the square of the norm with saturation to 2^32-1 if - * the value exceeds 2^31-1. - */ - size_t n, u; - uint32_t s, ng; - - n = (size_t)1 << logn; - s = 0; - ng = 0; - for (u = 0; u < n; u ++) { - int32_t z; - - z = s1[u]; - s += (uint32_t)(z * z); - ng |= s; - z = s2[u]; - s += (uint32_t)(z * z); - ng |= s; - } - s |= -(ng >> 31); - - /* - * Acceptance bound on the l2-norm is: - * 1.2*1.55*sqrt(q)*sqrt(2*N) - * Value 7085 is floor((1.2^2)*(1.55^2)*2*1024). - */ - return s < (((uint32_t)7085 * (uint32_t)12289) >> (10 - logn)); -} - -/* see inner.h */ -int -Zf(is_short_half)( - uint32_t sqn, const int16_t *s2, unsigned logn) -{ - size_t n, u; - uint32_t ng; - - n = (size_t)1 << logn; - ng = -(sqn >> 31); - for (u = 0; u < n; u ++) { - int32_t z; - - z = s2[u]; - sqn += (uint32_t)(z * z); - ng |= sqn; - } - sqn |= -(ng >> 31); - - /* - * Acceptance bound on the l2-norm is: - * 1.2*1.55*sqrt(q)*sqrt(2*N) - * Value 7085 is floor((1.2^2)*(1.55^2)*2*1024). 
- */ - return sqn < (((uint32_t)7085 * (uint32_t)12289) >> (10 - logn)); -} diff --git a/crypto_sign/falcon-512/m4-ct/config.h b/crypto_sign/falcon-512/m4-ct/config.h deleted file mode 100644 index cd78727e..00000000 --- a/crypto_sign/falcon-512/m4-ct/config.h +++ /dev/null @@ -1,205 +0,0 @@ -/* - * Manual configuration file for the Falcon implementation. Here can - * be set some compilation-time options. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#ifndef FALCON_CONFIG_H__ -#define FALCON_CONFIG_H__ - -/* - * Each option is a macro which should be defined to either 1 or 0. 
- * If any of the options below is left undefined, then a default value - * will be used by the code, possibly using compile-time autodetection - * from compiler-defined macros. - * - * Explicitly setting a parameter can be done by uncommenting/modifying - * its definition below, in this file, or equivalently by setting it as - * a compiler flag. - */ - -/* - * Use the native 'double' C type for floating-point computations. Exact - * reproducibility of all tests requires that type to faithfully follow - * IEEE-754 "round-to-nearest" rules. - * - * Native double support will use the CPU hardware and/or - * compiler-provided functions; the latter is typically NOT - * constant-time, while the former MAY be constant-time, or not. On - * recent x86 CPU in 64-bit mode, SSE2 opcodes are used and they provide - * constant-time operations for all the operations used in Falcon, - * except for some special cases of divisions and square roots, but it - * can be shown that theses cases imply only negligible leak of - * information that cannot be leveraged into a full attack. - * - * If neither FALCON_FPNATIVE nor FALCON_FPEMU is defined, then use of - * the native 'double' C type is the default behaviour unless - * FALCON_ASM_CORTEXM4 is defined to 1, in which case the emulated code - * will be used. - * -#define FALCON_FPNATIVE 1 - */ - -/* - * Use emulated floating-point implementation. - * - * Emulation uses only integer operations with uint32_t and uint64_t - * types. This is constant-time, provided that the underlying platform - * offers constant-time opcodes for the following operations: - * - * - Multiplication of two 32-bit unsigned integers into a 64-bit result. - * - Left-shift or right-shift of a 32-bit unsigned integer by a - * potentially secret shift count in the 0..31 range. - * - * Notably, the ARM Cortex M3 does not fulfill the first condition, - * while the Pentium IV does not fulfill the second. 
- * - * If neither FALCON_FPNATIVE nor FALCON_FPEMU is defined, then use of - * the native 'double' C type is the default behaviour unless - * FALCON_ASM_CORTEXM4 is defined to 1, in which case the emulated code - * will be used. - * -#define FALCON_FPEMU 1 - */ - -/* - * Enable use of assembly for ARM Cortex-M4 CPU. By default, such - * support will be used based on some autodection on the compiler - * version and target architecture. Define this variable to 1 to force - * use of the assembly code, or 0 to disable it regardless of the - * autodetection. - * - * When FALCON_ASM_CORTEXM4 is enabled (whether defined explicitly or - * autodetected), emulated floating-point code will be used, unless - * FALCON_FPNATIVE or FALCON_FPEMU is explicitly set to override the - * choice. Emulated code with ARM assembly is constant-time and provides - * better performance than emulated code with plain C. - * - * The assembly code for the M4 can also work on a Cortex-M3. If the - * compiler is instructed to target the M3 (e.g. '-mcpu=cortex-m3' with - * GCC) then FALCON_ASM_CORTEXM4 won't be autodetected, but it can be - * enabled explicitly. Take care, though, that the M3 multiplication - * opcode (multiplication of two 32-bit unsigned integers with a 64-bit - * result) is NOT constant-time. - * -#define FALCON_ASM_CORTEXM4 1 - */ - -#define FALCON_ASM_CORTEXM4 1 - -/* - * Enable use of AVX2 intrinsics. If enabled, then the code will compile - * only when targeting x86 with a compiler that supports AVX2 intrinsics - * (tested with GCC 7.4.0, Clang 6.0.0, and MSVC 2015, both in 32-bit - * and 64-bit modes), and run only on systems that offer the AVX2 - * opcodes. Some operations leverage AVX2 for better performance. - * -#define FALCON_AVX2 1 - */ - -/* - * Enable use of FMA intrinsics. This setting has any effect only if - * FALCON_AVX2 is also enabled. The FMA intrinsics are normally available - * on any x86 CPU that also has AVX2. 
Note that setting this option will - * slightly modify the values of expanded private keys, but will normally - * not change the values of non-expanded private keys, public keys or - * signatures, for a given keygen/sign seed (non-expanded private keys - * and signatures might theoretically change, but only with low probability, - * less than 2^(-40); produced signatures are still safe and interoperable). - * -#define FALCON_FMA 1 - */ - -/* - * Assert that the platform uses little-endian encoding. If enabled, - * then encoding and decoding of aligned multibyte values will be - * slightly faster (especially for hashing and random number - * generation). If not defined explicitly, then autodetection is - * applied. - * -#define FALCON_LE 1 - */ - -/* - * Assert that the platform tolerates accesses to unaligned multibyte - * values. If enabled, then some operations are slightly faster. Note - * that ARM Cortex M4 do _not_ fully tolerate unaligned accesses; for - * such systems, this option should not be enabled. If not defined - * explicitly, then autodetection is applied. - * -#define FALCON_UNALIGNED 1 - */ - -/* - * Use a PRNG based on ChaCha20 and seeded with SHAKE256, instead of - * SHAKE256 directly, for key pair generation purposes. This speeds up - * key pair generation, especially on platforms where SHAKE256 is - * comparatively slow: on the ARM Cortex M4, average key generation time - * is reduced by 19% with this setting; on a recent x86 Skylake, the - * reduction is smaller (less than 8%). - * - * However, this setting changes the private/public key pair obtained - * from a given seed, thus preventing reproducibility of the - * known-answer tests vectors. For compatibility with existing KAT - * vectors (e.g. in PQClean, pqm4 and NIST implementations), this - * setting is not enabled by default. - * -#define FALCON_KG_CHACHA20 1 - */ - -/* - * Use an explicit OS-provided source of randomness for seeding (for the - * Zf(get_seed)() function implementation). 
Three possible sources are - * defined: - * - * - getentropy() system call - * - /dev/urandom special file - * - CryptGenRandom() function call - * - * More than one source may be enabled, in which case they will be tried - * in the order above, until a success is reached. - * - * By default, sources are enabled at compile-time based on these - * conditions: - * - * - getentropy(): target is one of: Linux with Glibc-2.25+, FreeBSD 12+, - * or OpenBSD. - * - /dev/urandom: target is a Unix-like system (including Linux, - * FreeBSD, NetBSD, OpenBSD, DragonFly, macOS, Android, Solaris, AIX). - * - CryptGenRandom(): target is Windows (Win32 or Win64). - * - * On most small embedded systems, none will be enabled and Zf(get_seed)() - * will always return 0. Applications will need to provide their own seeds. - * -#define FALCON_RAND_GETENTROPY 1 -#define FALCON_RAND_URANDOM 1 -#define FALCON_RAND_WIN32 1 - */ - -#endif diff --git a/crypto_sign/falcon-512/m4-ct/fft.c b/crypto_sign/falcon-512/m4-ct/fft.c deleted file mode 100644 index b1904b24..00000000 --- a/crypto_sign/falcon-512/m4-ct/fft.c +++ /dev/null @@ -1,1412 +0,0 @@ -/* - * FFT code. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include "inner.h" - -/* - * Rules for complex number macros: - * -------------------------------- - * - * Operand order is: destination, source1, source2... - * - * Each operand is a real and an imaginary part. - * - * All overlaps are allowed. - */ - -/* - * Addition of two complex numbers (d = a + b). - */ -#define FPC_ADD(d_re, d_im, a_re, a_im, b_re, b_im) do { \ - fpr fpct_re, fpct_im; \ - fpct_re = fpr_add(a_re, b_re); \ - fpct_im = fpr_add(a_im, b_im); \ - (d_re) = fpct_re; \ - (d_im) = fpct_im; \ - } while (0) - -/* - * Subtraction of two complex numbers (d = a - b). - */ -#define FPC_SUB(d_re, d_im, a_re, a_im, b_re, b_im) do { \ - fpr fpct_re, fpct_im; \ - fpct_re = fpr_sub(a_re, b_re); \ - fpct_im = fpr_sub(a_im, b_im); \ - (d_re) = fpct_re; \ - (d_im) = fpct_im; \ - } while (0) - -/* - * Multplication of two complex numbers (d = a * b). 
- */ -#define FPC_MUL(d_re, d_im, a_re, a_im, b_re, b_im) do { \ - fpr fpct_a_re, fpct_a_im; \ - fpr fpct_b_re, fpct_b_im; \ - fpr fpct_d_re, fpct_d_im; \ - fpct_a_re = (a_re); \ - fpct_a_im = (a_im); \ - fpct_b_re = (b_re); \ - fpct_b_im = (b_im); \ - fpct_d_re = fpr_sub( \ - fpr_mul(fpct_a_re, fpct_b_re), \ - fpr_mul(fpct_a_im, fpct_b_im)); \ - fpct_d_im = fpr_add( \ - fpr_mul(fpct_a_re, fpct_b_im), \ - fpr_mul(fpct_a_im, fpct_b_re)); \ - (d_re) = fpct_d_re; \ - (d_im) = fpct_d_im; \ - } while (0) - -/* - * Squaring of a complex number (d = a * a). - */ -#define FPC_SQR(d_re, d_im, a_re, a_im) do { \ - fpr fpct_a_re, fpct_a_im; \ - fpr fpct_d_re, fpct_d_im; \ - fpct_a_re = (a_re); \ - fpct_a_im = (a_im); \ - fpct_d_re = fpr_sub(fpr_sqr(fpct_a_re), fpr_sqr(fpct_a_im)); \ - fpct_d_im = fpr_double(fpr_mul(fpct_a_re, fpct_a_im)); \ - (d_re) = fpct_d_re; \ - (d_im) = fpct_d_im; \ - } while (0) - -/* - * Inversion of a complex number (d = 1 / a). - */ -#define FPC_INV(d_re, d_im, a_re, a_im) do { \ - fpr fpct_a_re, fpct_a_im; \ - fpr fpct_d_re, fpct_d_im; \ - fpr fpct_m; \ - fpct_a_re = (a_re); \ - fpct_a_im = (a_im); \ - fpct_m = fpr_add(fpr_sqr(fpct_a_re), fpr_sqr(fpct_a_im)); \ - fpct_m = fpr_inv(fpct_m); \ - fpct_d_re = fpr_mul(fpct_a_re, fpct_m); \ - fpct_d_im = fpr_mul(fpr_neg(fpct_a_im), fpct_m); \ - (d_re) = fpct_d_re; \ - (d_im) = fpct_d_im; \ - } while (0) - -/* - * Division of complex numbers (d = a / b). 
- */ -#define FPC_DIV(d_re, d_im, a_re, a_im, b_re, b_im) do { \ - fpr fpct_a_re, fpct_a_im; \ - fpr fpct_b_re, fpct_b_im; \ - fpr fpct_d_re, fpct_d_im; \ - fpr fpct_m; \ - fpct_a_re = (a_re); \ - fpct_a_im = (a_im); \ - fpct_b_re = (b_re); \ - fpct_b_im = (b_im); \ - fpct_m = fpr_add(fpr_sqr(fpct_b_re), fpr_sqr(fpct_b_im)); \ - fpct_m = fpr_inv(fpct_m); \ - fpct_b_re = fpr_mul(fpct_b_re, fpct_m); \ - fpct_b_im = fpr_mul(fpr_neg(fpct_b_im), fpct_m); \ - fpct_d_re = fpr_sub( \ - fpr_mul(fpct_a_re, fpct_b_re), \ - fpr_mul(fpct_a_im, fpct_b_im)); \ - fpct_d_im = fpr_add( \ - fpr_mul(fpct_a_re, fpct_b_im), \ - fpr_mul(fpct_a_im, fpct_b_re)); \ - (d_re) = fpct_d_re; \ - (d_im) = fpct_d_im; \ - } while (0) - -/* - * Let w = exp(i*pi/N); w is a primitive 2N-th root of 1. We define the - * values w_j = w^(2j+1) for all j from 0 to N-1: these are the roots - * of X^N+1 in the field of complex numbers. A crucial property is that - * w_{N-1-j} = conj(w_j) = 1/w_j for all j. - * - * FFT representation of a polynomial f (taken modulo X^N+1) is the - * set of values f(w_j). Since f is real, conj(f(w_j)) = f(conj(w_j)), - * thus f(w_{N-1-j}) = conj(f(w_j)). We thus store only half the values, - * for j = 0 to N/2-1; the other half can be recomputed easily when (if) - * needed. A consequence is that FFT representation has the same size - * as normal representation: N/2 complex numbers use N real numbers (each - * complex number is the combination of a real and an imaginary part). - * - * We use a specific ordering which makes computations easier. Let rev() - * be the bit-reversal function over log(N) bits. For j in 0..N/2-1, we - * store the real and imaginary parts of f(w_j) in slots: - * - * Re(f(w_j)) -> slot rev(j)/2 - * Im(f(w_j)) -> slot rev(j)/2+N/2 - * - * (Note that rev(j) is even for j < N/2.) 
- */ - -/* see inner.h */ -TARGET_AVX2 -void -Zf(FFT)(fpr *f, unsigned logn) -{ - /* - * FFT algorithm in bit-reversal order uses the following - * iterative algorithm: - * - * t = N - * for m = 1; m < N; m *= 2: - * ht = t/2 - * for i1 = 0; i1 < m; i1 ++: - * j1 = i1 * t - * s = GM[m + i1] - * for j = j1; j < (j1 + ht); j ++: - * x = f[j] - * y = s * f[j + ht] - * f[j] = x + y - * f[j + ht] = x - y - * t = ht - * - * GM[k] contains w^rev(k) for primitive root w = exp(i*pi/N). - * - * In the description above, f[] is supposed to contain complex - * numbers. In our in-memory representation, the real and - * imaginary parts of f[k] are in array slots k and k+N/2. - * - * We only keep the first half of the complex numbers. We can - * see that after the first iteration, the first and second halves - * of the array of complex numbers have separate lives, so we - * simply ignore the second part. - */ - - unsigned u; - size_t t, n, hn, m; - - /* - * First iteration: compute f[j] + i * f[j+N/2] for all j < N/2 - * (because GM[1] = w^rev(1) = w^(N/2) = i). - * In our chosen representation, this is a no-op: everything is - * already where it should be. - */ - - /* - * Subsequent iterations are truncated to use only the first - * half of values. 
- */ - n = (size_t)1 << logn; - hn = n >> 1; - t = hn; - for (u = 1, m = 2; u < logn; u ++, m <<= 1) { - size_t ht, hm, i1, j1; - - ht = t >> 1; - hm = m >> 1; - for (i1 = 0, j1 = 0; i1 < hm; i1 ++, j1 += t) { - size_t j, j2; - - j2 = j1 + ht; -#if FALCON_AVX2 // yyyAVX2+1 - if (ht >= 4) { - __m256d s_re, s_im; - - s_re = _mm256_set1_pd( - fpr_gm_tab[((m + i1) << 1) + 0].v); - s_im = _mm256_set1_pd( - fpr_gm_tab[((m + i1) << 1) + 1].v); - for (j = j1; j < j2; j += 4) { - __m256d x_re, x_im, y_re, y_im; - __m256d z_re, z_im; - - x_re = _mm256_loadu_pd(&f[j].v); - x_im = _mm256_loadu_pd(&f[j + hn].v); - z_re = _mm256_loadu_pd(&f[j+ht].v); - z_im = _mm256_loadu_pd(&f[j+ht + hn].v); - y_re = FMSUB(z_re, s_re, - _mm256_mul_pd(z_im, s_im)); - y_im = FMADD(z_re, s_im, - _mm256_mul_pd(z_im, s_re)); - _mm256_storeu_pd(&f[j].v, - _mm256_add_pd(x_re, y_re)); - _mm256_storeu_pd(&f[j + hn].v, - _mm256_add_pd(x_im, y_im)); - _mm256_storeu_pd(&f[j + ht].v, - _mm256_sub_pd(x_re, y_re)); - _mm256_storeu_pd(&f[j + ht + hn].v, - _mm256_sub_pd(x_im, y_im)); - } - } else { - fpr s_re, s_im; - - s_re = fpr_gm_tab[((m + i1) << 1) + 0]; - s_im = fpr_gm_tab[((m + i1) << 1) + 1]; - for (j = j1; j < j2; j ++) { - fpr x_re, x_im, y_re, y_im; - - x_re = f[j]; - x_im = f[j + hn]; - y_re = f[j + ht]; - y_im = f[j + ht + hn]; - FPC_MUL(y_re, y_im, - y_re, y_im, s_re, s_im); - FPC_ADD(f[j], f[j + hn], - x_re, x_im, y_re, y_im); - FPC_SUB(f[j + ht], f[j + ht + hn], - x_re, x_im, y_re, y_im); - } - } -#else // yyyAVX2+0 - fpr s_re, s_im; - - s_re = fpr_gm_tab[((m + i1) << 1) + 0]; - s_im = fpr_gm_tab[((m + i1) << 1) + 1]; - for (j = j1; j < j2; j ++) { - fpr x_re, x_im, y_re, y_im; - - x_re = f[j]; - x_im = f[j + hn]; - y_re = f[j + ht]; - y_im = f[j + ht + hn]; - FPC_MUL(y_re, y_im, y_re, y_im, s_re, s_im); - FPC_ADD(f[j], f[j + hn], - x_re, x_im, y_re, y_im); - FPC_SUB(f[j + ht], f[j + ht + hn], - x_re, x_im, y_re, y_im); - } -#endif // yyyAVX2- - } - t = ht; - } -} - -/* see inner.h */ 
-TARGET_AVX2 -void -Zf(iFFT)(fpr *f, unsigned logn) -{ - /* - * Inverse FFT algorithm in bit-reversal order uses the following - * iterative algorithm: - * - * t = 1 - * for m = N; m > 1; m /= 2: - * hm = m/2 - * dt = t*2 - * for i1 = 0; i1 < hm; i1 ++: - * j1 = i1 * dt - * s = iGM[hm + i1] - * for j = j1; j < (j1 + t); j ++: - * x = f[j] - * y = f[j + t] - * f[j] = x + y - * f[j + t] = s * (x - y) - * t = dt - * for i1 = 0; i1 < N; i1 ++: - * f[i1] = f[i1] / N - * - * iGM[k] contains (1/w)^rev(k) for primitive root w = exp(i*pi/N) - * (actually, iGM[k] = 1/GM[k] = conj(GM[k])). - * - * In the main loop (not counting the final division loop), in - * all iterations except the last, the first and second half of f[] - * (as an array of complex numbers) are separate. In our chosen - * representation, we do not keep the second half. - * - * The last iteration recombines the recomputed half with the - * implicit half, and should yield only real numbers since the - * target polynomial is real; moreover, s = i at that step. - * Thus, when considering x and y: - * y = conj(x) since the final f[j] must be real - * Therefore, f[j] is filled with 2*Re(x), and f[j + t] is - * filled with 2*Im(x). - * But we already have Re(x) and Im(x) in array slots j and j+t - * in our chosen representation. That last iteration is thus a - * simple doubling of the values in all the array. - * - * We make the last iteration a no-op by tweaking the final - * division into a division by N/2, not N. 
- */ - size_t u, n, hn, t, m; - - n = (size_t)1 << logn; - t = 1; - m = n; - hn = n >> 1; - for (u = logn; u > 1; u --) { - size_t hm, dt, i1, j1; - - hm = m >> 1; - dt = t << 1; - for (i1 = 0, j1 = 0; j1 < hn; i1 ++, j1 += dt) { - size_t j, j2; - - j2 = j1 + t; -#if FALCON_AVX2 // yyyAVX2+1 - if (t >= 4) { - __m256d s_re, s_im; - - s_re = _mm256_set1_pd( - fpr_gm_tab[((hm + i1) << 1) + 0].v); - s_im = _mm256_set1_pd( - fpr_gm_tab[((hm + i1) << 1) + 1].v); - for (j = j1; j < j2; j += 4) { - __m256d x_re, x_im, y_re, y_im; - __m256d z_re, z_im; - - x_re = _mm256_loadu_pd(&f[j].v); - x_im = _mm256_loadu_pd(&f[j + hn].v); - y_re = _mm256_loadu_pd(&f[j+t].v); - y_im = _mm256_loadu_pd(&f[j+t + hn].v); - _mm256_storeu_pd(&f[j].v, - _mm256_add_pd(x_re, y_re)); - _mm256_storeu_pd(&f[j + hn].v, - _mm256_add_pd(x_im, y_im)); - x_re = _mm256_sub_pd(y_re, x_re); - x_im = _mm256_sub_pd(x_im, y_im); - z_re = FMSUB(x_im, s_im, - _mm256_mul_pd(x_re, s_re)); - z_im = FMADD(x_re, s_im, - _mm256_mul_pd(x_im, s_re)); - _mm256_storeu_pd(&f[j+t].v, z_re); - _mm256_storeu_pd(&f[j+t + hn].v, z_im); - } - } else { - fpr s_re, s_im; - - s_re = fpr_gm_tab[((hm + i1) << 1)+0]; - s_im = fpr_neg(fpr_gm_tab[((hm + i1) << 1)+1]); - for (j = j1; j < j2; j ++) { - fpr x_re, x_im, y_re, y_im; - - x_re = f[j]; - x_im = f[j + hn]; - y_re = f[j + t]; - y_im = f[j + t + hn]; - FPC_ADD(f[j], f[j + hn], - x_re, x_im, y_re, y_im); - FPC_SUB(x_re, x_im, - x_re, x_im, y_re, y_im); - FPC_MUL(f[j + t], f[j + t + hn], - x_re, x_im, s_re, s_im); - } - } -#else // yyyAVX2+0 - fpr s_re, s_im; - - s_re = fpr_gm_tab[((hm + i1) << 1) + 0]; - s_im = fpr_neg(fpr_gm_tab[((hm + i1) << 1) + 1]); - for (j = j1; j < j2; j ++) { - fpr x_re, x_im, y_re, y_im; - - x_re = f[j]; - x_im = f[j + hn]; - y_re = f[j + t]; - y_im = f[j + t + hn]; - FPC_ADD(f[j], f[j + hn], - x_re, x_im, y_re, y_im); - FPC_SUB(x_re, x_im, x_re, x_im, y_re, y_im); - FPC_MUL(f[j + t], f[j + t + hn], - x_re, x_im, s_re, s_im); - } -#endif // yyyAVX2- - } 
- t = dt; - m = hm; - } - - /* - * Last iteration is a no-op, provided that we divide by N/2 - * instead of N. We need to make a special case for logn = 0. - */ - if (logn > 0) { - fpr ni; - - ni = fpr_p2_tab[logn]; - for (u = 0; u < n; u ++) { - f[u] = fpr_mul(f[u], ni); - } - } -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_add)( - fpr *restrict a, const fpr *restrict b, unsigned logn) -{ - size_t n, u; - - n = (size_t)1 << logn; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 4) { - for (u = 0; u < n; u += 4) { - _mm256_storeu_pd(&a[u].v, - _mm256_add_pd( - _mm256_loadu_pd(&a[u].v), - _mm256_loadu_pd(&b[u].v))); - } - } else { - for (u = 0; u < n; u ++) { - a[u] = fpr_add(a[u], b[u]); - } - } -#else // yyyAVX2+0 - for (u = 0; u < n; u ++) { - a[u] = fpr_add(a[u], b[u]); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_sub)( - fpr *restrict a, const fpr *restrict b, unsigned logn) -{ - size_t n, u; - - n = (size_t)1 << logn; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 4) { - for (u = 0; u < n; u += 4) { - _mm256_storeu_pd(&a[u].v, - _mm256_sub_pd( - _mm256_loadu_pd(&a[u].v), - _mm256_loadu_pd(&b[u].v))); - } - } else { - for (u = 0; u < n; u ++) { - a[u] = fpr_sub(a[u], b[u]); - } - } -#else // yyyAVX2+0 - for (u = 0; u < n; u ++) { - a[u] = fpr_sub(a[u], b[u]); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_neg)(fpr *a, unsigned logn) -{ - size_t n, u; - - n = (size_t)1 << logn; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 4) { - __m256d s; - - s = _mm256_set1_pd(-0.0); - for (u = 0; u < n; u += 4) { - _mm256_storeu_pd(&a[u].v, - _mm256_xor_pd(_mm256_loadu_pd(&a[u].v), s)); - } - } else { - for (u = 0; u < n; u ++) { - a[u] = fpr_neg(a[u]); - } - } -#else // yyyAVX2+0 - for (u = 0; u < n; u ++) { - a[u] = fpr_neg(a[u]); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_adj_fft)(fpr *a, unsigned logn) -{ - size_t n, u; - - n = (size_t)1 << logn; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { 
- __m256d s; - - s = _mm256_set1_pd(-0.0); - for (u = (n >> 1); u < n; u += 4) { - _mm256_storeu_pd(&a[u].v, - _mm256_xor_pd(_mm256_loadu_pd(&a[u].v), s)); - } - } else { - for (u = (n >> 1); u < n; u ++) { - a[u] = fpr_neg(a[u]); - } - } -#else // yyyAVX2+0 - for (u = (n >> 1); u < n; u ++) { - a[u] = fpr_neg(a[u]); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_mul_fft)( - fpr *restrict a, const fpr *restrict b, unsigned logn) -{ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - for (u = 0; u < hn; u += 4) { - __m256d a_re, a_im, b_re, b_im, c_re, c_im; - - a_re = _mm256_loadu_pd(&a[u].v); - a_im = _mm256_loadu_pd(&a[u + hn].v); - b_re = _mm256_loadu_pd(&b[u].v); - b_im = _mm256_loadu_pd(&b[u + hn].v); - c_re = FMSUB( - a_re, b_re, _mm256_mul_pd(a_im, b_im)); - c_im = FMADD( - a_re, b_im, _mm256_mul_pd(a_im, b_re)); - _mm256_storeu_pd(&a[u].v, c_re); - _mm256_storeu_pd(&a[u + hn].v, c_im); - } - } else { - for (u = 0; u < hn; u ++) { - fpr a_re, a_im, b_re, b_im; - - a_re = a[u]; - a_im = a[u + hn]; - b_re = b[u]; - b_im = b[u + hn]; - FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im); - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - fpr a_re, a_im, b_re, b_im; - - a_re = a[u]; - a_im = a[u + hn]; - b_re = b[u]; - b_im = b[u + hn]; - FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_muladj_fft)( - fpr *restrict a, const fpr *restrict b, unsigned logn) -{ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - for (u = 0; u < hn; u += 4) { - __m256d a_re, a_im, b_re, b_im, c_re, c_im; - - a_re = _mm256_loadu_pd(&a[u].v); - a_im = _mm256_loadu_pd(&a[u + hn].v); - b_re = _mm256_loadu_pd(&b[u].v); - b_im = _mm256_loadu_pd(&b[u + hn].v); - c_re = FMADD( - a_re, b_re, _mm256_mul_pd(a_im, b_im)); - c_im = FMSUB( - a_im, b_re, _mm256_mul_pd(a_re, 
b_im)); - _mm256_storeu_pd(&a[u].v, c_re); - _mm256_storeu_pd(&a[u + hn].v, c_im); - } - } else { - for (u = 0; u < hn; u ++) { - fpr a_re, a_im, b_re, b_im; - - a_re = a[u]; - a_im = a[u + hn]; - b_re = b[u]; - b_im = fpr_neg(b[u + hn]); - FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im); - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - fpr a_re, a_im, b_re, b_im; - - a_re = a[u]; - a_im = a[u + hn]; - b_re = b[u]; - b_im = fpr_neg(b[u + hn]); - FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_mulselfadj_fft)(fpr *a, unsigned logn) -{ - /* - * Since each coefficient is multiplied with its own conjugate, - * the result contains only real values. - */ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - __m256d zero; - - zero = _mm256_setzero_pd(); - for (u = 0; u < hn; u += 4) { - __m256d a_re, a_im; - - a_re = _mm256_loadu_pd(&a[u].v); - a_im = _mm256_loadu_pd(&a[u + hn].v); - _mm256_storeu_pd(&a[u].v, - FMADD(a_re, a_re, - _mm256_mul_pd(a_im, a_im))); - _mm256_storeu_pd(&a[u + hn].v, zero); - } - } else { - for (u = 0; u < hn; u ++) { - fpr a_re, a_im; - - a_re = a[u]; - a_im = a[u + hn]; - a[u] = fpr_add(fpr_sqr(a_re), fpr_sqr(a_im)); - a[u + hn] = fpr_zero; - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - fpr a_re, a_im; - - a_re = a[u]; - a_im = a[u + hn]; - a[u] = fpr_add(fpr_sqr(a_re), fpr_sqr(a_im)); - a[u + hn] = fpr_zero; - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_mulconst)(fpr *a, fpr x, unsigned logn) -{ - size_t n, u; - - n = (size_t)1 << logn; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 4) { - __m256d x4; - - x4 = _mm256_set1_pd(x.v); - for (u = 0; u < n; u += 4) { - _mm256_storeu_pd(&a[u].v, - _mm256_mul_pd(x4, _mm256_loadu_pd(&a[u].v))); - } - } else { - for (u = 0; u < n; u ++) { - a[u] = fpr_mul(a[u], x); - } - } -#else // yyyAVX2+0 - for (u = 0; u < n; u ++) { - 
a[u] = fpr_mul(a[u], x); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_div_fft)( - fpr *restrict a, const fpr *restrict b, unsigned logn) -{ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - __m256d one; - - one = _mm256_set1_pd(1.0); - for (u = 0; u < hn; u += 4) { - __m256d a_re, a_im, b_re, b_im, c_re, c_im, t; - - a_re = _mm256_loadu_pd(&a[u].v); - a_im = _mm256_loadu_pd(&a[u + hn].v); - b_re = _mm256_loadu_pd(&b[u].v); - b_im = _mm256_loadu_pd(&b[u + hn].v); - t = _mm256_div_pd(one, - FMADD(b_re, b_re, - _mm256_mul_pd(b_im, b_im))); - b_re = _mm256_mul_pd(b_re, t); - b_im = _mm256_mul_pd(b_im, t); - c_re = FMADD( - a_re, b_re, _mm256_mul_pd(a_im, b_im)); - c_im = FMSUB( - a_im, b_re, _mm256_mul_pd(a_re, b_im)); - _mm256_storeu_pd(&a[u].v, c_re); - _mm256_storeu_pd(&a[u + hn].v, c_im); - } - } else { - for (u = 0; u < hn; u ++) { - fpr a_re, a_im, b_re, b_im; - - a_re = a[u]; - a_im = a[u + hn]; - b_re = b[u]; - b_im = b[u + hn]; - FPC_DIV(a[u], a[u + hn], a_re, a_im, b_re, b_im); - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - fpr a_re, a_im, b_re, b_im; - - a_re = a[u]; - a_im = a[u + hn]; - b_re = b[u]; - b_im = b[u + hn]; - FPC_DIV(a[u], a[u + hn], a_re, a_im, b_re, b_im); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_invnorm2_fft)(fpr *restrict d, - const fpr *restrict a, const fpr *restrict b, unsigned logn) -{ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - __m256d one; - - one = _mm256_set1_pd(1.0); - for (u = 0; u < hn; u += 4) { - __m256d a_re, a_im, b_re, b_im, dv; - - a_re = _mm256_loadu_pd(&a[u].v); - a_im = _mm256_loadu_pd(&a[u + hn].v); - b_re = _mm256_loadu_pd(&b[u].v); - b_im = _mm256_loadu_pd(&b[u + hn].v); - dv = _mm256_div_pd(one, - _mm256_add_pd( - FMADD(a_re, a_re, - _mm256_mul_pd(a_im, a_im)), - FMADD(b_re, b_re, - _mm256_mul_pd(b_im, b_im)))); - 
_mm256_storeu_pd(&d[u].v, dv); - } - } else { - for (u = 0; u < hn; u ++) { - fpr a_re, a_im; - fpr b_re, b_im; - - a_re = a[u]; - a_im = a[u + hn]; - b_re = b[u]; - b_im = b[u + hn]; - d[u] = fpr_inv(fpr_add( - fpr_add(fpr_sqr(a_re), fpr_sqr(a_im)), - fpr_add(fpr_sqr(b_re), fpr_sqr(b_im)))); - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - fpr a_re, a_im; - fpr b_re, b_im; - - a_re = a[u]; - a_im = a[u + hn]; - b_re = b[u]; - b_im = b[u + hn]; - d[u] = fpr_inv(fpr_add( - fpr_add(fpr_sqr(a_re), fpr_sqr(a_im)), - fpr_add(fpr_sqr(b_re), fpr_sqr(b_im)))); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_add_muladj_fft)(fpr *restrict d, - const fpr *restrict F, const fpr *restrict G, - const fpr *restrict f, const fpr *restrict g, unsigned logn) -{ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - for (u = 0; u < hn; u += 4) { - __m256d F_re, F_im, G_re, G_im; - __m256d f_re, f_im, g_re, g_im; - __m256d a_re, a_im, b_re, b_im; - - F_re = _mm256_loadu_pd(&F[u].v); - F_im = _mm256_loadu_pd(&F[u + hn].v); - G_re = _mm256_loadu_pd(&G[u].v); - G_im = _mm256_loadu_pd(&G[u + hn].v); - f_re = _mm256_loadu_pd(&f[u].v); - f_im = _mm256_loadu_pd(&f[u + hn].v); - g_re = _mm256_loadu_pd(&g[u].v); - g_im = _mm256_loadu_pd(&g[u + hn].v); - - a_re = FMADD(F_re, f_re, - _mm256_mul_pd(F_im, f_im)); - a_im = FMSUB(F_im, f_re, - _mm256_mul_pd(F_re, f_im)); - b_re = FMADD(G_re, g_re, - _mm256_mul_pd(G_im, g_im)); - b_im = FMSUB(G_im, g_re, - _mm256_mul_pd(G_re, g_im)); - _mm256_storeu_pd(&d[u].v, - _mm256_add_pd(a_re, b_re)); - _mm256_storeu_pd(&d[u + hn].v, - _mm256_add_pd(a_im, b_im)); - } - } else { - for (u = 0; u < hn; u ++) { - fpr F_re, F_im, G_re, G_im; - fpr f_re, f_im, g_re, g_im; - fpr a_re, a_im, b_re, b_im; - - F_re = F[u]; - F_im = F[u + hn]; - G_re = G[u]; - G_im = G[u + hn]; - f_re = f[u]; - f_im = f[u + hn]; - g_re = g[u]; - g_im = g[u + hn]; - - FPC_MUL(a_re, a_im, F_re, F_im, 
f_re, fpr_neg(f_im)); - FPC_MUL(b_re, b_im, G_re, G_im, g_re, fpr_neg(g_im)); - d[u] = fpr_add(a_re, b_re); - d[u + hn] = fpr_add(a_im, b_im); - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - fpr F_re, F_im, G_re, G_im; - fpr f_re, f_im, g_re, g_im; - fpr a_re, a_im, b_re, b_im; - - F_re = F[u]; - F_im = F[u + hn]; - G_re = G[u]; - G_im = G[u + hn]; - f_re = f[u]; - f_im = f[u + hn]; - g_re = g[u]; - g_im = g[u + hn]; - - FPC_MUL(a_re, a_im, F_re, F_im, f_re, fpr_neg(f_im)); - FPC_MUL(b_re, b_im, G_re, G_im, g_re, fpr_neg(g_im)); - d[u] = fpr_add(a_re, b_re); - d[u + hn] = fpr_add(a_im, b_im); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_mul_autoadj_fft)( - fpr *restrict a, const fpr *restrict b, unsigned logn) -{ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - for (u = 0; u < hn; u += 4) { - __m256d a_re, a_im, bv; - - a_re = _mm256_loadu_pd(&a[u].v); - a_im = _mm256_loadu_pd(&a[u + hn].v); - bv = _mm256_loadu_pd(&b[u].v); - _mm256_storeu_pd(&a[u].v, - _mm256_mul_pd(a_re, bv)); - _mm256_storeu_pd(&a[u + hn].v, - _mm256_mul_pd(a_im, bv)); - } - } else { - for (u = 0; u < hn; u ++) { - a[u] = fpr_mul(a[u], b[u]); - a[u + hn] = fpr_mul(a[u + hn], b[u]); - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - a[u] = fpr_mul(a[u], b[u]); - a[u + hn] = fpr_mul(a[u + hn], b[u]); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_div_autoadj_fft)( - fpr *restrict a, const fpr *restrict b, unsigned logn) -{ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - __m256d one; - - one = _mm256_set1_pd(1.0); - for (u = 0; u < hn; u += 4) { - __m256d ib, a_re, a_im; - - ib = _mm256_div_pd(one, _mm256_loadu_pd(&b[u].v)); - a_re = _mm256_loadu_pd(&a[u].v); - a_im = _mm256_loadu_pd(&a[u + hn].v); - _mm256_storeu_pd(&a[u].v, _mm256_mul_pd(a_re, ib)); - _mm256_storeu_pd(&a[u + hn].v, 
_mm256_mul_pd(a_im, ib)); - } - } else { - for (u = 0; u < hn; u ++) { - fpr ib; - - ib = fpr_inv(b[u]); - a[u] = fpr_mul(a[u], ib); - a[u + hn] = fpr_mul(a[u + hn], ib); - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - fpr ib; - - ib = fpr_inv(b[u]); - a[u] = fpr_mul(a[u], ib); - a[u + hn] = fpr_mul(a[u + hn], ib); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_LDL_fft)( - const fpr *restrict g00, - fpr *restrict g01, fpr *restrict g11, unsigned logn) -{ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - __m256d one; - - one = _mm256_set1_pd(1.0); - for (u = 0; u < hn; u += 4) { - __m256d g00_re, g00_im, g01_re, g01_im, g11_re, g11_im; - __m256d t, mu_re, mu_im, xi_re, xi_im; - - g00_re = _mm256_loadu_pd(&g00[u].v); - g00_im = _mm256_loadu_pd(&g00[u + hn].v); - g01_re = _mm256_loadu_pd(&g01[u].v); - g01_im = _mm256_loadu_pd(&g01[u + hn].v); - g11_re = _mm256_loadu_pd(&g11[u].v); - g11_im = _mm256_loadu_pd(&g11[u + hn].v); - - t = _mm256_div_pd(one, - FMADD(g00_re, g00_re, - _mm256_mul_pd(g00_im, g00_im))); - g00_re = _mm256_mul_pd(g00_re, t); - g00_im = _mm256_mul_pd(g00_im, t); - mu_re = FMADD(g01_re, g00_re, - _mm256_mul_pd(g01_im, g00_im)); - mu_im = FMSUB(g01_re, g00_im, - _mm256_mul_pd(g01_im, g00_re)); - xi_re = FMSUB(mu_re, g01_re, - _mm256_mul_pd(mu_im, g01_im)); - xi_im = FMADD(mu_im, g01_re, - _mm256_mul_pd(mu_re, g01_im)); - _mm256_storeu_pd(&g11[u].v, - _mm256_sub_pd(g11_re, xi_re)); - _mm256_storeu_pd(&g11[u + hn].v, - _mm256_add_pd(g11_im, xi_im)); - _mm256_storeu_pd(&g01[u].v, mu_re); - _mm256_storeu_pd(&g01[u + hn].v, mu_im); - } - } else { - for (u = 0; u < hn; u ++) { - fpr g00_re, g00_im, g01_re, g01_im, g11_re, g11_im; - fpr mu_re, mu_im; - - g00_re = g00[u]; - g00_im = g00[u + hn]; - g01_re = g01[u]; - g01_im = g01[u + hn]; - g11_re = g11[u]; - g11_im = g11[u + hn]; - FPC_DIV(mu_re, mu_im, g01_re, g01_im, g00_re, g00_im); - FPC_MUL(g01_re, g01_im, 
- mu_re, mu_im, g01_re, fpr_neg(g01_im)); - FPC_SUB(g11[u], g11[u + hn], - g11_re, g11_im, g01_re, g01_im); - g01[u] = mu_re; - g01[u + hn] = fpr_neg(mu_im); - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - fpr g00_re, g00_im, g01_re, g01_im, g11_re, g11_im; - fpr mu_re, mu_im; - - g00_re = g00[u]; - g00_im = g00[u + hn]; - g01_re = g01[u]; - g01_im = g01[u + hn]; - g11_re = g11[u]; - g11_im = g11[u + hn]; - FPC_DIV(mu_re, mu_im, g01_re, g01_im, g00_re, g00_im); - FPC_MUL(g01_re, g01_im, mu_re, mu_im, g01_re, fpr_neg(g01_im)); - FPC_SUB(g11[u], g11[u + hn], g11_re, g11_im, g01_re, g01_im); - g01[u] = mu_re; - g01[u + hn] = fpr_neg(mu_im); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_LDLmv_fft)( - fpr *restrict d11, fpr *restrict l10, - const fpr *restrict g00, const fpr *restrict g01, - const fpr *restrict g11, unsigned logn) -{ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - __m256d one; - - one = _mm256_set1_pd(1.0); - for (u = 0; u < hn; u += 4) { - __m256d g00_re, g00_im, g01_re, g01_im, g11_re, g11_im; - __m256d t, mu_re, mu_im, xi_re, xi_im; - - g00_re = _mm256_loadu_pd(&g00[u].v); - g00_im = _mm256_loadu_pd(&g00[u + hn].v); - g01_re = _mm256_loadu_pd(&g01[u].v); - g01_im = _mm256_loadu_pd(&g01[u + hn].v); - g11_re = _mm256_loadu_pd(&g11[u].v); - g11_im = _mm256_loadu_pd(&g11[u + hn].v); - - t = _mm256_div_pd(one, - FMADD(g00_re, g00_re, - _mm256_mul_pd(g00_im, g00_im))); - g00_re = _mm256_mul_pd(g00_re, t); - g00_im = _mm256_mul_pd(g00_im, t); - mu_re = FMADD(g01_re, g00_re, - _mm256_mul_pd(g01_im, g00_im)); - mu_im = FMSUB(g01_re, g00_im, - _mm256_mul_pd(g01_im, g00_re)); - xi_re = FMSUB(mu_re, g01_re, - _mm256_mul_pd(mu_im, g01_im)); - xi_im = FMADD(mu_im, g01_re, - _mm256_mul_pd(mu_re, g01_im)); - _mm256_storeu_pd(&d11[u].v, - _mm256_sub_pd(g11_re, xi_re)); - _mm256_storeu_pd(&d11[u + hn].v, - _mm256_add_pd(g11_im, xi_im)); - _mm256_storeu_pd(&l10[u].v, 
mu_re); - _mm256_storeu_pd(&l10[u + hn].v, mu_im); - } - } else { - for (u = 0; u < hn; u ++) { - fpr g00_re, g00_im, g01_re, g01_im, g11_re, g11_im; - fpr mu_re, mu_im; - - g00_re = g00[u]; - g00_im = g00[u + hn]; - g01_re = g01[u]; - g01_im = g01[u + hn]; - g11_re = g11[u]; - g11_im = g11[u + hn]; - FPC_DIV(mu_re, mu_im, g01_re, g01_im, g00_re, g00_im); - FPC_MUL(g01_re, g01_im, - mu_re, mu_im, g01_re, fpr_neg(g01_im)); - FPC_SUB(d11[u], d11[u + hn], - g11_re, g11_im, g01_re, g01_im); - l10[u] = mu_re; - l10[u + hn] = fpr_neg(mu_im); - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - fpr g00_re, g00_im, g01_re, g01_im, g11_re, g11_im; - fpr mu_re, mu_im; - - g00_re = g00[u]; - g00_im = g00[u + hn]; - g01_re = g01[u]; - g01_im = g01[u + hn]; - g11_re = g11[u]; - g11_im = g11[u + hn]; - FPC_DIV(mu_re, mu_im, g01_re, g01_im, g00_re, g00_im); - FPC_MUL(g01_re, g01_im, mu_re, mu_im, g01_re, fpr_neg(g01_im)); - FPC_SUB(d11[u], d11[u + hn], g11_re, g11_im, g01_re, g01_im); - l10[u] = mu_re; - l10[u + hn] = fpr_neg(mu_im); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_split_fft)( - fpr *restrict f0, fpr *restrict f1, - const fpr *restrict f, unsigned logn) -{ - /* - * The FFT representation we use is in bit-reversed order - * (element i contains f(w^(rev(i))), where rev() is the - * bit-reversal function over the ring degree. This changes - * indexes with regards to the Falcon specification. 
- */ - size_t n, hn, qn, u; - - n = (size_t)1 << logn; - hn = n >> 1; - qn = hn >> 1; - -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - __m256d half, sv; - - half = _mm256_set1_pd(0.5); - sv = _mm256_set_pd(-0.0, 0.0, -0.0, 0.0); - for (u = 0; u < qn; u += 2) { - __m256d ab_re, ab_im, ff0, ff1, ff2, ff3, gmt; - - ab_re = _mm256_loadu_pd(&f[(u << 1)].v); - ab_im = _mm256_loadu_pd(&f[(u << 1) + hn].v); - ff0 = _mm256_mul_pd(_mm256_hadd_pd(ab_re, ab_im), half); - ff0 = _mm256_permute4x64_pd(ff0, 0xD8); - _mm_storeu_pd(&f0[u].v, - _mm256_extractf128_pd(ff0, 0)); - _mm_storeu_pd(&f0[u + qn].v, - _mm256_extractf128_pd(ff0, 1)); - - ff1 = _mm256_mul_pd(_mm256_hsub_pd(ab_re, ab_im), half); - gmt = _mm256_loadu_pd(&fpr_gm_tab[(u + hn) << 1].v); - ff2 = _mm256_shuffle_pd(ff1, ff1, 0x5); - ff3 = _mm256_hadd_pd( - _mm256_mul_pd(ff1, gmt), - _mm256_xor_pd(_mm256_mul_pd(ff2, gmt), sv)); - ff3 = _mm256_permute4x64_pd(ff3, 0xD8); - _mm_storeu_pd(&f1[u].v, - _mm256_extractf128_pd(ff3, 0)); - _mm_storeu_pd(&f1[u + qn].v, - _mm256_extractf128_pd(ff3, 1)); - } - } else { - f0[0] = f[0]; - f1[0] = f[hn]; - - for (u = 0; u < qn; u ++) { - fpr a_re, a_im, b_re, b_im; - fpr t_re, t_im; - - a_re = f[(u << 1) + 0]; - a_im = f[(u << 1) + 0 + hn]; - b_re = f[(u << 1) + 1]; - b_im = f[(u << 1) + 1 + hn]; - - FPC_ADD(t_re, t_im, a_re, a_im, b_re, b_im); - f0[u] = fpr_half(t_re); - f0[u + qn] = fpr_half(t_im); - - FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im); - FPC_MUL(t_re, t_im, t_re, t_im, - fpr_gm_tab[((u + hn) << 1) + 0], - fpr_neg(fpr_gm_tab[((u + hn) << 1) + 1])); - f1[u] = fpr_half(t_re); - f1[u + qn] = fpr_half(t_im); - } - } -#else // yyyAVX2+0 - /* - * We process complex values by pairs. For logn = 1, there is only - * one complex value (the other one is the implicit conjugate), - * so we add the two lines below because the loop will be - * skipped. 
- */ - f0[0] = f[0]; - f1[0] = f[hn]; - - for (u = 0; u < qn; u ++) { - fpr a_re, a_im, b_re, b_im; - fpr t_re, t_im; - - a_re = f[(u << 1) + 0]; - a_im = f[(u << 1) + 0 + hn]; - b_re = f[(u << 1) + 1]; - b_im = f[(u << 1) + 1 + hn]; - - FPC_ADD(t_re, t_im, a_re, a_im, b_re, b_im); - f0[u] = fpr_half(t_re); - f0[u + qn] = fpr_half(t_im); - - FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im); - FPC_MUL(t_re, t_im, t_re, t_im, - fpr_gm_tab[((u + hn) << 1) + 0], - fpr_neg(fpr_gm_tab[((u + hn) << 1) + 1])); - f1[u] = fpr_half(t_re); - f1[u + qn] = fpr_half(t_im); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_merge_fft)( - fpr *restrict f, - const fpr *restrict f0, const fpr *restrict f1, unsigned logn) -{ - size_t n, hn, qn, u; - - n = (size_t)1 << logn; - hn = n >> 1; - qn = hn >> 1; - -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 16) { - for (u = 0; u < qn; u += 4) { - __m256d a_re, a_im, b_re, b_im, c_re, c_im; - __m256d gm1, gm2, g_re, g_im; - __m256d t_re, t_im, u_re, u_im; - __m256d tu1_re, tu2_re, tu1_im, tu2_im; - - a_re = _mm256_loadu_pd(&f0[u].v); - a_im = _mm256_loadu_pd(&f0[u + qn].v); - c_re = _mm256_loadu_pd(&f1[u].v); - c_im = _mm256_loadu_pd(&f1[u + qn].v); - - gm1 = _mm256_loadu_pd(&fpr_gm_tab[(u + hn) << 1].v); - gm2 = _mm256_loadu_pd(&fpr_gm_tab[(u + 2 + hn) << 1].v); - g_re = _mm256_unpacklo_pd(gm1, gm2); - g_im = _mm256_unpackhi_pd(gm1, gm2); - g_re = _mm256_permute4x64_pd(g_re, 0xD8); - g_im = _mm256_permute4x64_pd(g_im, 0xD8); - - b_re = FMSUB( - c_re, g_re, _mm256_mul_pd(c_im, g_im)); - b_im = FMADD( - c_re, g_im, _mm256_mul_pd(c_im, g_re)); - - t_re = _mm256_add_pd(a_re, b_re); - t_im = _mm256_add_pd(a_im, b_im); - u_re = _mm256_sub_pd(a_re, b_re); - u_im = _mm256_sub_pd(a_im, b_im); - - tu1_re = _mm256_unpacklo_pd(t_re, u_re); - tu2_re = _mm256_unpackhi_pd(t_re, u_re); - tu1_im = _mm256_unpacklo_pd(t_im, u_im); - tu2_im = _mm256_unpackhi_pd(t_im, u_im); - _mm256_storeu_pd(&f[(u << 1)].v, - _mm256_permute2f128_pd(tu1_re, 
tu2_re, 0x20)); - _mm256_storeu_pd(&f[(u << 1) + 4].v, - _mm256_permute2f128_pd(tu1_re, tu2_re, 0x31)); - _mm256_storeu_pd(&f[(u << 1) + hn].v, - _mm256_permute2f128_pd(tu1_im, tu2_im, 0x20)); - _mm256_storeu_pd(&f[(u << 1) + 4 + hn].v, - _mm256_permute2f128_pd(tu1_im, tu2_im, 0x31)); - } - } else { - f[0] = f0[0]; - f[hn] = f1[0]; - - for (u = 0; u < qn; u ++) { - fpr a_re, a_im, b_re, b_im; - fpr t_re, t_im; - - a_re = f0[u]; - a_im = f0[u + qn]; - FPC_MUL(b_re, b_im, f1[u], f1[u + qn], - fpr_gm_tab[((u + hn) << 1) + 0], - fpr_gm_tab[((u + hn) << 1) + 1]); - FPC_ADD(t_re, t_im, a_re, a_im, b_re, b_im); - f[(u << 1) + 0] = t_re; - f[(u << 1) + 0 + hn] = t_im; - FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im); - f[(u << 1) + 1] = t_re; - f[(u << 1) + 1 + hn] = t_im; - } - } -#else // yyyAVX2+0 - /* - * An extra copy to handle the special case logn = 1. - */ - f[0] = f0[0]; - f[hn] = f1[0]; - - for (u = 0; u < qn; u ++) { - fpr a_re, a_im, b_re, b_im; - fpr t_re, t_im; - - a_re = f0[u]; - a_im = f0[u + qn]; - FPC_MUL(b_re, b_im, f1[u], f1[u + qn], - fpr_gm_tab[((u + hn) << 1) + 0], - fpr_gm_tab[((u + hn) << 1) + 1]); - FPC_ADD(t_re, t_im, a_re, a_im, b_re, b_im); - f[(u << 1) + 0] = t_re; - f[(u << 1) + 0 + hn] = t_im; - FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im); - f[(u << 1) + 1] = t_re; - f[(u << 1) + 1 + hn] = t_im; - } -#endif // yyyAVX2- -} diff --git a/crypto_sign/falcon-512/m4-ct/fpr.c b/crypto_sign/falcon-512/m4-ct/fpr.c deleted file mode 100644 index eb23a44b..00000000 --- a/crypto_sign/falcon-512/m4-ct/fpr.c +++ /dev/null @@ -1,3460 +0,0 @@ -/* - * Floating-point operations. - * - * This file implements the non-inline functions declared in - * fpr.h, as well as the constants for FFT / iFFT. 
- * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include "inner.h" - -#if FALCON_FPEMU // yyyFPEMU+1 - -/* - * Normalize a provided unsigned integer to the 2^63..2^64-1 range by - * left-shifting it if necessary. The exponent e is adjusted accordingly - * (i.e. if the value was left-shifted by n bits, then n is subtracted - * from e). If source m is 0, then it remains 0, but e is altered. - * Both m and e must be simple variables (no expressions allowed). 
- */ -#define FPR_NORM64(m, e) do { \ - uint32_t nt; \ - \ - (e) -= 63; \ - \ - nt = (uint32_t)((m) >> 32); \ - nt = (nt | -nt) >> 31; \ - (m) ^= ((m) ^ ((m) << 32)) & ((uint64_t)nt - 1); \ - (e) += (int)(nt << 5); \ - \ - nt = (uint32_t)((m) >> 48); \ - nt = (nt | -nt) >> 31; \ - (m) ^= ((m) ^ ((m) << 16)) & ((uint64_t)nt - 1); \ - (e) += (int)(nt << 4); \ - \ - nt = (uint32_t)((m) >> 56); \ - nt = (nt | -nt) >> 31; \ - (m) ^= ((m) ^ ((m) << 8)) & ((uint64_t)nt - 1); \ - (e) += (int)(nt << 3); \ - \ - nt = (uint32_t)((m) >> 60); \ - nt = (nt | -nt) >> 31; \ - (m) ^= ((m) ^ ((m) << 4)) & ((uint64_t)nt - 1); \ - (e) += (int)(nt << 2); \ - \ - nt = (uint32_t)((m) >> 62); \ - nt = (nt | -nt) >> 31; \ - (m) ^= ((m) ^ ((m) << 2)) & ((uint64_t)nt - 1); \ - (e) += (int)(nt << 1); \ - \ - nt = (uint32_t)((m) >> 63); \ - (m) ^= ((m) ^ ((m) << 1)) & ((uint64_t)nt - 1); \ - (e) += (int)(nt); \ - } while (0) - -#if FALCON_ASM_CORTEXM4 // yyyASM_CORTEXM4+1 - -__attribute__((naked)) -fpr -fpr_scaled(int64_t i __attribute__((unused)), int sc __attribute__((unused))) -{ - __asm__ ( - "push { r4, r5, r6, lr }\n\t" - "\n\t" - "@ Input i is in r0:r1, and sc in r2.\n\t" - "@ Extract the sign bit, and compute the absolute value.\n\t" - "@ -> sign bit in r3, with value 0 or -1\n\t" - "asrs r3, r1, #31\n\t" - "eors r0, r3\n\t" - "eors r1, r3\n\t" - "subs r0, r3\n\t" - "sbcs r1, r3\n\t" - "\n\t" - "@ Scale exponent to account for the encoding; if the source is\n\t" - "@ zero or if the scaled exponent is negative, it is set to 32.\n\t" - "addw r2, r2, #1022\n\t" - "orrs r4, r0, r1\n\t" - "bics r4, r4, r2, asr #31\n\t" - "rsbs r5, r4, #0\n\t" - "orrs r4, r5\n\t" - "ands r2, r2, r4, asr #31\n\t" - "adds r2, #32\n\t" - "\n\t" - "@ Normalize value to a full 64-bit width, by shifting it left.\n\t" - "@ The shift count is subtracted from the exponent (in r2).\n\t" - "@ If the mantissa is 0, the exponent is set to 0.\n\t" - "\n\t" - "@ If top word is 0, replace with low word; otherwise, add 32 
to\n\t" - "@ the exponent.\n\t" - "rsbs r4, r1, #0\n\t" - "orrs r4, r1\n\t" - "eors r5, r0, r1\n\t" - "bics r5, r5, r4, asr #31\n\t" - "eors r1, r5\n\t" - "ands r0, r0, r4, asr #31\n\t" - "lsrs r4, r4, #31\n\t" - "adds r2, r2, r4, lsl #5\n\t" - "\n\t" - "@ Count leading zeros of r1 to finish the shift.\n\t" - "clz r4, r1\n\t" - "subs r2, r4\n\t" - "rsbs r5, r4, #32\n\t" - "lsls r1, r4\n\t" - "lsrs r5, r0, r5\n\t" - "lsls r0, r4\n\t" - "orrs r1, r5\n\t" - "\n\t" - "@ Clear the top bit; we know it's a 1 (unless the whole mantissa\n\t" - "@ was zero, but then it's still OK to clear it)\n\t" - "bfc r1, #31, #1\n\t" - "\n\t" - "@ Now shift right the value by 11 bits; this puts the value in\n\t" - "@ the 2^52..2^53-1 range. We also keep a copy of the pre-shift\n\t" - "@ low bits in r5.\n\t" - "movs r5, r0\n\t" - "lsrs r0, #11\n\t" - "orrs r0, r0, r1, lsl #21\n\t" - "lsrs r1, #11\n\t" - "\n\t" - "@ Also plug the exponent at the right place. This must be done\n\t" - "@ now so that, in case the rounding creates a carry, that carry\n\t" - "@ adds to the exponent, which would be exactly what we want at\n\t" - "@ that point.\n\t" - "orrs r1, r1, r2, lsl #20\n\t" - "\n\t" - "@ Rounding: we must add 1 to the mantissa in the following cases:\n\t" - "@ - bits 11 to 9 of r5 are '011', '110' or '111'\n\t" - "@ - bits 11 to 9 of r5 are '010' and one of the\n\t" - "@ bits 0 to 8 is non-zero\n\t" - "ubfx r6, r5, #0, #9\n\t" - "addw r6, r6, #511\n\t" - "orrs r5, r6\n\t" - "\n\t" - "ubfx r5, r5, #9, #3\n\t" - "movs r6, #0xC8\n\t" - "lsrs r6, r5\n\t" - "ands r6, #1\n\t" - "adds r0, r6\n\t" - "adcs r1, #0\n\t" - "\n\t" - "@ Put back the sign.\n\t" - "orrs r1, r1, r3, lsl #31\n\t" - "\n\t" - "pop { r4, r5, r6, pc}\n\t" - ); -} - -#else // yyyASM_CORTEXM4+0 - -fpr -fpr_scaled(int64_t i, int sc) -{ - /* - * To convert from int to float, we have to do the following: - * 1. Get the absolute value of the input, and its sign - * 2. Shift right or left the value as appropriate - * 3. 
Pack the result - * - * We can assume that the source integer is not -2^63. - */ - int s, e; - uint32_t t; - uint64_t m; - - /* - * Extract sign bit. - * We have: -i = 1 + ~i - */ - s = (int)((uint64_t)i >> 63); - i ^= -(int64_t)s; - i += s; - - /* - * For now we suppose that i != 0. - * Otherwise, we set m to i and left-shift it as much as needed - * to get a 1 in the top bit. We can do that in a logarithmic - * number of conditional shifts. - */ - m = (uint64_t)i; - e = 9 + sc; - FPR_NORM64(m, e); - - /* - * Now m is in the 2^63..2^64-1 range. We must divide it by 512; - * if one of the dropped bits is a 1, this should go into the - * "sticky bit". - */ - m |= ((uint32_t)m & 0x1FF) + 0x1FF; - m >>= 9; - - /* - * Corrective action: if i = 0 then all of the above was - * incorrect, and we clamp e and m down to zero. - */ - t = (uint32_t)((uint64_t)(i | -i) >> 63); - m &= -(uint64_t)t; - e &= -(int)t; - - /* - * Assemble back everything. The FPR() function will handle cases - * where e is too low. - */ - return FPR(s, e, m); -} - -#endif // yyyASM_CORTEXM4- - -#if FALCON_ASM_CORTEXM4 // yyyASM_CORTEXM4+1 - -// yyyPQCLEAN+0 -#if 0 -/* Debug code -- To get a printout of registers from a specific point - in ARM Cortex M4 assembly code, uncomment this code and add a - "bl DEBUG" call where wished for. 
*/ - -void -print_regs(uint32_t *rr, uint32_t flags) -{ - int i; - extern int printf(const char *fmt, ...); - - printf("\nRegs:\n"); - for (i = 0; i < 7; i ++) { - int j; - - j = i + 7; - printf(" %2d = %08X %2d = %08X\n", i, rr[i], j, rr[j]); - } - printf(" flags = %08X ", flags); - if ((flags >> 31) & 1) { - printf("N"); - } - if ((flags >> 30) & 1) { - printf("Z"); - } - if ((flags >> 29) & 1) { - printf("C"); - } - if ((flags >> 28) & 1) { - printf("V"); - } - if ((flags >> 27) & 1) { - printf("Q"); - } - printf("\n"); -} - -__attribute__((naked)) -void -DEBUG(void) -{ - __asm__ ( - "push { r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, lr }\n\t" - "mov r0, sp\n\t" - "mrs r1, apsr\n\t" - "bl print_regs\n\t" - "pop { r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, pc }\n\t" - ); -} -#endif -// yyyPQCLEAN- - -__attribute__((naked)) -fpr -fpr_add(fpr x __attribute__((unused)), fpr y __attribute__((unused))) -{ - __asm__ ( - "push { r4, r5, r6, r7, r8, r10, r11, lr }\n\t" - "\n\t" - "@ Make sure that the first operand (x) has the larger absolute\n\t" - "@ value. 
This guarantees that the exponent of y is less than\n\t" - "@ or equal to the exponent of x, and, if they are equal, then\n\t" - "@ the mantissa of y will not be greater than the mantissa of x.\n\t" - "@ However, if absolute values are equal and the sign of x is 1,\n\t" - "@ then we want to also swap the values.\n\t" - "ubfx r4, r1, #0, #31 @ top word without sign bit\n\t" - "ubfx r5, r3, #0, #31 @ top word without sign bit\n\t" - "subs r7, r0, r2 @ difference in r7:r4\n\t" - "sbcs r4, r5\n\t" - "orrs r7, r4\n\t" - "rsbs r5, r7, #0\n\t" - "orrs r7, r5 @ bit 31 of r7 is 0 iff difference is zero\n\t" - "bics r6, r1, r7\n\t" - "orrs r6, r4 @ bit 31 of r6 is 1 iff the swap must be done\n\t" - "\n\t" - "@ Conditional swap\n\t" - "eors r4, r0, r2\n\t" - "eors r5, r1, r3\n\t" - "ands r4, r4, r6, asr #31\n\t" - "ands r5, r5, r6, asr #31\n\t" - "eors r0, r4\n\t" - "eors r1, r5\n\t" - "eors r2, r4\n\t" - "eors r3, r5\n\t" - "\n\t" - "@ Extract mantissa of x into r0:r1, exponent in r4, sign in r5\n\t" - "ubfx r4, r1, #20, #11 @ Exponent in r4 (without sign)\n\t" - "addw r5, r4, #2047 @ Get a carry to test r4 for zero\n\t" - "lsrs r5, #11 @ r5 is the mantissa implicit high bit\n\t" - "bfc r1, #20, #11 @ Clear exponent bits (not the sign)\n\t" - "orrs r1, r1, r5, lsl #20 @ Set mantissa high bit\n\t" - "asrs r5, r1, #31 @ Get sign bit (sign-extended)\n\t" - "bfc r1, #31, #1 @ Clear the sign bit\n\t" - "\n\t" - "@ Extract mantissa of y into r2:r3, exponent in r6, sign in r7\n\t" - "ubfx r6, r3, #20, #11 @ Exponent in r6 (without sign)\n\t" - "addw r7, r6, #2047 @ Get a carry to test r6 for zero\n\t" - "lsrs r7, #11 @ r7 is the mantissa implicit high bit\n\t" - "bfc r3, #20, #11 @ Clear exponent bits (not the sign)\n\t" - "orrs r3, r3, r7, lsl #20 @ Set mantissa high bit\n\t" - "asrs r7, r3, #31 @ Get sign bit (sign-extended)\n\t" - "bfc r3, #31, #1 @ Clear the sign bit\n\t" - "\n\t" - "@ Scale mantissas up by three bits.\n\t" - "lsls r1, #3\n\t" - "orrs r1, r1, r0, lsr #29\n\t" - 
"lsls r0, #3\n\t" - "lsls r3, #3\n\t" - "orrs r3, r3, r2, lsr #29\n\t" - "lsls r2, #3\n\t" - "\n\t" - "@ x: exponent=r4, sign=r5, mantissa=r0:r1 (scaled up 3 bits)\n\t" - "@ y: exponent=r6, sign=r7, mantissa=r2:r3 (scaled up 3 bits)\n\t" - "\n\t" - "@ At that point, the exponent of x (in r4) is larger than that\n\t" - "@ of y (in r6). The difference is the amount of shifting that\n\t" - "@ should be done on y. If that amount is larger than 59 then\n\t" - "@ we clamp y to 0. We won't need y's exponent beyond that point,\n\t" - "@ so we store that shift count in r6.\n\t" - "subs r6, r4, r6\n\t" - "subs r8, r6, #60\n\t" - "ands r2, r2, r8, asr #31\n\t" - "ands r3, r3, r8, asr #31\n\t" - "\n\t" - "@ Shift right r2:r3 by r6 bits. The shift count is in the 0..59\n\t" - "@ range. r11 will be non-zero if and only if some non-zero bits\n\t" - "@ were dropped.\n\t" - "subs r8, r6, #32\n\t" - "bics r11, r2, r8, asr #31\n\t" - "ands r2, r2, r8, asr #31\n\t" - "bics r10, r3, r8, asr #31\n\t" - "orrs r2, r2, r10\n\t" - "ands r3, r3, r8, asr #31\n\t" - "ands r6, r6, #31\n\t" - "rsbs r8, r6, #32\n\t" - "lsls r10, r2, r8\n\t" - "orrs r11, r11, r10\n\t" - "lsrs r2, r2, r6\n\t" - "lsls r10, r3, r8\n\t" - "orrs r2, r2, r10\n\t" - "lsrs r3, r3, r6\n\t" - "\n\t" - "@ If r11 is non-zero then some non-zero bit was dropped and the\n\t" - "@ low bit of r2 must be forced to 1 ('sticky bit').\n\t" - "rsbs r6, r11, #0\n\t" - "orrs r6, r6, r11\n\t" - "orrs r2, r2, r6, lsr #31\n\t" - "\n\t" - "@ x: exponent=r4, sign=r5, mantissa=r0:r1 (scaled up 3 bits)\n\t" - "@ y: sign=r7, value=r2:r3 (scaled to same exponent as x)\n\t" - "\n\t" - "@ If x and y don't have the same sign, then we should negate r2:r3\n\t" - "@ (i.e. subtract the mantissa instead of adding it). Signs of x\n\t" - "@ and y are in r5 and r7, as full-width words. 
We won't need r7\n\t" - "@ afterwards.\n\t" - "eors r7, r5 @ r7 = -1 if y must be negated, 0 otherwise\n\t" - "eors r2, r7\n\t" - "eors r3, r7\n\t" - "subs r2, r7\n\t" - "sbcs r3, r7\n\t" - "\n\t" - "@ r2:r3 has been shifted, we can add to r0:r1.\n\t" - "adds r0, r2\n\t" - "adcs r1, r3\n\t" - "\n\t" - "@ result: exponent=r4, sign=r5, mantissa=r0:r1 (scaled up 3 bits)\n\t" - "\n\t" - "@ Normalize the result with some left-shifting to full 64-bit\n\t" - "@ width. Shift count goes to r2, and exponent (r4) is adjusted.\n\t" - "clz r2, r0\n\t" - "clz r3, r1\n\t" - "sbfx r6, r3, #5, #1\n\t" - "ands r2, r6\n\t" - "adds r2, r2, r3\n\t" - "subs r4, r4, r2\n\t" - "\n\t" - "@ Shift r0:r1 to the left by r2 bits.\n\t" - "subs r7, r2, #32\n\t" - "lsls r7, r0, r7\n\t" - "lsls r1, r1, r2\n\t" - "rsbs r6, r2, #32\n\t" - "orrs r1, r1, r7\n\t" - "lsrs r6, r0, r6\n\t" - "orrs r1, r1, r6\n\t" - "lsls r0, r0, r2\n\t" - "\n\t" - "@ The exponent of x was in r4. The left-shift operation has\n\t" - "@ subtracted some value from it, 8 in case the result has the\n\t" - "@ same exponent as x. However, the high bit of the mantissa will\n\t" - "@ add 1 to the exponent, so we only add back 7 (the exponent is\n\t" - "@ added in because rounding might have produced a carry, which\n\t" - "@ should then spill into the exponent).\n\t" - "adds r4, #7\n\t" - "\n\t" - "@ If the mantissa new mantissa is non-zero, then its bit 63 is\n\t" - "@ non-zero (thanks to the normalizing shift). Otherwise, that bit\n\t" - "@ is zero, and we should then set the exponent to zero as well.\n\t" - "ands r4, r4, r1, asr #31\n\t" - "\n\t" - "@ Shrink back the value to a 52-bit mantissa. 
This requires\n\t" - "@ right-shifting by 11 bits; we keep a copy of the pre-shift\n\t" - "@ low word in r3.\n\t" - "movs r3, r0\n\t" - "lsrs r0, #11\n\t" - "orrs r0, r0, r1, lsl #21\n\t" - "lsrs r1, #11\n\t" - "\n\t" - "@ Apply rounding.\n\t" - "ubfx r6, r3, #0, #9\n\t" - "addw r6, r6, #511\n\t" - "orrs r3, r6\n\t" - "ubfx r3, r3, #9, #3\n\t" - "movs r6, #0xC8\n\t" - "lsrs r6, r3\n\t" - "ands r6, #1\n\t" - "adds r0, r6\n\t" - "adcs r1, #0\n\t" - "\n\t" - "@Plug in the exponent with an addition.\n\t" - "adds r1, r1, r4, lsl #20\n\t" - "\n\t" - "@ If the new exponent is negative or zero, then it underflowed\n\t" - "@ and we must clear the whole mantissa and exponent.\n\t" - "rsbs r4, r4, #0\n\t" - "ands r0, r0, r4, asr #31\n\t" - "ands r1, r1, r4, asr #31\n\t" - "\n\t" - "@ Put back the sign. This is the sign of x: thanks to the\n\t" - "@ conditional swap at the start, this is always correct.\n\t" - "bfi r1, r5, #31, #1\n\t" - "\n\t" - "pop { r4, r5, r6, r7, r8, r10, r11, pc }\n\t" - ); -} - -#else // yyyASM_CORTEXM4+0 - -fpr -fpr_add(fpr x, fpr y) -{ - uint64_t m, xu, yu, za; - uint32_t cs; - int ex, ey, sx, sy, cc; - - /* - * Make sure that the first operand (x) has the larger absolute - * value. This guarantees that the exponent of y is less than - * or equal to the exponent of x, and, if they are equal, then - * the mantissa of y will not be greater than the mantissa of x. - * - * After this swap, the result will have the sign x, except in - * the following edge case: abs(x) = abs(y), and x and y have - * opposite sign bits; in that case, the result shall be +0 - * even if the sign bit of x is 1. To handle this case properly, - * we do the swap is abs(x) = abs(y) AND the sign of x is 1. - */ - m = ((uint64_t)1 << 63) - 1; - za = (x & m) - (y & m); - cs = (uint32_t)(za >> 63) - | ((1U - (uint32_t)(-za >> 63)) & (uint32_t)(x >> 63)); - m = (x ^ y) & -(uint64_t)cs; - x ^= m; - y ^= m; - - /* - * Extract sign bits, exponents and mantissas. 
The mantissas are - * scaled up to 2^55..2^56-1, and the exponent is unbiased. If - * an operand is zero, its mantissa is set to 0 at this step, and - * its exponent will be -1078. - */ - ex = (int)(x >> 52); - sx = ex >> 11; - ex &= 0x7FF; - m = (uint64_t)(uint32_t)((ex + 0x7FF) >> 11) << 52; - xu = ((x & (((uint64_t)1 << 52) - 1)) | m) << 3; - ex -= 1078; - ey = (int)(y >> 52); - sy = ey >> 11; - ey &= 0x7FF; - m = (uint64_t)(uint32_t)((ey + 0x7FF) >> 11) << 52; - yu = ((y & (((uint64_t)1 << 52) - 1)) | m) << 3; - ey -= 1078; - - /* - * x has the larger exponent; hence, we only need to right-shift y. - * If the shift count is larger than 59 bits then we clamp the - * value to zero. - */ - cc = ex - ey; - yu &= -(uint64_t)((uint32_t)(cc - 60) >> 31); - cc &= 63; - - /* - * The lowest bit of yu is "sticky". - */ - m = fpr_ulsh(1, cc) - 1; - yu |= (yu & m) + m; - yu = fpr_ursh(yu, cc); - - /* - * If the operands have the same sign, then we add the mantissas; - * otherwise, we subtract the mantissas. - */ - xu += yu - ((yu << 1) & -(uint64_t)(sx ^ sy)); - - /* - * The result may be smaller, or slightly larger. We normalize - * it to the 2^63..2^64-1 range (if xu is zero, then it stays - * at zero). - */ - FPR_NORM64(xu, ex); - - /* - * Scale down the value to 2^54..s^55-1, handling the last bit - * as sticky. - */ - xu |= ((uint32_t)xu & 0x1FF) + 0x1FF; - xu >>= 9; - ex += 9; - - /* - * In general, the result has the sign of x. However, if the - * result is exactly zero, then the following situations may - * be encountered: - * x > 0, y = -x -> result should be +0 - * x < 0, y = -x -> result should be +0 - * x = +0, y = +0 -> result should be +0 - * x = -0, y = +0 -> result should be +0 - * x = +0, y = -0 -> result should be +0 - * x = -0, y = -0 -> result should be -0 - * - * But at the conditional swap step at the start of the - * function, we ensured that if abs(x) = abs(y) and the - * sign of x was 1, then x and y were swapped. 
Thus, the - * two following cases cannot actually happen: - * x < 0, y = -x - * x = -0, y = +0 - * In all other cases, the sign bit of x is conserved, which - * is what the FPR() function does. The FPR() function also - * properly clamps values to zero when the exponent is too - * low, but does not alter the sign in that case. - */ - return FPR(sx, ex, xu); -} - -#endif // yyyASM_CORTEXM4- - -#if FALCON_ASM_CORTEXM4 // yyyASM_CORTEXM4+1 - -__attribute__((naked)) -fpr -fpr_mul(fpr x __attribute__((unused)), fpr y __attribute__((unused))) -{ - __asm__ ( - "push { r4, r5, r6, r7, r8, r10, r11, lr }\n\t" - "\n\t" - "@ Extract mantissas: x.m = r4:r5, y.m = r6:r7\n\t" - "@ r4 and r6 contain only 25 bits each.\n\t" - "bics r4, r0, #0xFE000000\n\t" - "lsls r5, r1, #7\n\t" - "orrs r5, r5, r0, lsr #25\n\t" - "orrs r5, r5, #0x08000000\n\t" - "bics r5, r5, #0xF0000000\n\t" - "bics r6, r2, #0xFE000000\n\t" - "lsls r7, r3, #7\n\t" - "orrs r7, r7, r2, lsr #25\n\t" - "orrs r7, r7, #0x08000000\n\t" - "bics r7, r7, #0xF0000000\n\t" - "\n\t" - "@ Perform product. Values are in the 2^52..2^53-1 range, so\n\t" - "@ the product is at most 106-bit long. Of the low 50 bits,\n\t" - "@ we only want to know if they are all zeros or not. 
Here,\n\t" - "@ we get the top 56 bits in r10:r11, and r8 will be non-zero\n\t" - "@ if and only if at least one of the low 50 bits is non-zero.\n\t" - "umull r8, r10, r4, r6 @ x0*y0\n\t" - "lsls r10, #7\n\t" - "orrs r10, r10, r8, lsr #25\n\t" - "eors r11, r11\n\t" - "umlal r10, r11, r4, r7 @ x0*y1\n\t" - "umlal r10, r11, r5, r6 @ x1*y0\n\t" - "orrs r8, r8, r10, lsl #7\n\t" - "lsrs r10, #25\n\t" - "orrs r10, r10, r11, lsl #7\n\t" - "eors r11, r11\n\t" - "umlal r10, r11, r5, r7 @ x1*y1\n\t" - "\n\t" - "@ Now r0, r2, r4, r5, r6 and r7 are free.\n\t" - "@ If any of the low 50 bits was non-zero, then we force the\n\t" - "@ low bit of r10 to 1.\n\t" - "rsbs r4, r8, #0\n\t" - "orrs r8, r8, r4\n\t" - "orrs r10, r10, r8, lsr #31\n\t" - "\n\t" - "@ r8 is free.\n\t" - "@ r10:r11 contains the product in the 2^54..2^56-1 range. We\n\t" - "@ normalize it to 2^54..2^55-1 (into r6:r7) with a conditional\n\t" - "@ shift (low bit is sticky). r5 contains -1 if the shift was done,\n\t" - "@ 0 otherwise.\n\t" - "ands r6, r10, #1\n\t" - "lsrs r5, r11, #23\n\t" - "rsbs r5, r5, #0\n\t" - "orrs r6, r6, r10, lsr #1\n\t" - "orrs r6, r6, r11, lsl #31\n\t" - "lsrs r7, r11, #1\n\t" - "eors r10, r10, r6\n\t" - "eors r11, r11, r7\n\t" - "bics r10, r10, r5\n\t" - "bics r11, r11, r5\n\t" - "eors r6, r6, r10\n\t" - "eors r7, r7, r11\n\t" - "\n\t" - "@ Compute aggregate exponent: ex + ey - 1023 + w\n\t" - "@ (where w = 1 if the conditional shift was done, 0 otherwise)\n\t" - "@ But we subtract 1 because the injection of the mantissa high\n\t" - "@ bit will increment the exponent by 1.\n\t" - "lsls r0, r1, #1\n\t" - "lsls r2, r3, #1\n\t" - "lsrs r0, #21\n\t" - "addw r4, r0, #0x7FF @ save ex + 2047 in r4\n\t" - "lsrs r2, #21\n\t" - "addw r8, r2, #0x7FF @ save ey + 2047 in r8\n\t" - "adds r2, r0\n\t" - "subw r2, r2, #1024\n\t" - "subs r2, r5\n\t" - "\n\t" - "@ r5 is free.\n\t" - "@ Also, if either of the source exponents is 0, or the result\n\t" - "@ exponent is 0 or negative, then the result is zero 
and the\n\t" - "@ mantissa and the exponent shall be clamped to zero. Since\n\t" - "@ r2 contains the result exponent minus 1, we test on r2\n\t" - "@ being strictly negative.\n\t" - "ands r4, r8 @ if bit 11 = 0 then one of the exponents was 0\n\t" - "mvns r5, r2\n\t" - "ands r5, r5, r4, lsl #20\n\t" - "ands r2, r2, r5, asr #31\n\t" - "ands r6, r6, r5, asr #31\n\t" - "ands r7, r7, r5, asr #31\n\t" - "\n\t" - "@ Sign is the XOR of the sign of the operands. This is true in\n\t" - "@ all cases, including very small results (exponent underflow)\n\t" - "@ and zeros.\n\t" - "eors r1, r3\n\t" - "bfc r1, #0, #31\n\t" - "\n\t" - "@ Plug in the exponent.\n\t" - "bfi r1, r2, #20, #11\n\t" - "\n\t" - "@ r2 and r3 are free.\n\t" - "@ Shift back to the normal 53-bit mantissa, with rounding.\n\t" - "@ Mantissa goes into r0:r1. For r1, we must use an addition\n\t" - "@ because the rounding may have triggered a carry, that should\n\t" - "@ be added to the exponent.\n\t" - "movs r4, r6\n\t" - "lsrs r0, r6, #2\n\t" - "orrs r0, r0, r7, lsl #30\n\t" - "adds r1, r1, r7, lsr #2\n\t" - "ands r4, #0x7\n\t" - "movs r3, #0xC8\n\t" - "lsrs r3, r4\n\t" - "ands r3, #1\n\t" - "adds r0, r3\n\t" - "adcs r1, #0\n\t" - "\n\t" - "pop { r4, r5, r6, r7, r8, r10, r11, pc }\n\t" - ); -} - -#else // yyyASM_CORTEXM4+0 - -fpr -fpr_mul(fpr x, fpr y) -{ - uint64_t xu, yu, w, zu, zv; - uint32_t x0, x1, y0, y1, z0, z1, z2; - int ex, ey, d, e, s; - - /* - * Extract absolute values as scaled unsigned integers. We - * don't extract exponents yet. - */ - xu = (x & (((uint64_t)1 << 52) - 1)) | ((uint64_t)1 << 52); - yu = (y & (((uint64_t)1 << 52) - 1)) | ((uint64_t)1 << 52); - - /* - * We have two 53-bit integers to multiply; we need to split - * each into a lower half and a upper half. Moreover, we - * prefer to have lower halves to be of 25 bits each, for - * reasons explained later on. 
- */ - x0 = (uint32_t)xu & 0x01FFFFFF; - x1 = (uint32_t)(xu >> 25); - y0 = (uint32_t)yu & 0x01FFFFFF; - y1 = (uint32_t)(yu >> 25); - w = (uint64_t)x0 * (uint64_t)y0; - z0 = (uint32_t)w & 0x01FFFFFF; - z1 = (uint32_t)(w >> 25); - w = (uint64_t)x0 * (uint64_t)y1; - z1 += (uint32_t)w & 0x01FFFFFF; - z2 = (uint32_t)(w >> 25); - w = (uint64_t)x1 * (uint64_t)y0; - z1 += (uint32_t)w & 0x01FFFFFF; - z2 += (uint32_t)(w >> 25); - zu = (uint64_t)x1 * (uint64_t)y1; - z2 += (z1 >> 25); - z1 &= 0x01FFFFFF; - zu += z2; - - /* - * Since xu and yu are both in the 2^52..2^53-1 range, the - * product is in the 2^104..2^106-1 range. We first reassemble - * it and round it into the 2^54..2^56-1 range; the bottom bit - * is made "sticky". Since the low limbs z0 and z1 are 25 bits - * each, we just take the upper part (zu), and consider z0 and - * z1 only for purposes of stickiness. - * (This is the reason why we chose 25-bit limbs above.) - */ - zu |= ((z0 | z1) + 0x01FFFFFF) >> 25; - - /* - * We normalize zu to the 2^54..s^55-1 range: it could be one - * bit too large at this point. This is done with a conditional - * right-shift that takes into account the sticky bit. - */ - zv = (zu >> 1) | (zu & 1); - w = zu >> 55; - zu ^= (zu ^ zv) & -w; - - /* - * Get the aggregate scaling factor: - * - * - Each exponent is biased by 1023. - * - * - Integral mantissas are scaled by 2^52, hence an - * extra 52 bias for each exponent. - * - * - However, we right-shifted z by 50 bits, and then - * by 0 or 1 extra bit (depending on the value of w). - * - * In total, we must add the exponents, then subtract - * 2 * (1023 + 52), then add 50 + w. - */ - ex = (int)((x >> 52) & 0x7FF); - ey = (int)((y >> 52) & 0x7FF); - e = ex + ey - 2100 + (int)w; - - /* - * Sign bit is the XOR of the operand sign bits. - */ - s = (int)((x ^ y) >> 63); - - /* - * Corrective actions for zeros: if either of the operands is - * zero, then the computations above were wrong. Test for zero - * is whether ex or ey is zero. 
We just have to set the mantissa - * (zu) to zero, the FPR() function will normalize e. - */ - d = ((ex + 0x7FF) & (ey + 0x7FF)) >> 11; - zu &= -(uint64_t)d; - - /* - * FPR() packs the result and applies proper rounding. - */ - return FPR(s, e, zu); -} - -#endif // yyyASM_CORTEXM4- - -#if FALCON_ASM_CORTEXM4 // yyyASM_CORTEXM4+1 - -__attribute__((naked)) -fpr -fpr_div(fpr x __attribute__((unused)), fpr y __attribute__((unused))) -{ - __asm__ ( - "push { r4, r5, r6, r7, r8, r10, r11, lr }\n\t" - - "@ Extract mantissas of x and y, in r0:r4 and r2:r5, respectively.\n\t" - "@ We don't touch r1 and r3 as they contain the exponents and\n\t" - "@ signs, which we'll need later on.\n\t" - "ubfx r4, r1, #0, #20\n\t" - "ubfx r5, r3, #0, #20\n\t" - "orrs r4, r4, #0x00100000\n\t" - "orrs r5, r5, #0x00100000\n\t" - "\n\t" - "@ Perform bit-by-bit division. We want a 56-bit result in r8:r10\n\t" - "@ (low bit is 0). Bits come from the carry flag and are\n\t" - "@ injected with rrx, i.e. in position 31; we thus get bits in\n\t" - "@ the reverse order. 
Bits accumulate in r8; after the first 24\n\t" - "@ bits, we move the quotient bits to r10.\n\t" - "eors r8, r8\n\t" - "\n\t" - -#define DIVSTEP \ - "subs r6, r0, r2\n\t" \ - "sbcs r7, r4, r5\n\t" \ - "rrx r8, r8\n\t" \ - "ands r6, r2, r8, asr #31\n\t" \ - "ands r7, r5, r8, asr #31\n\t" \ - "subs r0, r6\n\t" \ - "sbcs r4, r7\n\t" \ - "adds r0, r0, r0\n\t" \ - "adcs r4, r4, r4\n\t" - -#define DIVSTEP4 DIVSTEP DIVSTEP DIVSTEP DIVSTEP -#define DIVSTEP8 DIVSTEP4 DIVSTEP4 - - DIVSTEP8 - DIVSTEP8 - DIVSTEP8 - - "\n\t" - "@ We have the first 24 bits of the quotient, move them to r10.\n\t" - "rbit r10, r8\n\t" - "\n\t" - - DIVSTEP8 - DIVSTEP8 - DIVSTEP8 - DIVSTEP4 DIVSTEP DIVSTEP DIVSTEP - -#undef DIVSTEP -#undef DIVSTEP4 -#undef DIVSTEP8 - - "\n\t" - "@ Lowest bit will be set if remainder is non-zero at this point\n\t" - "@ (this is the 'sticky' bit).\n\t" - "subs r0, #1\n\t" - "sbcs r4, #0\n\t" - "rrx r8, r8\n\t" - "\n\t" - "@ We now have the next (low) 32 bits of the quotient.\n\t" - "rbit r8, r8\n\t" - "\n\t" - "@ Since both operands had their top bit set, we know that the\n\t" - "@ result at this point is in 2^54..2^56-1. We scale it down\n\t" - "@ to 2^54..2^55-1 with a conditional shift. We also write the\n\t" - "@ result in r4:r5. 
If the shift is done, r6 will contain -1.\n\t" - "ands r4, r8, #1\n\t" - "lsrs r6, r10, #23\n\t" - "rsbs r6, r6, #0\n\t" - "orrs r4, r4, r8, lsr #1\n\t" - "orrs r4, r4, r10, lsl #31\n\t" - "lsrs r5, r10, #1\n\t" - "eors r8, r8, r4\n\t" - "eors r10, r10, r5\n\t" - "bics r8, r8, r6\n\t" - "bics r10, r10, r6\n\t" - "eors r4, r4, r8\n\t" - "eors r5, r5, r10\n\t" - "\n\t" - "@ Compute aggregate exponent: ex - ey + 1022 + w\n\t" - "@ (where w = 1 if the conditional shift was done, 0 otherwise)\n\t" - "@ But we subtract 1 because the injection of the mantissa high\n\t" - "@ bit will increment the exponent by 1.\n\t" - "lsls r0, r1, #1\n\t" - "lsls r2, r3, #1\n\t" - "lsrs r0, r0, #21\n\t" - "addw r7, r0, #0x7FF @ save ex + 2047 in r7\n\t" - "subs r0, r0, r2, lsr #21\n\t" - "addw r0, r0, #1021\n\t" - "subs r0, r6\n\t" - "\n\t" - "@ If the x operand was zero, then the computation was wrong and\n\t" - "@ the result is zero. Also, if the result exponent is zero or\n\t" - "@ negative, then the mantissa shall be clamped to zero. Since r0\n\t" - "@ contains the result exponent minus 1, we test on r0 being\n\t" - "@ strictly negative.\n\t" - "mvns r2, r0\n\t" - "ands r2, r2, r7, lsl #20\n\t" - "ands r0, r0, r2, asr #31\n\t" - "ands r4, r4, r2, asr #31\n\t" - "ands r5, r5, r2, asr #31\n\t" - "\n\t" - "@ Sign is the XOR of the sign of the operands. This is true in\n\t" - "@ all cases, including very small results (exponent underflow)\n\t" - "@ and zeros.\n\t" - "eors r1, r3\n\t" - "bfc r1, #0, #31\n\t" - "\n\t" - "@ Plug in the exponent.\n\t" - "bfi r1, r0, #20, #11\n\t" - "\n\t" - "@ Shift back to the normal 53-bit mantissa, with rounding.\n\t" - "@ Mantissa goes into r0:r1. 
For r1, we must use an addition\n\t" - "@ because the rounding may have triggered a carry, that should\n\t" - "@ be added to the exponent.\n\t" - "movs r6, r4\n\t" - "lsrs r0, r4, #2\n\t" - "orrs r0, r0, r5, lsl #30\n\t" - "adds r1, r1, r5, lsr #2\n\t" - "ands r6, #0x7\n\t" - "movs r3, #0xC8\n\t" - "lsrs r3, r6\n\t" - "ands r3, #1\n\t" - "adds r0, r3\n\t" - "adcs r1, #0\n\t" - "\n\t" - "pop { r4, r5, r6, r7, r8, r10, r11, pc }\n\t" - ); -} - -#else // yyyASM_CORTEXM4+0 - -fpr -fpr_div(fpr x, fpr y) -{ - uint64_t xu, yu, q, q2, w; - int i, ex, ey, e, d, s; - - /* - * Extract mantissas of x and y (unsigned). - */ - xu = (x & (((uint64_t)1 << 52) - 1)) | ((uint64_t)1 << 52); - yu = (y & (((uint64_t)1 << 52) - 1)) | ((uint64_t)1 << 52); - - /* - * Perform bit-by-bit division of xu by yu. We run it for 55 bits. - */ - q = 0; - for (i = 0; i < 55; i ++) { - /* - * If yu is less than or equal xu, then subtract it and - * push a 1 in the quotient; otherwise, leave xu unchanged - * and push a 0. - */ - uint64_t b; - - b = ((xu - yu) >> 63) - 1; - xu -= b & yu; - q |= b & 1; - xu <<= 1; - q <<= 1; - } - - /* - * We got 55 bits in the quotient, followed by an extra zero. We - * want that 56th bit to be "sticky": it should be a 1 if and - * only if the remainder (xu) is non-zero. - */ - q |= (xu | -xu) >> 63; - - /* - * Quotient is at most 2^56-1. Its top bit may be zero, but in - * that case the next-to-top bit will be a one, since the - * initial xu and yu were both in the 2^52..2^53-1 range. - * We perform a conditional shift to normalize q to the - * 2^54..2^55-1 range (with the bottom bit being sticky). - */ - q2 = (q >> 1) | (q & 1); - w = q >> 55; - q ^= (q ^ q2) & -w; - - /* - * Extract exponents to compute the scaling factor: - * - * - Each exponent is biased and we scaled them up by - * 52 bits; but these biases will cancel out. - * - * - The division loop produced a 55-bit shifted result, - * so we must scale it down by 55 bits. 
- * - * - If w = 1, we right-shifted the integer by 1 bit, - * hence we must add 1 to the scaling. - */ - ex = (int)((x >> 52) & 0x7FF); - ey = (int)((y >> 52) & 0x7FF); - e = ex - ey - 55 + (int)w; - - /* - * Sign is the XOR of the signs of the operands. - */ - s = (int)((x ^ y) >> 63); - - /* - * Corrective actions for zeros: if x = 0, then the computation - * is wrong, and we must clamp e and q to 0. We do not care - * about the case y = 0 (as per assumptions in this module, - * the caller does not perform divisions by zero). - */ - d = (ex + 0x7FF) >> 11; - s &= d; - e &= -d; - q &= -(uint64_t)d; - - /* - * FPR() packs the result and applies proper rounding. - */ - return FPR(s, e, q); -} - -#endif // yyyASM_CORTEXM4- - -#if FALCON_ASM_CORTEXM4 // yyyASM_CORTEXM4+1 - -__attribute__((naked)) -fpr -fpr_sqrt(fpr x __attribute__((unused))) -{ - __asm__ ( - "push { r4, r5, r6, r7, r8, r10, r11, lr }\n\t" - "\n\t" - "@ Extract mantissa (r0:r1) and exponent (r2). We assume that the\n\t" - "@ sign is positive. 
If the source is zero, then the mantissa is\n\t" - "@ set to 0.\n\t" - "lsrs r2, r1, #20\n\t" - "bfc r1, #20, #12\n\t" - "addw r3, r2, #0x7FF\n\t" - "subw r2, r2, #1023\n\t" - "lsrs r3, r3, #11\n\t" - "orrs r1, r1, r3, lsl #20\n\t" - "\n\t" - "@ If the exponent is odd, then multiply mantissa by 2 and subtract\n\t" - "@ 1 from the exponent.\n\t" - "ands r3, r2, #1\n\t" - "subs r2, r2, r3\n\t" - "rsbs r3, r3, #0\n\t" - "ands r4, r1, r3\n\t" - "ands r3, r0\n\t" - "adds r0, r3\n\t" - "adcs r1, r4\n\t" - "\n\t" - "@ Left-shift the mantissa by 9 bits to put it in the\n\t" - "@ 2^61..2^63-1 range (unless it is exactly 0).\n\t" - "lsls r1, r1, #9\n\t" - "orrs r1, r1, r0, lsr #23\n\t" - "lsls r0, r0, #9\n\t" - "\n\t" - "@ Compute the square root bit-by-bit.\n\t" - "@ There are 54 iterations; first 30 can work on top word only.\n\t" - "@ q = r3 (bit-reversed)\n\t" - "@ s = r5\n\t" - "eors r3, r3\n\t" - "eors r5, r5\n\t" - -#define SQRT_STEP_HI(bit) \ - "orrs r6, r5, #(1 << (" #bit "))\n\t" \ - "subs r7, r1, r6\n\t" \ - "rrx r3, r3\n\t" \ - "ands r6, r6, r3, asr #31\n\t" \ - "subs r1, r1, r6\n\t" \ - "lsrs r6, r3, #31\n\t" \ - "orrs r5, r5, r6, lsl #((" #bit ") + 1)\n\t" \ - "adds r0, r0\n\t" \ - "adcs r1, r1\n\t" - -#define SQRT_STEP_HIx5(b) \ - SQRT_STEP_HI((b)+4) \ - SQRT_STEP_HI((b)+3) \ - SQRT_STEP_HI((b)+2) \ - SQRT_STEP_HI((b)+1) \ - SQRT_STEP_HI(b) - - SQRT_STEP_HIx5(25) - SQRT_STEP_HIx5(20) - SQRT_STEP_HIx5(15) - SQRT_STEP_HIx5(10) - SQRT_STEP_HIx5(5) - SQRT_STEP_HIx5(0) - -#undef SQRT_STEP_HI -#undef SQRT_STEP_HIx5 - - "@ Top 30 bits of the result must be reversed: they were\n\t" - "@ accumulated with rrx (hence from the top bit).\n\t" - "rbit r3, r3\n\t" - "\n\t" - "@ For the next 24 iterations, we must use two-word operations.\n\t" - "@ bits of q now accumulate in r4\n\t" - "@ s is in r6:r5\n\t" - "eors r4, r4\n\t" - "eors r6, r6\n\t" - "\n\t" - "@ First iteration is special because the potential bit goes into\n\t" - "@ r5, not r6.\n\t" - "orrs r7, r6, #(1 << 
31)\n\t" - "subs r8, r0, r7\n\t" - "sbcs r10, r1, r5\n\t" - "rrx r4, r4\n\t" - "ands r7, r7, r4, asr #31\n\t" - "ands r8, r5, r4, asr #31\n\t" - "subs r0, r0, r7\n\t" - "sbcs r1, r1, r8\n\t" - "lsrs r7, r4, #31\n\t" - "orrs r5, r5, r4, lsr #31\n\t" - "adds r0, r0\n\t" - "adcs r1, r1\n\t" - -#define SQRT_STEP_LO(bit) \ - "orrs r7, r6, #(1 << (" #bit "))\n\t" \ - "subs r8, r0, r7\n\t" \ - "sbcs r10, r1, r5\n\t" \ - "rrx r4, r4\n\t" \ - "ands r7, r7, r4, asr #31\n\t" \ - "ands r8, r5, r4, asr #31\n\t" \ - "subs r0, r0, r7\n\t" \ - "sbcs r1, r1, r8\n\t" \ - "lsrs r7, r4, #31\n\t" \ - "orrs r6, r6, r7, lsl #((" #bit ") + 1)\n\t" \ - "adds r0, r0\n\t" \ - "adcs r1, r1\n\t" - -#define SQRT_STEP_LOx4(b) \ - SQRT_STEP_LO((b)+3) \ - SQRT_STEP_LO((b)+2) \ - SQRT_STEP_LO((b)+1) \ - SQRT_STEP_LO(b) - - SQRT_STEP_LO(30) - SQRT_STEP_LO(29) - SQRT_STEP_LO(28) - SQRT_STEP_LOx4(24) - SQRT_STEP_LOx4(20) - SQRT_STEP_LOx4(16) - SQRT_STEP_LOx4(12) - SQRT_STEP_LOx4(8) - -#undef SQRT_STEP_LO -#undef SQRT_STEP_LOx4 - - "@ Put low 24 bits in the right order.\n\t" - "rbit r4, r4\n\t" - "\n\t" - "@ We have a 54-bit result; compute the 55-th bit as the 'sticky'\n\t" - "@ bit: it is non-zero if and only if r0:r1 is non-zero. We put the\n\t" - "@ three low bits (including the sticky bit) in r5.\n\t" - "orrs r0, r1\n\t" - "rsbs r1, r0, #0\n\t" - "orrs r0, r1\n\t" - "lsls r5, r4, #1\n\t" - "orrs r5, r5, r0, lsr #31\n\t" - "ands r5, #0x7\n\t" - "\n\t" - "@ Compute the rounding: r6 is set to 0 or 1, and will be added\n\t" - "@ to the mantissa.\n\t" - "movs r6, #0xC8\n\t" - "lsrs r6, r5\n\t" - "ands r6, #1\n\t" - "\n\t" - "@ Put the mantissa (53 bits, in the 2^52..2^53-1 range) in r0:r1\n\t" - "@ (rounding not applied yet).\n\t" - "lsrs r0, r4, #1\n\t" - "orrs r0, r0, r3, lsl #23\n\t" - "lsrs r1, r3, #9\n\t" - "\n\t" - "@ Compute new exponent. This is half the old one (then reencoded\n\t" - "@ by adding 1023). Exception: if the mantissa is zero, then the\n\t" - "@ encoded exponent is set to 0. 
At that point, if the mantissa\n\t" - "@ is non-zero, then its high bit (bit 52, i.e. bit 20 of r1) is\n\t" - "@ non-zero. Note that the exponent cannot go out of range.\n\t" - "lsrs r2, r2, #1\n\t" - "addw r2, r2, #1023\n\t" - "lsrs r5, r1, #20\n\t" - "rsbs r5, r5, #0\n\t" - "ands r2, r5\n\t" - "\n\t" - "@ Place exponent. This overwrites the high bit of the mantissa.\n\t" - "bfi r1, r2, #20, #11\n\t" - "\n\t" - "@ Apply rounding. This may create a carry that will spill into\n\t" - "@ the exponent, which is exactly what should be done in that case\n\t" - "@ (i.e. increment the exponent).\n\t" - "adds r0, r0, r6\n\t" - "adcs r1, r1, #0\n\t" - "\n\t" - "pop { r4, r5, r6, r7, r8, r10, r11, pc }\n\t" - ); -} - -#else // yyyASM_CORTEXM4+0 - -fpr -fpr_sqrt(fpr x) -{ - uint64_t xu, q, s, r; - int ex, e; - - /* - * Extract the mantissa and the exponent. We don't care about - * the sign: by assumption, the operand is nonnegative. - * We want the "true" exponent corresponding to a mantissa - * in the 1..2 range. - */ - xu = (x & (((uint64_t)1 << 52) - 1)) | ((uint64_t)1 << 52); - ex = (int)((x >> 52) & 0x7FF); - e = ex - 1023; - - /* - * If the exponent is odd, double the mantissa and decrement - * the exponent. The exponent is then halved to account for - * the square root. - */ - xu += xu & -(uint64_t)(e & 1); - e >>= 1; - - /* - * Double the mantissa. - */ - xu <<= 1; - - /* - * We now have a mantissa in the 2^53..2^55-1 range. It - * represents a value between 1 (inclusive) and 4 (exclusive) - * in fixed point notation (with 53 fractional bits). We - * compute the square root bit by bit. - */ - q = 0; - s = 0; - r = (uint64_t)1 << 53; - for (int i = 0; i < 54; i ++) { - uint64_t t, b; - - t = s + r; - b = ((xu - t) >> 63) - 1; - s += (r << 1) & b; - xu -= t & b; - q += r & b; - xu <<= 1; - r >>= 1; - } - - /* - * Now, q is a rounded-low 54-bit value, with a leading 1, - * 52 fractional digits, and an additional guard bit. 
We add - * an extra sticky bit to account for what remains of the operand. - */ - q <<= 1; - q |= (xu | -xu) >> 63; - - /* - * Result q is in the 2^54..2^55-1 range; we bias the exponent - * by 54 bits (the value e at that point contains the "true" - * exponent, but q is now considered an integer, i.e. scaled - * up. - */ - e -= 54; - - /* - * Corrective action for an operand of value zero. - */ - q &= -(uint64_t)((ex + 0x7FF) >> 11); - - /* - * Apply rounding and back result. - */ - return FPR(0, e, q); -} - -#endif // yyyASM_CORTEXM4- - -uint64_t -fpr_expm_p63(fpr x, fpr ccs) -{ - /* - * Polynomial approximation of exp(-x) is taken from FACCT: - * https://eprint.iacr.org/2018/1234 - * Specifically, values are extracted from the implementation - * referenced from the FACCT article, and available at: - * https://github.com/raykzhao/gaussian - * Here, the coefficients have been scaled up by 2^63 and - * converted to integers. - * - * Tests over more than 24 billions of random inputs in the - * 0..log(2) range have never shown a deviation larger than - * 2^(-50) from the true mathematical value. - */ - static const uint64_t C[] = { - 0x00000004741183A3u, - 0x00000036548CFC06u, - 0x0000024FDCBF140Au, - 0x0000171D939DE045u, - 0x0000D00CF58F6F84u, - 0x000680681CF796E3u, - 0x002D82D8305B0FEAu, - 0x011111110E066FD0u, - 0x0555555555070F00u, - 0x155555555581FF00u, - 0x400000000002B400u, - 0x7FFFFFFFFFFF4800u, - 0x8000000000000000u - }; - - uint64_t z, y; - unsigned u; - uint32_t z0, z1, y0, y1; - uint64_t a, b; - - y = C[0]; - z = (uint64_t)fpr_trunc(fpr_mul(x, fpr_ptwo63)) << 1; - for (u = 1; u < (sizeof C) / sizeof(C[0]); u ++) { - /* - * Compute product z * y over 128 bits, but keep only - * the top 64 bits. 
- * - * TODO: On some architectures/compilers we could use - * some intrinsics (__umulh() on MSVC) or other compiler - * extensions (unsigned __int128 on GCC / Clang) for - * improved speed; however, most 64-bit architectures - * also have appropriate IEEE754 floating-point support, - * which is better. - */ - uint64_t c; - - z0 = (uint32_t)z; - z1 = (uint32_t)(z >> 32); - y0 = (uint32_t)y; - y1 = (uint32_t)(y >> 32); - a = ((uint64_t)z0 * (uint64_t)y1) - + (((uint64_t)z0 * (uint64_t)y0) >> 32); - b = ((uint64_t)z1 * (uint64_t)y0); - c = (a >> 32) + (b >> 32); - c += (((uint64_t)(uint32_t)a + (uint64_t)(uint32_t)b) >> 32); - c += (uint64_t)z1 * (uint64_t)y1; - y = C[u] - c; - } - - /* - * The scaling factor must be applied at the end. Since y is now - * in fixed-point notation, we have to convert the factor to the - * same format, and do an extra integer multiplication. - */ - z = (uint64_t)fpr_trunc(fpr_mul(ccs, fpr_ptwo63)) << 1; - z0 = (uint32_t)z; - z1 = (uint32_t)(z >> 32); - y0 = (uint32_t)y; - y1 = (uint32_t)(y >> 32); - a = ((uint64_t)z0 * (uint64_t)y1) - + (((uint64_t)z0 * (uint64_t)y0) >> 32); - b = ((uint64_t)z1 * (uint64_t)y0); - y = (a >> 32) + (b >> 32); - y += (((uint64_t)(uint32_t)a + (uint64_t)(uint32_t)b) >> 32); - y += (uint64_t)z1 * (uint64_t)y1; - - return y; -} - -const fpr fpr_gm_tab[] = { - 0, 0, - 9223372036854775808U, 4607182418800017408U, - 4604544271217802189U, 4604544271217802189U, - 13827916308072577997U, 4604544271217802189U, - 4606496786581982534U, 4600565431771507043U, - 13823937468626282851U, 4606496786581982534U, - 4600565431771507043U, 4606496786581982534U, - 13829868823436758342U, 4600565431771507043U, - 4607009347991985328U, 4596196889902818827U, - 13819568926757594635U, 4607009347991985328U, - 4603179351334086856U, 4605664432017547683U, - 13829036468872323491U, 4603179351334086856U, - 4605664432017547683U, 4603179351334086856U, - 13826551388188862664U, 4605664432017547683U, - 4596196889902818827U, 4607009347991985328U, - 
13830381384846761136U, 4596196889902818827U, - 4607139046673687846U, 4591727299969791020U, - 13815099336824566828U, 4607139046673687846U, - 4603889326261607894U, 4605137878724712257U, - 13828509915579488065U, 4603889326261607894U, - 4606118860100255153U, 4602163548591158843U, - 13825535585445934651U, 4606118860100255153U, - 4598900923775164166U, 4606794571824115162U, - 13830166608678890970U, 4598900923775164166U, - 4606794571824115162U, 4598900923775164166U, - 13822272960629939974U, 4606794571824115162U, - 4602163548591158843U, 4606118860100255153U, - 13829490896955030961U, 4602163548591158843U, - 4605137878724712257U, 4603889326261607894U, - 13827261363116383702U, 4605137878724712257U, - 4591727299969791020U, 4607139046673687846U, - 13830511083528463654U, 4591727299969791020U, - 4607171569234046334U, 4587232218149935124U, - 13810604255004710932U, 4607171569234046334U, - 4604224084862889120U, 4604849113969373103U, - 13828221150824148911U, 4604224084862889120U, - 4606317631232591731U, 4601373767755717824U, - 13824745804610493632U, 4606317631232591731U, - 4599740487990714333U, 4606655894547498725U, - 13830027931402274533U, 4599740487990714333U, - 4606912484326125783U, 4597922303871901467U, - 13821294340726677275U, 4606912484326125783U, - 4602805845399633902U, 4605900952042040894U, - 13829272988896816702U, 4602805845399633902U, - 4605409869824231233U, 4603540801876750389U, - 13826912838731526197U, 4605409869824231233U, - 4594454542771183930U, 4607084929468638487U, - 13830456966323414295U, 4594454542771183930U, - 4607084929468638487U, 4594454542771183930U, - 13817826579625959738U, 4607084929468638487U, - 4603540801876750389U, 4605409869824231233U, - 13828781906679007041U, 4603540801876750389U, - 4605900952042040894U, 4602805845399633902U, - 13826177882254409710U, 4605900952042040894U, - 4597922303871901467U, 4606912484326125783U, - 13830284521180901591U, 4597922303871901467U, - 4606655894547498725U, 4599740487990714333U, - 13823112524845490141U, 4606655894547498725U, - 
4601373767755717824U, 4606317631232591731U, - 13829689668087367539U, 4601373767755717824U, - 4604849113969373103U, 4604224084862889120U, - 13827596121717664928U, 4604849113969373103U, - 4587232218149935124U, 4607171569234046334U, - 13830543606088822142U, 4587232218149935124U, - 4607179706000002317U, 4582730748936808062U, - 13806102785791583870U, 4607179706000002317U, - 4604386048625945823U, 4604698657331085206U, - 13828070694185861014U, 4604386048625945823U, - 4606409688975526202U, 4600971798440897930U, - 13824343835295673738U, 4606409688975526202U, - 4600154912527631775U, 4606578871587619388U, - 13829950908442395196U, 4600154912527631775U, - 4606963563043808649U, 4597061974398750563U, - 13820434011253526371U, 4606963563043808649U, - 4602994049708411683U, 4605784983948558848U, - 13829157020803334656U, 4602994049708411683U, - 4605539368864982914U, 4603361638657888991U, - 13826733675512664799U, 4605539368864982914U, - 4595327571478659014U, 4607049811591515049U, - 13830421848446290857U, 4595327571478659014U, - 4607114680469659603U, 4593485039402578702U, - 13816857076257354510U, 4607114680469659603U, - 4603716733069447353U, 4605276012900672507U, - 13828648049755448315U, 4603716733069447353U, - 4606012266443150634U, 4602550884377336506U, - 13825922921232112314U, 4606012266443150634U, - 4598476289818621559U, 4606856142606846307U, - 13830228179461622115U, 4598476289818621559U, - 4606727809065869586U, 4599322407794599425U, - 13822694444649375233U, 4606727809065869586U, - 4601771097584682078U, 4606220668805321205U, - 13829592705660097013U, 4601771097584682078U, - 4604995550503212910U, 4604058477489546729U, - 13827430514344322537U, 4604995550503212910U, - 4589965306122607094U, 4607158013403433018U, - 13830530050258208826U, 4589965306122607094U, - 4607158013403433018U, 4589965306122607094U, - 13813337342977382902U, 4607158013403433018U, - 4604058477489546729U, 4604995550503212910U, - 13828367587357988718U, 4604058477489546729U, - 4606220668805321205U, 4601771097584682078U, - 
13825143134439457886U, 4606220668805321205U, - 4599322407794599425U, 4606727809065869586U, - 13830099845920645394U, 4599322407794599425U, - 4606856142606846307U, 4598476289818621559U, - 13821848326673397367U, 4606856142606846307U, - 4602550884377336506U, 4606012266443150634U, - 13829384303297926442U, 4602550884377336506U, - 4605276012900672507U, 4603716733069447353U, - 13827088769924223161U, 4605276012900672507U, - 4593485039402578702U, 4607114680469659603U, - 13830486717324435411U, 4593485039402578702U, - 4607049811591515049U, 4595327571478659014U, - 13818699608333434822U, 4607049811591515049U, - 4603361638657888991U, 4605539368864982914U, - 13828911405719758722U, 4603361638657888991U, - 4605784983948558848U, 4602994049708411683U, - 13826366086563187491U, 4605784983948558848U, - 4597061974398750563U, 4606963563043808649U, - 13830335599898584457U, 4597061974398750563U, - 4606578871587619388U, 4600154912527631775U, - 13823526949382407583U, 4606578871587619388U, - 4600971798440897930U, 4606409688975526202U, - 13829781725830302010U, 4600971798440897930U, - 4604698657331085206U, 4604386048625945823U, - 13827758085480721631U, 4604698657331085206U, - 4582730748936808062U, 4607179706000002317U, - 13830551742854778125U, 4582730748936808062U, - 4607181740574479067U, 4578227681973159812U, - 13801599718827935620U, 4607181740574479067U, - 4604465633578481725U, 4604621949701367983U, - 13827993986556143791U, 4604465633578481725U, - 4606453861145241227U, 4600769149537129431U, - 13824141186391905239U, 4606453861145241227U, - 4600360675823176935U, 4606538458821337243U, - 13829910495676113051U, 4600360675823176935U, - 4606987119037722413U, 4596629994023683153U, - 13820002030878458961U, 4606987119037722413U, - 4603087070374583113U, 4605725276488455441U, - 13829097313343231249U, 4603087070374583113U, - 4605602459698789090U, 4603270878689749849U, - 13826642915544525657U, 4605602459698789090U, - 4595762727260045105U, 4607030246558998647U, - 13830402283413774455U, 4595762727260045105U, - 
4607127537664763515U, 4592606767730311893U, - 13815978804585087701U, 4607127537664763515U, - 4603803453461190356U, 4605207475328619533U, - 13828579512183395341U, 4603803453461190356U, - 4606066157444814153U, 4602357870542944470U, - 13825729907397720278U, 4606066157444814153U, - 4598688984595225406U, 4606826008603986804U, - 13830198045458762612U, 4598688984595225406U, - 4606761837001494797U, 4599112075441176914U, - 13822484112295952722U, 4606761837001494797U, - 4601967947786150793U, 4606170366472647579U, - 13829542403327423387U, 4601967947786150793U, - 4605067233569943231U, 4603974338538572089U, - 13827346375393347897U, 4605067233569943231U, - 4590846768565625881U, 4607149205763218185U, - 13830521242617993993U, 4590846768565625881U, - 4607165468267934125U, 4588998070480937184U, - 13812370107335712992U, 4607165468267934125U, - 4604141730443515286U, 4604922840319727473U, - 13828294877174503281U, 4604141730443515286U, - 4606269759522929756U, 4601573027631668967U, - 13824945064486444775U, 4606269759522929756U, - 4599531889160152938U, 4606692493141721470U, - 13830064529996497278U, 4599531889160152938U, - 4606884969294623682U, 4598262871476403630U, - 13821634908331179438U, 4606884969294623682U, - 4602710690099904183U, 4605957195211051218U, - 13829329232065827026U, 4602710690099904183U, - 4605343481119364930U, 4603629178146150899U, - 13827001215000926707U, 4605343481119364930U, - 4594016801320007031U, 4607100477024622401U, - 13830472513879398209U, 4594016801320007031U, - 4607068040143112603U, 4594891488091520602U, - 13818263524946296410U, 4607068040143112603U, - 4603451617570386922U, 4605475169017376660U, - 13828847205872152468U, 4603451617570386922U, - 4605843545406134034U, 4602900303344142735U, - 13826272340198918543U, 4605843545406134034U, - 4597492765973365521U, 4606938683557690074U, - 13830310720412465882U, 4597492765973365521U, - 4606618018794815019U, 4599948172872067014U, - 13823320209726842822U, 4606618018794815019U, - 4601173347964633034U, 4606364276725003740U, - 
13829736313579779548U, 4601173347964633034U, - 4604774382555066977U, 4604305528345395596U, - 13827677565200171404U, 4604774382555066977U, - 4585465300892538317U, 4607176315382986589U, - 13830548352237762397U, 4585465300892538317U, - 4607176315382986589U, 4585465300892538317U, - 13808837337747314125U, 4607176315382986589U, - 4604305528345395596U, 4604774382555066977U, - 13828146419409842785U, 4604305528345395596U, - 4606364276725003740U, 4601173347964633034U, - 13824545384819408842U, 4606364276725003740U, - 4599948172872067014U, 4606618018794815019U, - 13829990055649590827U, 4599948172872067014U, - 4606938683557690074U, 4597492765973365521U, - 13820864802828141329U, 4606938683557690074U, - 4602900303344142735U, 4605843545406134034U, - 13829215582260909842U, 4602900303344142735U, - 4605475169017376660U, 4603451617570386922U, - 13826823654425162730U, 4605475169017376660U, - 4594891488091520602U, 4607068040143112603U, - 13830440076997888411U, 4594891488091520602U, - 4607100477024622401U, 4594016801320007031U, - 13817388838174782839U, 4607100477024622401U, - 4603629178146150899U, 4605343481119364930U, - 13828715517974140738U, 4603629178146150899U, - 4605957195211051218U, 4602710690099904183U, - 13826082726954679991U, 4605957195211051218U, - 4598262871476403630U, 4606884969294623682U, - 13830257006149399490U, 4598262871476403630U, - 4606692493141721470U, 4599531889160152938U, - 13822903926014928746U, 4606692493141721470U, - 4601573027631668967U, 4606269759522929756U, - 13829641796377705564U, 4601573027631668967U, - 4604922840319727473U, 4604141730443515286U, - 13827513767298291094U, 4604922840319727473U, - 4588998070480937184U, 4607165468267934125U, - 13830537505122709933U, 4588998070480937184U, - 4607149205763218185U, 4590846768565625881U, - 13814218805420401689U, 4607149205763218185U, - 4603974338538572089U, 4605067233569943231U, - 13828439270424719039U, 4603974338538572089U, - 4606170366472647579U, 4601967947786150793U, - 13825339984640926601U, 4606170366472647579U, - 
4599112075441176914U, 4606761837001494797U, - 13830133873856270605U, 4599112075441176914U, - 4606826008603986804U, 4598688984595225406U, - 13822061021450001214U, 4606826008603986804U, - 4602357870542944470U, 4606066157444814153U, - 13829438194299589961U, 4602357870542944470U, - 4605207475328619533U, 4603803453461190356U, - 13827175490315966164U, 4605207475328619533U, - 4592606767730311893U, 4607127537664763515U, - 13830499574519539323U, 4592606767730311893U, - 4607030246558998647U, 4595762727260045105U, - 13819134764114820913U, 4607030246558998647U, - 4603270878689749849U, 4605602459698789090U, - 13828974496553564898U, 4603270878689749849U, - 4605725276488455441U, 4603087070374583113U, - 13826459107229358921U, 4605725276488455441U, - 4596629994023683153U, 4606987119037722413U, - 13830359155892498221U, 4596629994023683153U, - 4606538458821337243U, 4600360675823176935U, - 13823732712677952743U, 4606538458821337243U, - 4600769149537129431U, 4606453861145241227U, - 13829825898000017035U, 4600769149537129431U, - 4604621949701367983U, 4604465633578481725U, - 13827837670433257533U, 4604621949701367983U, - 4578227681973159812U, 4607181740574479067U, - 13830553777429254875U, 4578227681973159812U, - 4607182249242036882U, 4573724215515480177U, - 13797096252370255985U, 4607182249242036882U, - 4604505071555817232U, 4604583231088591477U, - 13827955267943367285U, 4604505071555817232U, - 4606475480113671417U, 4600667422348321968U, - 13824039459203097776U, 4606475480113671417U, - 4600463181646572228U, 4606517779747998088U, - 13829889816602773896U, 4600463181646572228U, - 4606998399608725124U, 4596413578358834022U, - 13819785615213609830U, 4606998399608725124U, - 4603133304188877240U, 4605694995810664660U, - 13829067032665440468U, 4603133304188877240U, - 4605633586259814045U, 4603225210076562971U, - 13826597246931338779U, 4605633586259814045U, - 4595979936813835462U, 4607019963775302583U, - 13830392000630078391U, 4595979936813835462U, - 4607133460805585796U, 4592167175087283203U, - 
13815539211942059011U, 4607133460805585796U, - 4603846496621587377U, 4605172808754305228U, - 13828544845609081036U, 4603846496621587377U, - 4606092657816072624U, 4602260871257280788U, - 13825632908112056596U, 4606092657816072624U, - 4598795050632330097U, 4606810452769876110U, - 13830182489624651918U, 4598795050632330097U, - 4606778366364612594U, 4599006600037663623U, - 13822378636892439431U, 4606778366364612594U, - 4602065906208722008U, 4606144763310860551U, - 13829516800165636359U, 4602065906208722008U, - 4605102686554936490U, 4603931940768740167U, - 13827303977623515975U, 4605102686554936490U, - 4591287158938884897U, 4607144295058764886U, - 13830516331913540694U, 4591287158938884897U, - 4607168688050493276U, 4588115294056142819U, - 13811487330910918627U, 4607168688050493276U, - 4604183020748362039U, 4604886103475043762U, - 13828258140329819570U, 4604183020748362039U, - 4606293848208650998U, 4601473544562720001U, - 13824845581417495809U, 4606293848208650998U, - 4599636300858866724U, 4606674353838411301U, - 13830046390693187109U, 4599636300858866724U, - 4606898891031025132U, 4598136582470364665U, - 13821508619325140473U, 4606898891031025132U, - 4602758354025980442U, 4605929219593405673U, - 13829301256448181481U, 4602758354025980442U, - 4605376811039722786U, 4603585091850767959U, - 13826957128705543767U, 4605376811039722786U, - 4594235767444503503U, 4607092871118901179U, - 13830464907973676987U, 4594235767444503503U, - 4607076652372832968U, 4594673119063280916U, - 13818045155918056724U, 4607076652372832968U, - 4603496309891590679U, 4605442656228245717U, - 13828814693083021525U, 4603496309891590679U, - 4605872393621214213U, 4602853162432841185U, - 13826225199287616993U, 4605872393621214213U, - 4597707695679609371U, 4606925748668145757U, - 13830297785522921565U, 4597707695679609371U, - 4606637115963965612U, 4599844446633109139U, - 13823216483487884947U, 4606637115963965612U, - 4601273700967202825U, 4606341107699334546U, - 13829713144554110354U, 4601273700967202825U, - 
4604811873195349477U, 4604264921241055824U, - 13827636958095831632U, 4604811873195349477U, - 4586348876009622851U, 4607174111710118367U, - 13830546148564894175U, 4586348876009622851U, - 4607178180169683960U, 4584498631466405633U, - 13807870668321181441U, 4607178180169683960U, - 4604345904647073908U, 4604736643460027021U, - 13828108680314802829U, 4604345904647073908U, - 4606387137437298591U, 4601072712526242277U, - 13824444749381018085U, 4606387137437298591U, - 4600051662802353687U, 4606598603759044570U, - 13829970640613820378U, 4600051662802353687U, - 4606951288507767453U, 4597277522845151878U, - 13820649559699927686U, 4606951288507767453U, - 4602947266358709886U, 4605814408482919348U, - 13829186445337695156U, 4602947266358709886U, - 4605507406967535927U, 4603406726595779752U, - 13826778763450555560U, 4605507406967535927U, - 4595109641634432498U, 4607059093103722971U, - 13830431129958498779U, 4595109641634432498U, - 4607107746899444102U, 4593797652641645341U, - 13817169689496421149U, 4607107746899444102U, - 4603673059103075106U, 4605309881318010327U, - 13828681918172786135U, 4603673059103075106U, - 4605984877841711338U, 4602646891659203088U, - 13826018928513978896U, 4605984877841711338U, - 4598369669086960528U, 4606870719641066940U, - 13830242756495842748U, 4598369669086960528U, - 4606710311774494716U, 4599427256825614420U, - 13822799293680390228U, 4606710311774494716U, - 4601672213217083403U, 4606245366082353408U, - 13829617402937129216U, 4601672213217083403U, - 4604959323120302796U, 4604100215502905499U, - 13827472252357681307U, 4604959323120302796U, - 4589524267239410099U, 4607161910007591876U, - 13830533946862367684U, 4589524267239410099U, - 4607153778602162496U, 4590406145430462614U, - 13813778182285238422U, 4607153778602162496U, - 4604016517974851588U, 4605031521104517324U, - 13828403557959293132U, 4604016517974851588U, - 4606195668621671667U, 4601869677011524443U, - 13825241713866300251U, 4606195668621671667U, - 4599217346014614711U, 4606744984357082948U, - 
13830117021211858756U, 4599217346014614711U, - 4606841238740778884U, 4598582729657176439U, - 13821954766511952247U, 4606841238740778884U, - 4602454542796181607U, 4606039359984203741U, - 13829411396838979549U, 4602454542796181607U, - 4605241877142478242U, 4603760198400967492U, - 13827132235255743300U, 4605241877142478242U, - 4593046061348462537U, 4607121277474223905U, - 13830493314328999713U, 4593046061348462537U, - 4607040195955932526U, 4595545269419264690U, - 13818917306274040498U, 4607040195955932526U, - 4603316355454250015U, 4605571053506370248U, - 13828943090361146056U, 4603316355454250015U, - 4605755272910869620U, 4603040651631881451U, - 13826412688486657259U, 4605755272910869620U, - 4596846128749438754U, 4606975506703684317U, - 13830347543558460125U, 4596846128749438754U, - 4606558823023444576U, 4600257918160607478U, - 13823629955015383286U, 4606558823023444576U, - 4600870609507958271U, 4606431930490633905U, - 13829803967345409713U, 4600870609507958271U, - 4604660425598397818U, 4604425958770613225U, - 13827797995625389033U, 4604660425598397818U, - 4580962600092897021U, 4607180892816495009U, - 13830552929671270817U, 4580962600092897021U, - 4607180892816495009U, 4580962600092897021U, - 13804334636947672829U, 4607180892816495009U, - 4604425958770613225U, 4604660425598397818U, - 13828032462453173626U, 4604425958770613225U, - 4606431930490633905U, 4600870609507958271U, - 13824242646362734079U, 4606431930490633905U, - 4600257918160607478U, 4606558823023444576U, - 13829930859878220384U, 4600257918160607478U, - 4606975506703684317U, 4596846128749438754U, - 13820218165604214562U, 4606975506703684317U, - 4603040651631881451U, 4605755272910869620U, - 13829127309765645428U, 4603040651631881451U, - 4605571053506370248U, 4603316355454250015U, - 13826688392309025823U, 4605571053506370248U, - 4595545269419264690U, 4607040195955932526U, - 13830412232810708334U, 4595545269419264690U, - 4607121277474223905U, 4593046061348462537U, - 13816418098203238345U, 4607121277474223905U, - 
4603760198400967492U, 4605241877142478242U, - 13828613913997254050U, 4603760198400967492U, - 4606039359984203741U, 4602454542796181607U, - 13825826579650957415U, 4606039359984203741U, - 4598582729657176439U, 4606841238740778884U, - 13830213275595554692U, 4598582729657176439U, - 4606744984357082948U, 4599217346014614711U, - 13822589382869390519U, 4606744984357082948U, - 4601869677011524443U, 4606195668621671667U, - 13829567705476447475U, 4601869677011524443U, - 4605031521104517324U, 4604016517974851588U, - 13827388554829627396U, 4605031521104517324U, - 4590406145430462614U, 4607153778602162496U, - 13830525815456938304U, 4590406145430462614U, - 4607161910007591876U, 4589524267239410099U, - 13812896304094185907U, 4607161910007591876U, - 4604100215502905499U, 4604959323120302796U, - 13828331359975078604U, 4604100215502905499U, - 4606245366082353408U, 4601672213217083403U, - 13825044250071859211U, 4606245366082353408U, - 4599427256825614420U, 4606710311774494716U, - 13830082348629270524U, 4599427256825614420U, - 4606870719641066940U, 4598369669086960528U, - 13821741705941736336U, 4606870719641066940U, - 4602646891659203088U, 4605984877841711338U, - 13829356914696487146U, 4602646891659203088U, - 4605309881318010327U, 4603673059103075106U, - 13827045095957850914U, 4605309881318010327U, - 4593797652641645341U, 4607107746899444102U, - 13830479783754219910U, 4593797652641645341U, - 4607059093103722971U, 4595109641634432498U, - 13818481678489208306U, 4607059093103722971U, - 4603406726595779752U, 4605507406967535927U, - 13828879443822311735U, 4603406726595779752U, - 4605814408482919348U, 4602947266358709886U, - 13826319303213485694U, 4605814408482919348U, - 4597277522845151878U, 4606951288507767453U, - 13830323325362543261U, 4597277522845151878U, - 4606598603759044570U, 4600051662802353687U, - 13823423699657129495U, 4606598603759044570U, - 4601072712526242277U, 4606387137437298591U, - 13829759174292074399U, 4601072712526242277U, - 4604736643460027021U, 4604345904647073908U, - 
13827717941501849716U, 4604736643460027021U, - 4584498631466405633U, 4607178180169683960U, - 13830550217024459768U, 4584498631466405633U, - 4607174111710118367U, 4586348876009622851U, - 13809720912864398659U, 4607174111710118367U, - 4604264921241055824U, 4604811873195349477U, - 13828183910050125285U, 4604264921241055824U, - 4606341107699334546U, 4601273700967202825U, - 13824645737821978633U, 4606341107699334546U, - 4599844446633109139U, 4606637115963965612U, - 13830009152818741420U, 4599844446633109139U, - 4606925748668145757U, 4597707695679609371U, - 13821079732534385179U, 4606925748668145757U, - 4602853162432841185U, 4605872393621214213U, - 13829244430475990021U, 4602853162432841185U, - 4605442656228245717U, 4603496309891590679U, - 13826868346746366487U, 4605442656228245717U, - 4594673119063280916U, 4607076652372832968U, - 13830448689227608776U, 4594673119063280916U, - 4607092871118901179U, 4594235767444503503U, - 13817607804299279311U, 4607092871118901179U, - 4603585091850767959U, 4605376811039722786U, - 13828748847894498594U, 4603585091850767959U, - 4605929219593405673U, 4602758354025980442U, - 13826130390880756250U, 4605929219593405673U, - 4598136582470364665U, 4606898891031025132U, - 13830270927885800940U, 4598136582470364665U, - 4606674353838411301U, 4599636300858866724U, - 13823008337713642532U, 4606674353838411301U, - 4601473544562720001U, 4606293848208650998U, - 13829665885063426806U, 4601473544562720001U, - 4604886103475043762U, 4604183020748362039U, - 13827555057603137847U, 4604886103475043762U, - 4588115294056142819U, 4607168688050493276U, - 13830540724905269084U, 4588115294056142819U, - 4607144295058764886U, 4591287158938884897U, - 13814659195793660705U, 4607144295058764886U, - 4603931940768740167U, 4605102686554936490U, - 13828474723409712298U, 4603931940768740167U, - 4606144763310860551U, 4602065906208722008U, - 13825437943063497816U, 4606144763310860551U, - 4599006600037663623U, 4606778366364612594U, - 13830150403219388402U, 4599006600037663623U, - 
4606810452769876110U, 4598795050632330097U, - 13822167087487105905U, 4606810452769876110U, - 4602260871257280788U, 4606092657816072624U, - 13829464694670848432U, 4602260871257280788U, - 4605172808754305228U, 4603846496621587377U, - 13827218533476363185U, 4605172808754305228U, - 4592167175087283203U, 4607133460805585796U, - 13830505497660361604U, 4592167175087283203U, - 4607019963775302583U, 4595979936813835462U, - 13819351973668611270U, 4607019963775302583U, - 4603225210076562971U, 4605633586259814045U, - 13829005623114589853U, 4603225210076562971U, - 4605694995810664660U, 4603133304188877240U, - 13826505341043653048U, 4605694995810664660U, - 4596413578358834022U, 4606998399608725124U, - 13830370436463500932U, 4596413578358834022U, - 4606517779747998088U, 4600463181646572228U, - 13823835218501348036U, 4606517779747998088U, - 4600667422348321968U, 4606475480113671417U, - 13829847516968447225U, 4600667422348321968U, - 4604583231088591477U, 4604505071555817232U, - 13827877108410593040U, 4604583231088591477U, - 4573724215515480177U, 4607182249242036882U, - 13830554286096812690U, 4573724215515480177U, - 4607182376410422530U, 4569220649180767418U, - 13792592686035543226U, 4607182376410422530U, - 4604524701268679793U, 4604563781218984604U, - 13827935818073760412U, 4604524701268679793U, - 4606486172460753999U, 4600616459743653188U, - 13823988496598428996U, 4606486172460753999U, - 4600514338912178239U, 4606507322377452870U, - 13829879359232228678U, 4600514338912178239U, - 4607003915349878877U, 4596305267720071930U, - 13819677304574847738U, 4607003915349878877U, - 4603156351203636159U, 4605679749231851918U, - 13829051786086627726U, 4603156351203636159U, - 4605649044311923410U, 4603202304363743346U, - 13826574341218519154U, 4605649044311923410U, - 4596088445927168004U, 4607014697483910382U, - 13830386734338686190U, 4596088445927168004U, - 4607136295912168606U, 4591947271803021404U, - 13815319308657797212U, 4607136295912168606U, - 4603867938232615808U, 4605155376589456981U, - 
13828527413444232789U, 4603867938232615808U, - 4606105796280968177U, 4602212250118051877U, - 13825584286972827685U, 4606105796280968177U, - 4598848011564831930U, 4606802552898869248U, - 13830174589753645056U, 4598848011564831930U, - 4606786509620734768U, 4598953786765296928U, - 13822325823620072736U, 4606786509620734768U, - 4602114767134999006U, 4606131849150971908U, - 13829503886005747716U, 4602114767134999006U, - 4605120315324767624U, 4603910660507251362U, - 13827282697362027170U, 4605120315324767624U, - 4591507261658050721U, 4607141713064252300U, - 13830513749919028108U, 4591507261658050721U, - 4607170170974224083U, 4587673791460508439U, - 13811045828315284247U, 4607170170974224083U, - 4604203581176243359U, 4604867640218014515U, - 13828239677072790323U, 4604203581176243359U, - 4606305777984577632U, 4601423692641949331U, - 13824795729496725139U, 4606305777984577632U, - 4599688422741010356U, 4606665164148251002U, - 13830037201003026810U, 4599688422741010356U, - 4606905728766014348U, 4598029484874872834U, - 13821401521729648642U, 4606905728766014348U, - 4602782121393764535U, 4605915122243179241U, - 13829287159097955049U, 4602782121393764535U, - 4605393374401988274U, 4603562972219549215U, - 13826935009074325023U, 4605393374401988274U, - 4594345179472540681U, 4607088942243446236U, - 13830460979098222044U, 4594345179472540681U, - 4607080832832247697U, 4594563856311064231U, - 13817935893165840039U, 4607080832832247697U, - 4603518581031047189U, 4605426297151190466U, - 13828798334005966274U, 4603518581031047189U, - 4605886709123365959U, 4602829525820289164U, - 13826201562675064972U, 4605886709123365959U, - 4597815040470278984U, 4606919157647773535U, - 13830291194502549343U, 4597815040470278984U, - 4606646545123403481U, 4599792496117920694U, - 13823164532972696502U, 4606646545123403481U, - 4601323770373937522U, 4606329407841126011U, - 13829701444695901819U, 4601323770373937522U, - 4604830524903495634U, 4604244531615310815U, - 13827616568470086623U, 4604830524903495634U, - 
4586790578280679046U, 4607172882816799076U, - 13830544919671574884U, 4586790578280679046U, - 4607178985458280057U, 4583614727651146525U, - 13806986764505922333U, 4607178985458280057U, - 4604366005771528720U, 4604717681185626434U, - 13828089718040402242U, 4604366005771528720U, - 4606398451906509788U, 4601022290077223616U, - 13824394326931999424U, 4606398451906509788U, - 4600103317933788342U, 4606588777269136769U, - 13829960814123912577U, 4600103317933788342U, - 4606957467106717424U, 4597169786279785693U, - 13820541823134561501U, 4606957467106717424U, - 4602970680601913687U, 4605799732098147061U, - 13829171768952922869U, 4602970680601913687U, - 4605523422498301790U, 4603384207141321914U, - 13826756243996097722U, 4605523422498301790U, - 4595218635031890910U, 4607054494135176056U, - 13830426530989951864U, 4595218635031890910U, - 4607111255739239816U, 4593688012422887515U, - 13817060049277663323U, 4607111255739239816U, - 4603694922063032361U, 4605292980606880364U, - 13828665017461656172U, 4603694922063032361U, - 4605998608960791335U, 4602598930031891166U, - 13825970966886666974U, 4605998608960791335U, - 4598423001813699022U, 4606863472012527185U, - 13830235508867302993U, 4598423001813699022U, - 4606719100629313491U, 4599374859150636784U, - 13822746896005412592U, 4606719100629313491U, - 4601721693286060937U, 4606233055365547081U, - 13829605092220322889U, 4601721693286060937U, - 4604977468824438271U, 4604079374282302598U, - 13827451411137078406U, 4604977468824438271U, - 4589744810590291021U, 4607160003989618959U, - 13830532040844394767U, 4589744810590291021U, - 4607155938267770208U, 4590185751760970393U, - 13813557788615746201U, 4607155938267770208U, - 4604037525321326463U, 4605013567986435066U, - 13828385604841210874U, 4604037525321326463U, - 4606208206518262803U, 4601820425647934753U, - 13825192462502710561U, 4606208206518262803U, - 4599269903251194481U, 4606736437002195879U, - 13830108473856971687U, 4599269903251194481U, - 4606848731493011465U, 4598529532600161144U, - 
13821901569454936952U, 4606848731493011465U, - 4602502755147763107U, 4606025850160239809U, - 13829397887015015617U, 4602502755147763107U, - 4605258978359093269U, 4603738491917026584U, - 13827110528771802392U, 4605258978359093269U, - 4593265590854265407U, 4607118021058468598U, - 13830490057913244406U, 4593265590854265407U, - 4607045045516813836U, 4595436449949385485U, - 13818808486804161293U, 4607045045516813836U, - 4603339021357904144U, 4605555245917486022U, - 13828927282772261830U, 4603339021357904144U, - 4605770164172969910U, 4603017373458244943U, - 13826389410313020751U, 4605770164172969910U, - 4596954088216812973U, 4606969576261663845U, - 13830341613116439653U, 4596954088216812973U, - 4606568886807728474U, 4600206446098256018U, - 13823578482953031826U, 4606568886807728474U, - 4600921238092511730U, 4606420848538580260U, - 13829792885393356068U, 4600921238092511730U, - 4604679572075463103U, 4604406033021674239U, - 13827778069876450047U, 4604679572075463103U, - 4581846703643734566U, 4607180341788068727U, - 13830552378642844535U, 4581846703643734566U, - 4607181359080094673U, 4579996072175835083U, - 13803368109030610891U, 4607181359080094673U, - 4604445825685214043U, 4604641218080103285U, - 13828013254934879093U, 4604445825685214043U, - 4606442934727379583U, 4600819913163773071U, - 13824191950018548879U, 4606442934727379583U, - 4600309328230211502U, 4606548680329491866U, - 13829920717184267674U, 4600309328230211502U, - 4606981354314050484U, 4596738097012783531U, - 13820110133867559339U, 4606981354314050484U, - 4603063884010218172U, 4605740310302420207U, - 13829112347157196015U, 4603063884010218172U, - 4605586791482848547U, 4603293641160266722U, - 13826665678015042530U, 4605586791482848547U, - 4595654028864046335U, 4607035262954517034U, - 13830407299809292842U, 4595654028864046335U, - 4607124449686274900U, 4592826452951465409U, - 13816198489806241217U, 4607124449686274900U, - 4603781852316960384U, 4605224709411790590U, - 13828596746266566398U, 4603781852316960384U, - 
4606052795787882823U, 4602406247776385022U, - 13825778284631160830U, 4606052795787882823U, - 4598635880488956483U, 4606833664420673202U, - 13830205701275449010U, 4598635880488956483U, - 4606753451050079834U, 4599164736579548843U, - 13822536773434324651U, 4606753451050079834U, - 4601918851211878557U, 4606183055233559255U, - 13829555092088335063U, 4601918851211878557U, - 4605049409688478101U, 4603995455647851249U, - 13827367492502627057U, 4605049409688478101U, - 4590626485056654602U, 4607151534426937478U, - 13830523571281713286U, 4590626485056654602U, - 4607163731439411601U, 4589303678145802340U, - 13812675715000578148U, 4607163731439411601U, - 4604121000955189926U, 4604941113561600762U, - 13828313150416376570U, 4604121000955189926U, - 4606257600839867033U, 4601622657843474729U, - 13824994694698250537U, 4606257600839867033U, - 4599479600326345459U, 4606701442584137310U, - 13830073479438913118U, 4599479600326345459U, - 4606877885424248132U, 4598316292140394014U, - 13821688328995169822U, 4606877885424248132U, - 4602686793990243041U, 4605971073215153165U, - 13829343110069928973U, 4602686793990243041U, - 4605326714874986465U, 4603651144395358093U, - 13827023181250133901U, 4605326714874986465U, - 4593907249284540294U, 4607104153983298999U, - 13830476190838074807U, 4593907249284540294U, - 4607063608453868552U, 4595000592312171144U, - 13818372629166946952U, 4607063608453868552U, - 4603429196809300824U, 4605491322423429598U, - 13828863359278205406U, 4603429196809300824U, - 4605829012964735987U, 4602923807199184054U, - 13826295844053959862U, 4605829012964735987U, - 4597385183080791534U, 4606945027305114062U, - 13830317064159889870U, 4597385183080791534U, - 4606608350964852124U, 4599999947619525579U, - 13823371984474301387U, 4606608350964852124U, - 4601123065313358619U, 4606375745674388705U, - 13829747782529164513U, 4601123065313358619U, - 4604755543975806820U, 4604325745441780828U, - 13827697782296556636U, 4604755543975806820U, - 4585023436363055487U, 4607177290141793710U, - 
13830549326996569518U, 4585023436363055487U, - 4607175255902437396U, 4585907115494236537U, - 13809279152349012345U, 4607175255902437396U, - 4604285253548209224U, 4604793159020491611U, - 13828165195875267419U, 4604285253548209224U, - 4606352730697093817U, 4601223560006786057U, - 13824595596861561865U, 4606352730697093817U, - 4599896339047301634U, 4606627607157935956U, - 13829999644012711764U, 4599896339047301634U, - 4606932257325205256U, 4597600270510262682U, - 13820972307365038490U, 4606932257325205256U, - 4602876755014813164U, 4605858005670328613U, - 13829230042525104421U, 4602876755014813164U, - 4605458946901419122U, 4603473988668005304U, - 13826846025522781112U, 4605458946901419122U, - 4594782329999411347U, 4607072388129742377U, - 13830444424984518185U, 4594782329999411347U, - 4607096716058023245U, 4594126307716900071U, - 13817498344571675879U, 4607096716058023245U, - 4603607160562208225U, 4605360179893335444U, - 13828732216748111252U, 4603607160562208225U, - 4605943243960030558U, 4602734543519989142U, - 13826106580374764950U, 4605943243960030558U, - 4598209407597805010U, 4606891971185517504U, - 13830264008040293312U, 4598209407597805010U, - 4606683463531482757U, 4599584122834874440U, - 13822956159689650248U, 4606683463531482757U, - 4601523323048804569U, 4606281842017099424U, - 13829653878871875232U, 4601523323048804569U, - 4604904503566677638U, 4604162403772767740U, - 13827534440627543548U, 4604904503566677638U, - 4588556721781247689U, 4607167120476811757U, - 13830539157331587565U, 4588556721781247689U, - 4607146792632922887U, 4591066993883984169U, - 13814439030738759977U, 4607146792632922887U, - 4603953166845776383U, 4605084992581147553U, - 13828457029435923361U, 4603953166845776383U, - 4606157602458368090U, 4602016966272225497U, - 13825389003127001305U, 4606157602458368090U, - 4599059363095165615U, 4606770142132396069U, - 13830142178987171877U, 4599059363095165615U, - 4606818271362779153U, 4598742041476147134U, - 13822114078330922942U, 4606818271362779153U, - 
4602309411551204896U, 4606079444829232727U, - 13829451481684008535U, 4602309411551204896U, - 4605190175055178825U, 4603825001630339212U, - 13827197038485115020U, 4605190175055178825U, - 4592387007752762956U, 4607130541380624519U, - 13830502578235400327U, 4592387007752762956U, - 4607025146816593591U, 4595871363584150300U, - 13819243400438926108U, 4607025146816593591U, - 4603248068256948438U, 4605618058006716661U, - 13828990094861492469U, 4603248068256948438U, - 4605710171610479304U, 4603110210506737381U, - 13826482247361513189U, 4605710171610479304U, - 4596521820799644122U, 4606992800820440327U, - 13830364837675216135U, 4596521820799644122U, - 4606528158595189433U, 4600411960456200676U, - 13823783997310976484U, 4606528158595189433U, - 4600718319105833937U, 4606464709641375231U, - 13829836746496151039U, 4600718319105833937U, - 4604602620643553229U, 4604485382263976838U, - 13827857419118752646U, 4604602620643553229U, - 4576459225186735875U, 4607182037296057423U, - 13830554074150833231U, 4576459225186735875U, - 4607182037296057423U, 4576459225186735875U, - 13799831262041511683U, 4607182037296057423U, - 4604485382263976838U, 4604602620643553229U, - 13827974657498329037U, 4604485382263976838U, - 4606464709641375231U, 4600718319105833937U, - 13824090355960609745U, 4606464709641375231U, - 4600411960456200676U, 4606528158595189433U, - 13829900195449965241U, 4600411960456200676U, - 4606992800820440327U, 4596521820799644122U, - 13819893857654419930U, 4606992800820440327U, - 4603110210506737381U, 4605710171610479304U, - 13829082208465255112U, 4603110210506737381U, - 4605618058006716661U, 4603248068256948438U, - 13826620105111724246U, 4605618058006716661U, - 4595871363584150300U, 4607025146816593591U, - 13830397183671369399U, 4595871363584150300U, - 4607130541380624519U, 4592387007752762956U, - 13815759044607538764U, 4607130541380624519U, - 4603825001630339212U, 4605190175055178825U, - 13828562211909954633U, 4603825001630339212U, - 4606079444829232727U, 4602309411551204896U, - 
13825681448405980704U, 4606079444829232727U, - 4598742041476147134U, 4606818271362779153U, - 13830190308217554961U, 4598742041476147134U, - 4606770142132396069U, 4599059363095165615U, - 13822431399949941423U, 4606770142132396069U, - 4602016966272225497U, 4606157602458368090U, - 13829529639313143898U, 4602016966272225497U, - 4605084992581147553U, 4603953166845776383U, - 13827325203700552191U, 4605084992581147553U, - 4591066993883984169U, 4607146792632922887U, - 13830518829487698695U, 4591066993883984169U, - 4607167120476811757U, 4588556721781247689U, - 13811928758636023497U, 4607167120476811757U, - 4604162403772767740U, 4604904503566677638U, - 13828276540421453446U, 4604162403772767740U, - 4606281842017099424U, 4601523323048804569U, - 13824895359903580377U, 4606281842017099424U, - 4599584122834874440U, 4606683463531482757U, - 13830055500386258565U, 4599584122834874440U, - 4606891971185517504U, 4598209407597805010U, - 13821581444452580818U, 4606891971185517504U, - 4602734543519989142U, 4605943243960030558U, - 13829315280814806366U, 4602734543519989142U, - 4605360179893335444U, 4603607160562208225U, - 13826979197416984033U, 4605360179893335444U, - 4594126307716900071U, 4607096716058023245U, - 13830468752912799053U, 4594126307716900071U, - 4607072388129742377U, 4594782329999411347U, - 13818154366854187155U, 4607072388129742377U, - 4603473988668005304U, 4605458946901419122U, - 13828830983756194930U, 4603473988668005304U, - 4605858005670328613U, 4602876755014813164U, - 13826248791869588972U, 4605858005670328613U, - 4597600270510262682U, 4606932257325205256U, - 13830304294179981064U, 4597600270510262682U, - 4606627607157935956U, 4599896339047301634U, - 13823268375902077442U, 4606627607157935956U, - 4601223560006786057U, 4606352730697093817U, - 13829724767551869625U, 4601223560006786057U, - 4604793159020491611U, 4604285253548209224U, - 13827657290402985032U, 4604793159020491611U, - 4585907115494236537U, 4607175255902437396U, - 13830547292757213204U, 4585907115494236537U, - 
4607177290141793710U, 4585023436363055487U, - 13808395473217831295U, 4607177290141793710U, - 4604325745441780828U, 4604755543975806820U, - 13828127580830582628U, 4604325745441780828U, - 4606375745674388705U, 4601123065313358619U, - 13824495102168134427U, 4606375745674388705U, - 4599999947619525579U, 4606608350964852124U, - 13829980387819627932U, 4599999947619525579U, - 4606945027305114062U, 4597385183080791534U, - 13820757219935567342U, 4606945027305114062U, - 4602923807199184054U, 4605829012964735987U, - 13829201049819511795U, 4602923807199184054U, - 4605491322423429598U, 4603429196809300824U, - 13826801233664076632U, 4605491322423429598U, - 4595000592312171144U, 4607063608453868552U, - 13830435645308644360U, 4595000592312171144U, - 4607104153983298999U, 4593907249284540294U, - 13817279286139316102U, 4607104153983298999U, - 4603651144395358093U, 4605326714874986465U, - 13828698751729762273U, 4603651144395358093U, - 4605971073215153165U, 4602686793990243041U, - 13826058830845018849U, 4605971073215153165U, - 4598316292140394014U, 4606877885424248132U, - 13830249922279023940U, 4598316292140394014U, - 4606701442584137310U, 4599479600326345459U, - 13822851637181121267U, 4606701442584137310U, - 4601622657843474729U, 4606257600839867033U, - 13829629637694642841U, 4601622657843474729U, - 4604941113561600762U, 4604121000955189926U, - 13827493037809965734U, 4604941113561600762U, - 4589303678145802340U, 4607163731439411601U, - 13830535768294187409U, 4589303678145802340U, - 4607151534426937478U, 4590626485056654602U, - 13813998521911430410U, 4607151534426937478U, - 4603995455647851249U, 4605049409688478101U, - 13828421446543253909U, 4603995455647851249U, - 4606183055233559255U, 4601918851211878557U, - 13825290888066654365U, 4606183055233559255U, - 4599164736579548843U, 4606753451050079834U, - 13830125487904855642U, 4599164736579548843U, - 4606833664420673202U, 4598635880488956483U, - 13822007917343732291U, 4606833664420673202U, - 4602406247776385022U, 4606052795787882823U, - 
13829424832642658631U, 4602406247776385022U, - 4605224709411790590U, 4603781852316960384U, - 13827153889171736192U, 4605224709411790590U, - 4592826452951465409U, 4607124449686274900U, - 13830496486541050708U, 4592826452951465409U, - 4607035262954517034U, 4595654028864046335U, - 13819026065718822143U, 4607035262954517034U, - 4603293641160266722U, 4605586791482848547U, - 13828958828337624355U, 4603293641160266722U, - 4605740310302420207U, 4603063884010218172U, - 13826435920864993980U, 4605740310302420207U, - 4596738097012783531U, 4606981354314050484U, - 13830353391168826292U, 4596738097012783531U, - 4606548680329491866U, 4600309328230211502U, - 13823681365084987310U, 4606548680329491866U, - 4600819913163773071U, 4606442934727379583U, - 13829814971582155391U, 4600819913163773071U, - 4604641218080103285U, 4604445825685214043U, - 13827817862539989851U, 4604641218080103285U, - 4579996072175835083U, 4607181359080094673U, - 13830553395934870481U, 4579996072175835083U, - 4607180341788068727U, 4581846703643734566U, - 13805218740498510374U, 4607180341788068727U, - 4604406033021674239U, 4604679572075463103U, - 13828051608930238911U, 4604406033021674239U, - 4606420848538580260U, 4600921238092511730U, - 13824293274947287538U, 4606420848538580260U, - 4600206446098256018U, 4606568886807728474U, - 13829940923662504282U, 4600206446098256018U, - 4606969576261663845U, 4596954088216812973U, - 13820326125071588781U, 4606969576261663845U, - 4603017373458244943U, 4605770164172969910U, - 13829142201027745718U, 4603017373458244943U, - 4605555245917486022U, 4603339021357904144U, - 13826711058212679952U, 4605555245917486022U, - 4595436449949385485U, 4607045045516813836U, - 13830417082371589644U, 4595436449949385485U, - 4607118021058468598U, 4593265590854265407U, - 13816637627709041215U, 4607118021058468598U, - 4603738491917026584U, 4605258978359093269U, - 13828631015213869077U, 4603738491917026584U, - 4606025850160239809U, 4602502755147763107U, - 13825874792002538915U, 4606025850160239809U, - 
4598529532600161144U, 4606848731493011465U, - 13830220768347787273U, 4598529532600161144U, - 4606736437002195879U, 4599269903251194481U, - 13822641940105970289U, 4606736437002195879U, - 4601820425647934753U, 4606208206518262803U, - 13829580243373038611U, 4601820425647934753U, - 4605013567986435066U, 4604037525321326463U, - 13827409562176102271U, 4605013567986435066U, - 4590185751760970393U, 4607155938267770208U, - 13830527975122546016U, 4590185751760970393U, - 4607160003989618959U, 4589744810590291021U, - 13813116847445066829U, 4607160003989618959U, - 4604079374282302598U, 4604977468824438271U, - 13828349505679214079U, 4604079374282302598U, - 4606233055365547081U, 4601721693286060937U, - 13825093730140836745U, 4606233055365547081U, - 4599374859150636784U, 4606719100629313491U, - 13830091137484089299U, 4599374859150636784U, - 4606863472012527185U, 4598423001813699022U, - 13821795038668474830U, 4606863472012527185U, - 4602598930031891166U, 4605998608960791335U, - 13829370645815567143U, 4602598930031891166U, - 4605292980606880364U, 4603694922063032361U, - 13827066958917808169U, 4605292980606880364U, - 4593688012422887515U, 4607111255739239816U, - 13830483292594015624U, 4593688012422887515U, - 4607054494135176056U, 4595218635031890910U, - 13818590671886666718U, 4607054494135176056U, - 4603384207141321914U, 4605523422498301790U, - 13828895459353077598U, 4603384207141321914U, - 4605799732098147061U, 4602970680601913687U, - 13826342717456689495U, 4605799732098147061U, - 4597169786279785693U, 4606957467106717424U, - 13830329503961493232U, 4597169786279785693U, - 4606588777269136769U, 4600103317933788342U, - 13823475354788564150U, 4606588777269136769U, - 4601022290077223616U, 4606398451906509788U, - 13829770488761285596U, 4601022290077223616U, - 4604717681185626434U, 4604366005771528720U, - 13827738042626304528U, 4604717681185626434U, - 4583614727651146525U, 4607178985458280057U, - 13830551022313055865U, 4583614727651146525U, - 4607172882816799076U, 4586790578280679046U, - 
13810162615135454854U, 4607172882816799076U, - 4604244531615310815U, 4604830524903495634U, - 13828202561758271442U, 4604244531615310815U, - 4606329407841126011U, 4601323770373937522U, - 13824695807228713330U, 4606329407841126011U, - 4599792496117920694U, 4606646545123403481U, - 13830018581978179289U, 4599792496117920694U, - 4606919157647773535U, 4597815040470278984U, - 13821187077325054792U, 4606919157647773535U, - 4602829525820289164U, 4605886709123365959U, - 13829258745978141767U, 4602829525820289164U, - 4605426297151190466U, 4603518581031047189U, - 13826890617885822997U, 4605426297151190466U, - 4594563856311064231U, 4607080832832247697U, - 13830452869687023505U, 4594563856311064231U, - 4607088942243446236U, 4594345179472540681U, - 13817717216327316489U, 4607088942243446236U, - 4603562972219549215U, 4605393374401988274U, - 13828765411256764082U, 4603562972219549215U, - 4605915122243179241U, 4602782121393764535U, - 13826154158248540343U, 4605915122243179241U, - 4598029484874872834U, 4606905728766014348U, - 13830277765620790156U, 4598029484874872834U, - 4606665164148251002U, 4599688422741010356U, - 13823060459595786164U, 4606665164148251002U, - 4601423692641949331U, 4606305777984577632U, - 13829677814839353440U, 4601423692641949331U, - 4604867640218014515U, 4604203581176243359U, - 13827575618031019167U, 4604867640218014515U, - 4587673791460508439U, 4607170170974224083U, - 13830542207828999891U, 4587673791460508439U, - 4607141713064252300U, 4591507261658050721U, - 13814879298512826529U, 4607141713064252300U, - 4603910660507251362U, 4605120315324767624U, - 13828492352179543432U, 4603910660507251362U, - 4606131849150971908U, 4602114767134999006U, - 13825486803989774814U, 4606131849150971908U, - 4598953786765296928U, 4606786509620734768U, - 13830158546475510576U, 4598953786765296928U, - 4606802552898869248U, 4598848011564831930U, - 13822220048419607738U, 4606802552898869248U, - 4602212250118051877U, 4606105796280968177U, - 13829477833135743985U, 4602212250118051877U, - 
4605155376589456981U, 4603867938232615808U, - 13827239975087391616U, 4605155376589456981U, - 4591947271803021404U, 4607136295912168606U, - 13830508332766944414U, 4591947271803021404U, - 4607014697483910382U, 4596088445927168004U, - 13819460482781943812U, 4607014697483910382U, - 4603202304363743346U, 4605649044311923410U, - 13829021081166699218U, 4603202304363743346U, - 4605679749231851918U, 4603156351203636159U, - 13826528388058411967U, 4605679749231851918U, - 4596305267720071930U, 4607003915349878877U, - 13830375952204654685U, 4596305267720071930U, - 4606507322377452870U, 4600514338912178239U, - 13823886375766954047U, 4606507322377452870U, - 4600616459743653188U, 4606486172460753999U, - 13829858209315529807U, 4600616459743653188U, - 4604563781218984604U, 4604524701268679793U, - 13827896738123455601U, 4604563781218984604U, - 4569220649180767418U, 4607182376410422530U, - 13830554413265198338U, 4569220649180767418U -}; - -const fpr fpr_p2_tab[] = { - 4611686018427387904U, - 4607182418800017408U, - 4602678819172646912U, - 4598175219545276416U, - 4593671619917905920U, - 4589168020290535424U, - 4584664420663164928U, - 4580160821035794432U, - 4575657221408423936U, - 4571153621781053440U, - 4566650022153682944U -}; - -#elif FALCON_FPNATIVE // yyyFPEMU+0 yyyFPNATIVE+1 - -const fpr fpr_gm_tab[] = { - {0}, {0}, /* unused */ - {-0.000000000000000000000000000}, { 1.000000000000000000000000000}, - { 0.707106781186547524400844362}, { 0.707106781186547524400844362}, - {-0.707106781186547524400844362}, { 0.707106781186547524400844362}, - { 0.923879532511286756128183189}, { 0.382683432365089771728459984}, - {-0.382683432365089771728459984}, { 0.923879532511286756128183189}, - { 0.382683432365089771728459984}, { 0.923879532511286756128183189}, - {-0.923879532511286756128183189}, { 0.382683432365089771728459984}, - { 0.980785280403230449126182236}, { 0.195090322016128267848284868}, - {-0.195090322016128267848284868}, { 0.980785280403230449126182236}, - { 
0.555570233019602224742830814}, { 0.831469612302545237078788378}, - {-0.831469612302545237078788378}, { 0.555570233019602224742830814}, - { 0.831469612302545237078788378}, { 0.555570233019602224742830814}, - {-0.555570233019602224742830814}, { 0.831469612302545237078788378}, - { 0.195090322016128267848284868}, { 0.980785280403230449126182236}, - {-0.980785280403230449126182236}, { 0.195090322016128267848284868}, - { 0.995184726672196886244836953}, { 0.098017140329560601994195564}, - {-0.098017140329560601994195564}, { 0.995184726672196886244836953}, - { 0.634393284163645498215171613}, { 0.773010453362736960810906610}, - {-0.773010453362736960810906610}, { 0.634393284163645498215171613}, - { 0.881921264348355029712756864}, { 0.471396736825997648556387626}, - {-0.471396736825997648556387626}, { 0.881921264348355029712756864}, - { 0.290284677254462367636192376}, { 0.956940335732208864935797887}, - {-0.956940335732208864935797887}, { 0.290284677254462367636192376}, - { 0.956940335732208864935797887}, { 0.290284677254462367636192376}, - {-0.290284677254462367636192376}, { 0.956940335732208864935797887}, - { 0.471396736825997648556387626}, { 0.881921264348355029712756864}, - {-0.881921264348355029712756864}, { 0.471396736825997648556387626}, - { 0.773010453362736960810906610}, { 0.634393284163645498215171613}, - {-0.634393284163645498215171613}, { 0.773010453362736960810906610}, - { 0.098017140329560601994195564}, { 0.995184726672196886244836953}, - {-0.995184726672196886244836953}, { 0.098017140329560601994195564}, - { 0.998795456205172392714771605}, { 0.049067674327418014254954977}, - {-0.049067674327418014254954977}, { 0.998795456205172392714771605}, - { 0.671558954847018400625376850}, { 0.740951125354959091175616897}, - {-0.740951125354959091175616897}, { 0.671558954847018400625376850}, - { 0.903989293123443331586200297}, { 0.427555093430282094320966857}, - {-0.427555093430282094320966857}, { 0.903989293123443331586200297}, - { 0.336889853392220050689253213}, { 
0.941544065183020778412509403}, - {-0.941544065183020778412509403}, { 0.336889853392220050689253213}, - { 0.970031253194543992603984207}, { 0.242980179903263889948274162}, - {-0.242980179903263889948274162}, { 0.970031253194543992603984207}, - { 0.514102744193221726593693839}, { 0.857728610000272069902269984}, - {-0.857728610000272069902269984}, { 0.514102744193221726593693839}, - { 0.803207531480644909806676513}, { 0.595699304492433343467036529}, - {-0.595699304492433343467036529}, { 0.803207531480644909806676513}, - { 0.146730474455361751658850130}, { 0.989176509964780973451673738}, - {-0.989176509964780973451673738}, { 0.146730474455361751658850130}, - { 0.989176509964780973451673738}, { 0.146730474455361751658850130}, - {-0.146730474455361751658850130}, { 0.989176509964780973451673738}, - { 0.595699304492433343467036529}, { 0.803207531480644909806676513}, - {-0.803207531480644909806676513}, { 0.595699304492433343467036529}, - { 0.857728610000272069902269984}, { 0.514102744193221726593693839}, - {-0.514102744193221726593693839}, { 0.857728610000272069902269984}, - { 0.242980179903263889948274162}, { 0.970031253194543992603984207}, - {-0.970031253194543992603984207}, { 0.242980179903263889948274162}, - { 0.941544065183020778412509403}, { 0.336889853392220050689253213}, - {-0.336889853392220050689253213}, { 0.941544065183020778412509403}, - { 0.427555093430282094320966857}, { 0.903989293123443331586200297}, - {-0.903989293123443331586200297}, { 0.427555093430282094320966857}, - { 0.740951125354959091175616897}, { 0.671558954847018400625376850}, - {-0.671558954847018400625376850}, { 0.740951125354959091175616897}, - { 0.049067674327418014254954977}, { 0.998795456205172392714771605}, - {-0.998795456205172392714771605}, { 0.049067674327418014254954977}, - { 0.999698818696204220115765650}, { 0.024541228522912288031734529}, - {-0.024541228522912288031734529}, { 0.999698818696204220115765650}, - { 0.689540544737066924616730630}, { 0.724247082951466920941069243}, - 
{-0.724247082951466920941069243}, { 0.689540544737066924616730630}, - { 0.914209755703530654635014829}, { 0.405241314004989870908481306}, - {-0.405241314004989870908481306}, { 0.914209755703530654635014829}, - { 0.359895036534988148775104572}, { 0.932992798834738887711660256}, - {-0.932992798834738887711660256}, { 0.359895036534988148775104572}, - { 0.975702130038528544460395766}, { 0.219101240156869797227737547}, - {-0.219101240156869797227737547}, { 0.975702130038528544460395766}, - { 0.534997619887097210663076905}, { 0.844853565249707073259571205}, - {-0.844853565249707073259571205}, { 0.534997619887097210663076905}, - { 0.817584813151583696504920884}, { 0.575808191417845300745972454}, - {-0.575808191417845300745972454}, { 0.817584813151583696504920884}, - { 0.170961888760301226363642357}, { 0.985277642388941244774018433}, - {-0.985277642388941244774018433}, { 0.170961888760301226363642357}, - { 0.992479534598709998156767252}, { 0.122410675199216198498704474}, - {-0.122410675199216198498704474}, { 0.992479534598709998156767252}, - { 0.615231590580626845484913563}, { 0.788346427626606262009164705}, - {-0.788346427626606262009164705}, { 0.615231590580626845484913563}, - { 0.870086991108711418652292404}, { 0.492898192229784036873026689}, - {-0.492898192229784036873026689}, { 0.870086991108711418652292404}, - { 0.266712757474898386325286515}, { 0.963776065795439866686464356}, - {-0.963776065795439866686464356}, { 0.266712757474898386325286515}, - { 0.949528180593036667195936074}, { 0.313681740398891476656478846}, - {-0.313681740398891476656478846}, { 0.949528180593036667195936074}, - { 0.449611329654606600046294579}, { 0.893224301195515320342416447}, - {-0.893224301195515320342416447}, { 0.449611329654606600046294579}, - { 0.757208846506484547575464054}, { 0.653172842953776764084203014}, - {-0.653172842953776764084203014}, { 0.757208846506484547575464054}, - { 0.073564563599667423529465622}, { 0.997290456678690216135597140}, - {-0.997290456678690216135597140}, { 
0.073564563599667423529465622}, - { 0.997290456678690216135597140}, { 0.073564563599667423529465622}, - {-0.073564563599667423529465622}, { 0.997290456678690216135597140}, - { 0.653172842953776764084203014}, { 0.757208846506484547575464054}, - {-0.757208846506484547575464054}, { 0.653172842953776764084203014}, - { 0.893224301195515320342416447}, { 0.449611329654606600046294579}, - {-0.449611329654606600046294579}, { 0.893224301195515320342416447}, - { 0.313681740398891476656478846}, { 0.949528180593036667195936074}, - {-0.949528180593036667195936074}, { 0.313681740398891476656478846}, - { 0.963776065795439866686464356}, { 0.266712757474898386325286515}, - {-0.266712757474898386325286515}, { 0.963776065795439866686464356}, - { 0.492898192229784036873026689}, { 0.870086991108711418652292404}, - {-0.870086991108711418652292404}, { 0.492898192229784036873026689}, - { 0.788346427626606262009164705}, { 0.615231590580626845484913563}, - {-0.615231590580626845484913563}, { 0.788346427626606262009164705}, - { 0.122410675199216198498704474}, { 0.992479534598709998156767252}, - {-0.992479534598709998156767252}, { 0.122410675199216198498704474}, - { 0.985277642388941244774018433}, { 0.170961888760301226363642357}, - {-0.170961888760301226363642357}, { 0.985277642388941244774018433}, - { 0.575808191417845300745972454}, { 0.817584813151583696504920884}, - {-0.817584813151583696504920884}, { 0.575808191417845300745972454}, - { 0.844853565249707073259571205}, { 0.534997619887097210663076905}, - {-0.534997619887097210663076905}, { 0.844853565249707073259571205}, - { 0.219101240156869797227737547}, { 0.975702130038528544460395766}, - {-0.975702130038528544460395766}, { 0.219101240156869797227737547}, - { 0.932992798834738887711660256}, { 0.359895036534988148775104572}, - {-0.359895036534988148775104572}, { 0.932992798834738887711660256}, - { 0.405241314004989870908481306}, { 0.914209755703530654635014829}, - {-0.914209755703530654635014829}, { 0.405241314004989870908481306}, - { 
0.724247082951466920941069243}, { 0.689540544737066924616730630}, - {-0.689540544737066924616730630}, { 0.724247082951466920941069243}, - { 0.024541228522912288031734529}, { 0.999698818696204220115765650}, - {-0.999698818696204220115765650}, { 0.024541228522912288031734529}, - { 0.999924701839144540921646491}, { 0.012271538285719926079408262}, - {-0.012271538285719926079408262}, { 0.999924701839144540921646491}, - { 0.698376249408972853554813503}, { 0.715730825283818654125532623}, - {-0.715730825283818654125532623}, { 0.698376249408972853554813503}, - { 0.919113851690057743908477789}, { 0.393992040061048108596188661}, - {-0.393992040061048108596188661}, { 0.919113851690057743908477789}, - { 0.371317193951837543411934967}, { 0.928506080473215565937167396}, - {-0.928506080473215565937167396}, { 0.371317193951837543411934967}, - { 0.978317370719627633106240097}, { 0.207111376192218549708116020}, - {-0.207111376192218549708116020}, { 0.978317370719627633106240097}, - { 0.545324988422046422313987347}, { 0.838224705554838043186996856}, - {-0.838224705554838043186996856}, { 0.545324988422046422313987347}, - { 0.824589302785025264474803737}, { 0.565731810783613197389765011}, - {-0.565731810783613197389765011}, { 0.824589302785025264474803737}, - { 0.183039887955140958516532578}, { 0.983105487431216327180301155}, - {-0.983105487431216327180301155}, { 0.183039887955140958516532578}, - { 0.993906970002356041546922813}, { 0.110222207293883058807899140}, - {-0.110222207293883058807899140}, { 0.993906970002356041546922813}, - { 0.624859488142386377084072816}, { 0.780737228572094478301588484}, - {-0.780737228572094478301588484}, { 0.624859488142386377084072816}, - { 0.876070094195406607095844268}, { 0.482183772079122748517344481}, - {-0.482183772079122748517344481}, { 0.876070094195406607095844268}, - { 0.278519689385053105207848526}, { 0.960430519415565811199035138}, - {-0.960430519415565811199035138}, { 0.278519689385053105207848526}, - { 0.953306040354193836916740383}, { 
0.302005949319228067003463232}, - {-0.302005949319228067003463232}, { 0.953306040354193836916740383}, - { 0.460538710958240023633181487}, { 0.887639620402853947760181617}, - {-0.887639620402853947760181617}, { 0.460538710958240023633181487}, - { 0.765167265622458925888815999}, { 0.643831542889791465068086063}, - {-0.643831542889791465068086063}, { 0.765167265622458925888815999}, - { 0.085797312344439890461556332}, { 0.996312612182778012627226190}, - {-0.996312612182778012627226190}, { 0.085797312344439890461556332}, - { 0.998118112900149207125155861}, { 0.061320736302208577782614593}, - {-0.061320736302208577782614593}, { 0.998118112900149207125155861}, - { 0.662415777590171761113069817}, { 0.749136394523459325469203257}, - {-0.749136394523459325469203257}, { 0.662415777590171761113069817}, - { 0.898674465693953843041976744}, { 0.438616238538527637647025738}, - {-0.438616238538527637647025738}, { 0.898674465693953843041976744}, - { 0.325310292162262934135954708}, { 0.945607325380521325730945387}, - {-0.945607325380521325730945387}, { 0.325310292162262934135954708}, - { 0.966976471044852109087220226}, { 0.254865659604514571553980779}, - {-0.254865659604514571553980779}, { 0.966976471044852109087220226}, - { 0.503538383725717558691867071}, { 0.863972856121586737918147054}, - {-0.863972856121586737918147054}, { 0.503538383725717558691867071}, - { 0.795836904608883536262791915}, { 0.605511041404325513920626941}, - {-0.605511041404325513920626941}, { 0.795836904608883536262791915}, - { 0.134580708507126186316358409}, { 0.990902635427780025108237011}, - {-0.990902635427780025108237011}, { 0.134580708507126186316358409}, - { 0.987301418157858382399815802}, { 0.158858143333861441684385360}, - {-0.158858143333861441684385360}, { 0.987301418157858382399815802}, - { 0.585797857456438860328080838}, { 0.810457198252594791726703434}, - {-0.810457198252594791726703434}, { 0.585797857456438860328080838}, - { 0.851355193105265142261290312}, { 0.524589682678468906215098464}, - 
{-0.524589682678468906215098464}, { 0.851355193105265142261290312}, - { 0.231058108280671119643236018}, { 0.972939952205560145467720114}, - {-0.972939952205560145467720114}, { 0.231058108280671119643236018}, - { 0.937339011912574923201899593}, { 0.348418680249434568419308588}, - {-0.348418680249434568419308588}, { 0.937339011912574923201899593}, - { 0.416429560097637182562598911}, { 0.909167983090522376563884788}, - {-0.909167983090522376563884788}, { 0.416429560097637182562598911}, - { 0.732654271672412834615546649}, { 0.680600997795453050594430464}, - {-0.680600997795453050594430464}, { 0.732654271672412834615546649}, - { 0.036807222941358832324332691}, { 0.999322384588349500896221011}, - {-0.999322384588349500896221011}, { 0.036807222941358832324332691}, - { 0.999322384588349500896221011}, { 0.036807222941358832324332691}, - {-0.036807222941358832324332691}, { 0.999322384588349500896221011}, - { 0.680600997795453050594430464}, { 0.732654271672412834615546649}, - {-0.732654271672412834615546649}, { 0.680600997795453050594430464}, - { 0.909167983090522376563884788}, { 0.416429560097637182562598911}, - {-0.416429560097637182562598911}, { 0.909167983090522376563884788}, - { 0.348418680249434568419308588}, { 0.937339011912574923201899593}, - {-0.937339011912574923201899593}, { 0.348418680249434568419308588}, - { 0.972939952205560145467720114}, { 0.231058108280671119643236018}, - {-0.231058108280671119643236018}, { 0.972939952205560145467720114}, - { 0.524589682678468906215098464}, { 0.851355193105265142261290312}, - {-0.851355193105265142261290312}, { 0.524589682678468906215098464}, - { 0.810457198252594791726703434}, { 0.585797857456438860328080838}, - {-0.585797857456438860328080838}, { 0.810457198252594791726703434}, - { 0.158858143333861441684385360}, { 0.987301418157858382399815802}, - {-0.987301418157858382399815802}, { 0.158858143333861441684385360}, - { 0.990902635427780025108237011}, { 0.134580708507126186316358409}, - {-0.134580708507126186316358409}, { 
0.990902635427780025108237011}, - { 0.605511041404325513920626941}, { 0.795836904608883536262791915}, - {-0.795836904608883536262791915}, { 0.605511041404325513920626941}, - { 0.863972856121586737918147054}, { 0.503538383725717558691867071}, - {-0.503538383725717558691867071}, { 0.863972856121586737918147054}, - { 0.254865659604514571553980779}, { 0.966976471044852109087220226}, - {-0.966976471044852109087220226}, { 0.254865659604514571553980779}, - { 0.945607325380521325730945387}, { 0.325310292162262934135954708}, - {-0.325310292162262934135954708}, { 0.945607325380521325730945387}, - { 0.438616238538527637647025738}, { 0.898674465693953843041976744}, - {-0.898674465693953843041976744}, { 0.438616238538527637647025738}, - { 0.749136394523459325469203257}, { 0.662415777590171761113069817}, - {-0.662415777590171761113069817}, { 0.749136394523459325469203257}, - { 0.061320736302208577782614593}, { 0.998118112900149207125155861}, - {-0.998118112900149207125155861}, { 0.061320736302208577782614593}, - { 0.996312612182778012627226190}, { 0.085797312344439890461556332}, - {-0.085797312344439890461556332}, { 0.996312612182778012627226190}, - { 0.643831542889791465068086063}, { 0.765167265622458925888815999}, - {-0.765167265622458925888815999}, { 0.643831542889791465068086063}, - { 0.887639620402853947760181617}, { 0.460538710958240023633181487}, - {-0.460538710958240023633181487}, { 0.887639620402853947760181617}, - { 0.302005949319228067003463232}, { 0.953306040354193836916740383}, - {-0.953306040354193836916740383}, { 0.302005949319228067003463232}, - { 0.960430519415565811199035138}, { 0.278519689385053105207848526}, - {-0.278519689385053105207848526}, { 0.960430519415565811199035138}, - { 0.482183772079122748517344481}, { 0.876070094195406607095844268}, - {-0.876070094195406607095844268}, { 0.482183772079122748517344481}, - { 0.780737228572094478301588484}, { 0.624859488142386377084072816}, - {-0.624859488142386377084072816}, { 0.780737228572094478301588484}, - { 
0.110222207293883058807899140}, { 0.993906970002356041546922813}, - {-0.993906970002356041546922813}, { 0.110222207293883058807899140}, - { 0.983105487431216327180301155}, { 0.183039887955140958516532578}, - {-0.183039887955140958516532578}, { 0.983105487431216327180301155}, - { 0.565731810783613197389765011}, { 0.824589302785025264474803737}, - {-0.824589302785025264474803737}, { 0.565731810783613197389765011}, - { 0.838224705554838043186996856}, { 0.545324988422046422313987347}, - {-0.545324988422046422313987347}, { 0.838224705554838043186996856}, - { 0.207111376192218549708116020}, { 0.978317370719627633106240097}, - {-0.978317370719627633106240097}, { 0.207111376192218549708116020}, - { 0.928506080473215565937167396}, { 0.371317193951837543411934967}, - {-0.371317193951837543411934967}, { 0.928506080473215565937167396}, - { 0.393992040061048108596188661}, { 0.919113851690057743908477789}, - {-0.919113851690057743908477789}, { 0.393992040061048108596188661}, - { 0.715730825283818654125532623}, { 0.698376249408972853554813503}, - {-0.698376249408972853554813503}, { 0.715730825283818654125532623}, - { 0.012271538285719926079408262}, { 0.999924701839144540921646491}, - {-0.999924701839144540921646491}, { 0.012271538285719926079408262}, - { 0.999981175282601142656990438}, { 0.006135884649154475359640235}, - {-0.006135884649154475359640235}, { 0.999981175282601142656990438}, - { 0.702754744457225302452914421}, { 0.711432195745216441522130290}, - {-0.711432195745216441522130290}, { 0.702754744457225302452914421}, - { 0.921514039342041943465396332}, { 0.388345046698826291624993541}, - {-0.388345046698826291624993541}, { 0.921514039342041943465396332}, - { 0.377007410216418256726567823}, { 0.926210242138311341974793388}, - {-0.926210242138311341974793388}, { 0.377007410216418256726567823}, - { 0.979569765685440534439326110}, { 0.201104634842091911558443546}, - {-0.201104634842091911558443546}, { 0.979569765685440534439326110}, - { 0.550457972936604802977289893}, { 
0.834862874986380056304401383}, - {-0.834862874986380056304401383}, { 0.550457972936604802977289893}, - { 0.828045045257755752067527592}, { 0.560661576197336023839710223}, - {-0.560661576197336023839710223}, { 0.828045045257755752067527592}, - { 0.189068664149806212754997837}, { 0.981963869109555264072848154}, - {-0.981963869109555264072848154}, { 0.189068664149806212754997837}, - { 0.994564570734255452119106243}, { 0.104121633872054579120943880}, - {-0.104121633872054579120943880}, { 0.994564570734255452119106243}, - { 0.629638238914927025372981341}, { 0.776888465673232450040827983}, - {-0.776888465673232450040827983}, { 0.629638238914927025372981341}, - { 0.879012226428633477831323711}, { 0.476799230063322133342158117}, - {-0.476799230063322133342158117}, { 0.879012226428633477831323711}, - { 0.284407537211271843618310615}, { 0.958703474895871555374645792}, - {-0.958703474895871555374645792}, { 0.284407537211271843618310615}, - { 0.955141168305770721498157712}, { 0.296150888243623824121786128}, - {-0.296150888243623824121786128}, { 0.955141168305770721498157712}, - { 0.465976495767966177902756065}, { 0.884797098430937780104007041}, - {-0.884797098430937780104007041}, { 0.465976495767966177902756065}, - { 0.769103337645579639346626069}, { 0.639124444863775743801488193}, - {-0.639124444863775743801488193}, { 0.769103337645579639346626069}, - { 0.091908956497132728624990979}, { 0.995767414467659793982495643}, - {-0.995767414467659793982495643}, { 0.091908956497132728624990979}, - { 0.998475580573294752208559038}, { 0.055195244349689939809447526}, - {-0.055195244349689939809447526}, { 0.998475580573294752208559038}, - { 0.666999922303637506650154222}, { 0.745057785441465962407907310}, - {-0.745057785441465962407907310}, { 0.666999922303637506650154222}, - { 0.901348847046022014570746093}, { 0.433093818853151968484222638}, - {-0.433093818853151968484222638}, { 0.901348847046022014570746093}, - { 0.331106305759876401737190737}, { 0.943593458161960361495301445}, - 
{-0.943593458161960361495301445}, { 0.331106305759876401737190737}, - { 0.968522094274417316221088329}, { 0.248927605745720168110682816}, - {-0.248927605745720168110682816}, { 0.968522094274417316221088329}, - { 0.508830142543107036931749324}, { 0.860866938637767279344583877}, - {-0.860866938637767279344583877}, { 0.508830142543107036931749324}, - { 0.799537269107905033500246232}, { 0.600616479383868926653875896}, - {-0.600616479383868926653875896}, { 0.799537269107905033500246232}, - { 0.140658239332849230714788846}, { 0.990058210262297105505906464}, - {-0.990058210262297105505906464}, { 0.140658239332849230714788846}, - { 0.988257567730749491404792538}, { 0.152797185258443427720336613}, - {-0.152797185258443427720336613}, { 0.988257567730749491404792538}, - { 0.590759701858874228423887908}, { 0.806847553543799272206514313}, - {-0.806847553543799272206514313}, { 0.590759701858874228423887908}, - { 0.854557988365400520767862276}, { 0.519355990165589587361829932}, - {-0.519355990165589587361829932}, { 0.854557988365400520767862276}, - { 0.237023605994367206867735915}, { 0.971503890986251775537099622}, - {-0.971503890986251775537099622}, { 0.237023605994367206867735915}, - { 0.939459223602189911962669246}, { 0.342660717311994397592781983}, - {-0.342660717311994397592781983}, { 0.939459223602189911962669246}, - { 0.422000270799799685941287941}, { 0.906595704514915365332960588}, - {-0.906595704514915365332960588}, { 0.422000270799799685941287941}, - { 0.736816568877369875090132520}, { 0.676092703575315960360419228}, - {-0.676092703575315960360419228}, { 0.736816568877369875090132520}, - { 0.042938256934940823077124540}, { 0.999077727752645382888781997}, - {-0.999077727752645382888781997}, { 0.042938256934940823077124540}, - { 0.999529417501093163079703322}, { 0.030674803176636625934021028}, - {-0.030674803176636625934021028}, { 0.999529417501093163079703322}, - { 0.685083667772700381362052545}, { 0.728464390448225196492035438}, - {-0.728464390448225196492035438}, { 
0.685083667772700381362052545}, - { 0.911706032005429851404397325}, { 0.410843171057903942183466675}, - {-0.410843171057903942183466675}, { 0.911706032005429851404397325}, - { 0.354163525420490382357395796}, { 0.935183509938947577642207480}, - {-0.935183509938947577642207480}, { 0.354163525420490382357395796}, - { 0.974339382785575860518721668}, { 0.225083911359792835991642120}, - {-0.225083911359792835991642120}, { 0.974339382785575860518721668}, - { 0.529803624686294668216054671}, { 0.848120344803297251279133563}, - {-0.848120344803297251279133563}, { 0.529803624686294668216054671}, - { 0.814036329705948361654516690}, { 0.580813958095764545075595272}, - {-0.580813958095764545075595272}, { 0.814036329705948361654516690}, - { 0.164913120489969921418189113}, { 0.986308097244598647863297524}, - {-0.986308097244598647863297524}, { 0.164913120489969921418189113}, - { 0.991709753669099522860049931}, { 0.128498110793793172624415589}, - {-0.128498110793793172624415589}, { 0.991709753669099522860049931}, - { 0.610382806276309452716352152}, { 0.792106577300212351782342879}, - {-0.792106577300212351782342879}, { 0.610382806276309452716352152}, - { 0.867046245515692651480195629}, { 0.498227666972781852410983869}, - {-0.498227666972781852410983869}, { 0.867046245515692651480195629}, - { 0.260794117915275518280186509}, { 0.965394441697689374550843858}, - {-0.965394441697689374550843858}, { 0.260794117915275518280186509}, - { 0.947585591017741134653387321}, { 0.319502030816015677901518272}, - {-0.319502030816015677901518272}, { 0.947585591017741134653387321}, - { 0.444122144570429231642069418}, { 0.895966249756185155914560282}, - {-0.895966249756185155914560282}, { 0.444122144570429231642069418}, - { 0.753186799043612482483430486}, { 0.657806693297078656931182264}, - {-0.657806693297078656931182264}, { 0.753186799043612482483430486}, - { 0.067443919563664057897972422}, { 0.997723066644191609848546728}, - {-0.997723066644191609848546728}, { 0.067443919563664057897972422}, - { 
0.996820299291165714972629398}, { 0.079682437971430121147120656}, - {-0.079682437971430121147120656}, { 0.996820299291165714972629398}, - { 0.648514401022112445084560551}, { 0.761202385484261814029709836}, - {-0.761202385484261814029709836}, { 0.648514401022112445084560551}, - { 0.890448723244757889952150560}, { 0.455083587126343823535869268}, - {-0.455083587126343823535869268}, { 0.890448723244757889952150560}, - { 0.307849640041534893682063646}, { 0.951435020969008369549175569}, - {-0.951435020969008369549175569}, { 0.307849640041534893682063646}, - { 0.962121404269041595429604316}, { 0.272621355449948984493347477}, - {-0.272621355449948984493347477}, { 0.962121404269041595429604316}, - { 0.487550160148435954641485027}, { 0.873094978418290098636085973}, - {-0.873094978418290098636085973}, { 0.487550160148435954641485027}, - { 0.784556597155575233023892575}, { 0.620057211763289178646268191}, - {-0.620057211763289178646268191}, { 0.784556597155575233023892575}, - { 0.116318630911904767252544319}, { 0.993211949234794533104601012}, - {-0.993211949234794533104601012}, { 0.116318630911904767252544319}, - { 0.984210092386929073193874387}, { 0.177004220412148756196839844}, - {-0.177004220412148756196839844}, { 0.984210092386929073193874387}, - { 0.570780745886967280232652864}, { 0.821102514991104679060430820}, - {-0.821102514991104679060430820}, { 0.570780745886967280232652864}, - { 0.841554977436898409603499520}, { 0.540171472729892881297845480}, - {-0.540171472729892881297845480}, { 0.841554977436898409603499520}, - { 0.213110319916091373967757518}, { 0.977028142657754351485866211}, - {-0.977028142657754351485866211}, { 0.213110319916091373967757518}, - { 0.930766961078983731944872340}, { 0.365612997804773870011745909}, - {-0.365612997804773870011745909}, { 0.930766961078983731944872340}, - { 0.399624199845646828544117031}, { 0.916679059921042663116457013}, - {-0.916679059921042663116457013}, { 0.399624199845646828544117031}, - { 0.720002507961381629076682999}, { 
0.693971460889654009003734389}, - {-0.693971460889654009003734389}, { 0.720002507961381629076682999}, - { 0.018406729905804820927366313}, { 0.999830581795823422015722275}, - {-0.999830581795823422015722275}, { 0.018406729905804820927366313}, - { 0.999830581795823422015722275}, { 0.018406729905804820927366313}, - {-0.018406729905804820927366313}, { 0.999830581795823422015722275}, - { 0.693971460889654009003734389}, { 0.720002507961381629076682999}, - {-0.720002507961381629076682999}, { 0.693971460889654009003734389}, - { 0.916679059921042663116457013}, { 0.399624199845646828544117031}, - {-0.399624199845646828544117031}, { 0.916679059921042663116457013}, - { 0.365612997804773870011745909}, { 0.930766961078983731944872340}, - {-0.930766961078983731944872340}, { 0.365612997804773870011745909}, - { 0.977028142657754351485866211}, { 0.213110319916091373967757518}, - {-0.213110319916091373967757518}, { 0.977028142657754351485866211}, - { 0.540171472729892881297845480}, { 0.841554977436898409603499520}, - {-0.841554977436898409603499520}, { 0.540171472729892881297845480}, - { 0.821102514991104679060430820}, { 0.570780745886967280232652864}, - {-0.570780745886967280232652864}, { 0.821102514991104679060430820}, - { 0.177004220412148756196839844}, { 0.984210092386929073193874387}, - {-0.984210092386929073193874387}, { 0.177004220412148756196839844}, - { 0.993211949234794533104601012}, { 0.116318630911904767252544319}, - {-0.116318630911904767252544319}, { 0.993211949234794533104601012}, - { 0.620057211763289178646268191}, { 0.784556597155575233023892575}, - {-0.784556597155575233023892575}, { 0.620057211763289178646268191}, - { 0.873094978418290098636085973}, { 0.487550160148435954641485027}, - {-0.487550160148435954641485027}, { 0.873094978418290098636085973}, - { 0.272621355449948984493347477}, { 0.962121404269041595429604316}, - {-0.962121404269041595429604316}, { 0.272621355449948984493347477}, - { 0.951435020969008369549175569}, { 0.307849640041534893682063646}, - 
{-0.307849640041534893682063646}, { 0.951435020969008369549175569}, - { 0.455083587126343823535869268}, { 0.890448723244757889952150560}, - {-0.890448723244757889952150560}, { 0.455083587126343823535869268}, - { 0.761202385484261814029709836}, { 0.648514401022112445084560551}, - {-0.648514401022112445084560551}, { 0.761202385484261814029709836}, - { 0.079682437971430121147120656}, { 0.996820299291165714972629398}, - {-0.996820299291165714972629398}, { 0.079682437971430121147120656}, - { 0.997723066644191609848546728}, { 0.067443919563664057897972422}, - {-0.067443919563664057897972422}, { 0.997723066644191609848546728}, - { 0.657806693297078656931182264}, { 0.753186799043612482483430486}, - {-0.753186799043612482483430486}, { 0.657806693297078656931182264}, - { 0.895966249756185155914560282}, { 0.444122144570429231642069418}, - {-0.444122144570429231642069418}, { 0.895966249756185155914560282}, - { 0.319502030816015677901518272}, { 0.947585591017741134653387321}, - {-0.947585591017741134653387321}, { 0.319502030816015677901518272}, - { 0.965394441697689374550843858}, { 0.260794117915275518280186509}, - {-0.260794117915275518280186509}, { 0.965394441697689374550843858}, - { 0.498227666972781852410983869}, { 0.867046245515692651480195629}, - {-0.867046245515692651480195629}, { 0.498227666972781852410983869}, - { 0.792106577300212351782342879}, { 0.610382806276309452716352152}, - {-0.610382806276309452716352152}, { 0.792106577300212351782342879}, - { 0.128498110793793172624415589}, { 0.991709753669099522860049931}, - {-0.991709753669099522860049931}, { 0.128498110793793172624415589}, - { 0.986308097244598647863297524}, { 0.164913120489969921418189113}, - {-0.164913120489969921418189113}, { 0.986308097244598647863297524}, - { 0.580813958095764545075595272}, { 0.814036329705948361654516690}, - {-0.814036329705948361654516690}, { 0.580813958095764545075595272}, - { 0.848120344803297251279133563}, { 0.529803624686294668216054671}, - {-0.529803624686294668216054671}, { 
0.848120344803297251279133563}, - { 0.225083911359792835991642120}, { 0.974339382785575860518721668}, - {-0.974339382785575860518721668}, { 0.225083911359792835991642120}, - { 0.935183509938947577642207480}, { 0.354163525420490382357395796}, - {-0.354163525420490382357395796}, { 0.935183509938947577642207480}, - { 0.410843171057903942183466675}, { 0.911706032005429851404397325}, - {-0.911706032005429851404397325}, { 0.410843171057903942183466675}, - { 0.728464390448225196492035438}, { 0.685083667772700381362052545}, - {-0.685083667772700381362052545}, { 0.728464390448225196492035438}, - { 0.030674803176636625934021028}, { 0.999529417501093163079703322}, - {-0.999529417501093163079703322}, { 0.030674803176636625934021028}, - { 0.999077727752645382888781997}, { 0.042938256934940823077124540}, - {-0.042938256934940823077124540}, { 0.999077727752645382888781997}, - { 0.676092703575315960360419228}, { 0.736816568877369875090132520}, - {-0.736816568877369875090132520}, { 0.676092703575315960360419228}, - { 0.906595704514915365332960588}, { 0.422000270799799685941287941}, - {-0.422000270799799685941287941}, { 0.906595704514915365332960588}, - { 0.342660717311994397592781983}, { 0.939459223602189911962669246}, - {-0.939459223602189911962669246}, { 0.342660717311994397592781983}, - { 0.971503890986251775537099622}, { 0.237023605994367206867735915}, - {-0.237023605994367206867735915}, { 0.971503890986251775537099622}, - { 0.519355990165589587361829932}, { 0.854557988365400520767862276}, - {-0.854557988365400520767862276}, { 0.519355990165589587361829932}, - { 0.806847553543799272206514313}, { 0.590759701858874228423887908}, - {-0.590759701858874228423887908}, { 0.806847553543799272206514313}, - { 0.152797185258443427720336613}, { 0.988257567730749491404792538}, - {-0.988257567730749491404792538}, { 0.152797185258443427720336613}, - { 0.990058210262297105505906464}, { 0.140658239332849230714788846}, - {-0.140658239332849230714788846}, { 0.990058210262297105505906464}, - { 
0.600616479383868926653875896}, { 0.799537269107905033500246232}, - {-0.799537269107905033500246232}, { 0.600616479383868926653875896}, - { 0.860866938637767279344583877}, { 0.508830142543107036931749324}, - {-0.508830142543107036931749324}, { 0.860866938637767279344583877}, - { 0.248927605745720168110682816}, { 0.968522094274417316221088329}, - {-0.968522094274417316221088329}, { 0.248927605745720168110682816}, - { 0.943593458161960361495301445}, { 0.331106305759876401737190737}, - {-0.331106305759876401737190737}, { 0.943593458161960361495301445}, - { 0.433093818853151968484222638}, { 0.901348847046022014570746093}, - {-0.901348847046022014570746093}, { 0.433093818853151968484222638}, - { 0.745057785441465962407907310}, { 0.666999922303637506650154222}, - {-0.666999922303637506650154222}, { 0.745057785441465962407907310}, - { 0.055195244349689939809447526}, { 0.998475580573294752208559038}, - {-0.998475580573294752208559038}, { 0.055195244349689939809447526}, - { 0.995767414467659793982495643}, { 0.091908956497132728624990979}, - {-0.091908956497132728624990979}, { 0.995767414467659793982495643}, - { 0.639124444863775743801488193}, { 0.769103337645579639346626069}, - {-0.769103337645579639346626069}, { 0.639124444863775743801488193}, - { 0.884797098430937780104007041}, { 0.465976495767966177902756065}, - {-0.465976495767966177902756065}, { 0.884797098430937780104007041}, - { 0.296150888243623824121786128}, { 0.955141168305770721498157712}, - {-0.955141168305770721498157712}, { 0.296150888243623824121786128}, - { 0.958703474895871555374645792}, { 0.284407537211271843618310615}, - {-0.284407537211271843618310615}, { 0.958703474895871555374645792}, - { 0.476799230063322133342158117}, { 0.879012226428633477831323711}, - {-0.879012226428633477831323711}, { 0.476799230063322133342158117}, - { 0.776888465673232450040827983}, { 0.629638238914927025372981341}, - {-0.629638238914927025372981341}, { 0.776888465673232450040827983}, - { 0.104121633872054579120943880}, { 
0.994564570734255452119106243}, - {-0.994564570734255452119106243}, { 0.104121633872054579120943880}, - { 0.981963869109555264072848154}, { 0.189068664149806212754997837}, - {-0.189068664149806212754997837}, { 0.981963869109555264072848154}, - { 0.560661576197336023839710223}, { 0.828045045257755752067527592}, - {-0.828045045257755752067527592}, { 0.560661576197336023839710223}, - { 0.834862874986380056304401383}, { 0.550457972936604802977289893}, - {-0.550457972936604802977289893}, { 0.834862874986380056304401383}, - { 0.201104634842091911558443546}, { 0.979569765685440534439326110}, - {-0.979569765685440534439326110}, { 0.201104634842091911558443546}, - { 0.926210242138311341974793388}, { 0.377007410216418256726567823}, - {-0.377007410216418256726567823}, { 0.926210242138311341974793388}, - { 0.388345046698826291624993541}, { 0.921514039342041943465396332}, - {-0.921514039342041943465396332}, { 0.388345046698826291624993541}, - { 0.711432195745216441522130290}, { 0.702754744457225302452914421}, - {-0.702754744457225302452914421}, { 0.711432195745216441522130290}, - { 0.006135884649154475359640235}, { 0.999981175282601142656990438}, - {-0.999981175282601142656990438}, { 0.006135884649154475359640235}, - { 0.999995293809576171511580126}, { 0.003067956762965976270145365}, - {-0.003067956762965976270145365}, { 0.999995293809576171511580126}, - { 0.704934080375904908852523758}, { 0.709272826438865651316533772}, - {-0.709272826438865651316533772}, { 0.704934080375904908852523758}, - { 0.922701128333878570437264227}, { 0.385516053843918864075607949}, - {-0.385516053843918864075607949}, { 0.922701128333878570437264227}, - { 0.379847208924051170576281147}, { 0.925049240782677590302371869}, - {-0.925049240782677590302371869}, { 0.379847208924051170576281147}, - { 0.980182135968117392690210009}, { 0.198098410717953586179324918}, - {-0.198098410717953586179324918}, { 0.980182135968117392690210009}, - { 0.553016705580027531764226988}, { 0.833170164701913186439915922}, - 
{-0.833170164701913186439915922}, { 0.553016705580027531764226988}, - { 0.829761233794523042469023765}, { 0.558118531220556115693702964}, - {-0.558118531220556115693702964}, { 0.829761233794523042469023765}, - { 0.192080397049892441679288205}, { 0.981379193313754574318224190}, - {-0.981379193313754574318224190}, { 0.192080397049892441679288205}, - { 0.994879330794805620591166107}, { 0.101069862754827824987887585}, - {-0.101069862754827824987887585}, { 0.994879330794805620591166107}, - { 0.632018735939809021909403706}, { 0.774953106594873878359129282}, - {-0.774953106594873878359129282}, { 0.632018735939809021909403706}, - { 0.880470889052160770806542929}, { 0.474100214650550014398580015}, - {-0.474100214650550014398580015}, { 0.880470889052160770806542929}, - { 0.287347459544729526477331841}, { 0.957826413027532890321037029}, - {-0.957826413027532890321037029}, { 0.287347459544729526477331841}, - { 0.956045251349996443270479823}, { 0.293219162694258650606608599}, - {-0.293219162694258650606608599}, { 0.956045251349996443270479823}, - { 0.468688822035827933697617870}, { 0.883363338665731594736308015}, - {-0.883363338665731594736308015}, { 0.468688822035827933697617870}, - { 0.771060524261813773200605759}, { 0.636761861236284230413943435}, - {-0.636761861236284230413943435}, { 0.771060524261813773200605759}, - { 0.094963495329638998938034312}, { 0.995480755491926941769171600}, - {-0.995480755491926941769171600}, { 0.094963495329638998938034312}, - { 0.998640218180265222418199049}, { 0.052131704680283321236358216}, - {-0.052131704680283321236358216}, { 0.998640218180265222418199049}, - { 0.669282588346636065720696366}, { 0.743007952135121693517362293}, - {-0.743007952135121693517362293}, { 0.669282588346636065720696366}, - { 0.902673318237258806751502391}, { 0.430326481340082633908199031}, - {-0.430326481340082633908199031}, { 0.902673318237258806751502391}, - { 0.333999651442009404650865481}, { 0.942573197601446879280758735}, - {-0.942573197601446879280758735}, { 
0.333999651442009404650865481}, - { 0.969281235356548486048290738}, { 0.245955050335794611599924709}, - {-0.245955050335794611599924709}, { 0.969281235356548486048290738}, - { 0.511468850437970399504391001}, { 0.859301818357008404783582139}, - {-0.859301818357008404783582139}, { 0.511468850437970399504391001}, - { 0.801376171723140219430247777}, { 0.598160706996342311724958652}, - {-0.598160706996342311724958652}, { 0.801376171723140219430247777}, - { 0.143695033150294454819773349}, { 0.989622017463200834623694454}, - {-0.989622017463200834623694454}, { 0.143695033150294454819773349}, - { 0.988721691960323767604516485}, { 0.149764534677321517229695737}, - {-0.149764534677321517229695737}, { 0.988721691960323767604516485}, - { 0.593232295039799808047809426}, { 0.805031331142963597922659282}, - {-0.805031331142963597922659282}, { 0.593232295039799808047809426}, - { 0.856147328375194481019630732}, { 0.516731799017649881508753876}, - {-0.516731799017649881508753876}, { 0.856147328375194481019630732}, - { 0.240003022448741486568922365}, { 0.970772140728950302138169611}, - {-0.970772140728950302138169611}, { 0.240003022448741486568922365}, - { 0.940506070593268323787291309}, { 0.339776884406826857828825803}, - {-0.339776884406826857828825803}, { 0.940506070593268323787291309}, - { 0.424779681209108833357226189}, { 0.905296759318118774354048329}, - {-0.905296759318118774354048329}, { 0.424779681209108833357226189}, - { 0.738887324460615147933116508}, { 0.673829000378756060917568372}, - {-0.673829000378756060917568372}, { 0.738887324460615147933116508}, - { 0.046003182130914628814301788}, { 0.998941293186856850633930266}, - {-0.998941293186856850633930266}, { 0.046003182130914628814301788}, - { 0.999618822495178597116830637}, { 0.027608145778965741612354872}, - {-0.027608145778965741612354872}, { 0.999618822495178597116830637}, - { 0.687315340891759108199186948}, { 0.726359155084345976817494315}, - {-0.726359155084345976817494315}, { 0.687315340891759108199186948}, - { 
0.912962190428398164628018233}, { 0.408044162864978680820747499}, - {-0.408044162864978680820747499}, { 0.912962190428398164628018233}, - { 0.357030961233430032614954036}, { 0.934092550404258914729877883}, - {-0.934092550404258914729877883}, { 0.357030961233430032614954036}, - { 0.975025345066994146844913468}, { 0.222093620973203534094094721}, - {-0.222093620973203534094094721}, { 0.975025345066994146844913468}, - { 0.532403127877197971442805218}, { 0.846490938774052078300544488}, - {-0.846490938774052078300544488}, { 0.532403127877197971442805218}, - { 0.815814410806733789010772660}, { 0.578313796411655563342245019}, - {-0.578313796411655563342245019}, { 0.815814410806733789010772660}, - { 0.167938294974731178054745536}, { 0.985797509167567424700995000}, - {-0.985797509167567424700995000}, { 0.167938294974731178054745536}, - { 0.992099313142191757112085445}, { 0.125454983411546238542336453}, - {-0.125454983411546238542336453}, { 0.992099313142191757112085445}, - { 0.612810082429409703935211936}, { 0.790230221437310055030217152}, - {-0.790230221437310055030217152}, { 0.612810082429409703935211936}, - { 0.868570705971340895340449876}, { 0.495565261825772531150266670}, - {-0.495565261825772531150266670}, { 0.868570705971340895340449876}, - { 0.263754678974831383611349322}, { 0.964589793289812723836432159}, - {-0.964589793289812723836432159}, { 0.263754678974831383611349322}, - { 0.948561349915730288158494826}, { 0.316593375556165867243047035}, - {-0.316593375556165867243047035}, { 0.948561349915730288158494826}, - { 0.446868840162374195353044389}, { 0.894599485631382678433072126}, - {-0.894599485631382678433072126}, { 0.446868840162374195353044389}, - { 0.755201376896536527598710756}, { 0.655492852999615385312679701}, - {-0.655492852999615385312679701}, { 0.755201376896536527598710756}, - { 0.070504573389613863027351471}, { 0.997511456140303459699448390}, - {-0.997511456140303459699448390}, { 0.070504573389613863027351471}, - { 0.997060070339482978987989949}, { 
0.076623861392031492278332463}, - {-0.076623861392031492278332463}, { 0.997060070339482978987989949}, - { 0.650846684996380915068975573}, { 0.759209188978388033485525443}, - {-0.759209188978388033485525443}, { 0.650846684996380915068975573}, - { 0.891840709392342727796478697}, { 0.452349587233770874133026703}, - {-0.452349587233770874133026703}, { 0.891840709392342727796478697}, - { 0.310767152749611495835997250}, { 0.950486073949481721759926101}, - {-0.950486073949481721759926101}, { 0.310767152749611495835997250}, - { 0.962953266873683886347921481}, { 0.269668325572915106525464462}, - {-0.269668325572915106525464462}, { 0.962953266873683886347921481}, - { 0.490226483288291154229598449}, { 0.871595086655951034842481435}, - {-0.871595086655951034842481435}, { 0.490226483288291154229598449}, - { 0.786455213599085757522319464}, { 0.617647307937803932403979402}, - {-0.617647307937803932403979402}, { 0.786455213599085757522319464}, - { 0.119365214810991364593637790}, { 0.992850414459865090793563344}, - {-0.992850414459865090793563344}, { 0.119365214810991364593637790}, - { 0.984748501801904218556553176}, { 0.173983873387463827950700807}, - {-0.173983873387463827950700807}, { 0.984748501801904218556553176}, - { 0.573297166698042212820171239}, { 0.819347520076796960824689637}, - {-0.819347520076796960824689637}, { 0.573297166698042212820171239}, - { 0.843208239641845437161743865}, { 0.537587076295645482502214932}, - {-0.537587076295645482502214932}, { 0.843208239641845437161743865}, - { 0.216106797076219509948385131}, { 0.976369731330021149312732194}, - {-0.976369731330021149312732194}, { 0.216106797076219509948385131}, - { 0.931884265581668106718557199}, { 0.362755724367397216204854462}, - {-0.362755724367397216204854462}, { 0.931884265581668106718557199}, - { 0.402434650859418441082533934}, { 0.915448716088267819566431292}, - {-0.915448716088267819566431292}, { 0.402434650859418441082533934}, - { 0.722128193929215321243607198}, { 0.691759258364157774906734132}, - 
{-0.691759258364157774906734132}, { 0.722128193929215321243607198}, - { 0.021474080275469507418374898}, { 0.999769405351215321657617036}, - {-0.999769405351215321657617036}, { 0.021474080275469507418374898}, - { 0.999882347454212525633049627}, { 0.015339206284988101044151868}, - {-0.015339206284988101044151868}, { 0.999882347454212525633049627}, - { 0.696177131491462944788582591}, { 0.717870045055731736211325329}, - {-0.717870045055731736211325329}, { 0.696177131491462944788582591}, - { 0.917900775621390457642276297}, { 0.396809987416710328595290911}, - {-0.396809987416710328595290911}, { 0.917900775621390457642276297}, - { 0.368466829953372331712746222}, { 0.929640895843181265457918066}, - {-0.929640895843181265457918066}, { 0.368466829953372331712746222}, - { 0.977677357824509979943404762}, { 0.210111836880469621717489972}, - {-0.210111836880469621717489972}, { 0.977677357824509979943404762}, - { 0.542750784864515906586768661}, { 0.839893794195999504583383987}, - {-0.839893794195999504583383987}, { 0.542750784864515906586768661}, - { 0.822849781375826332046780034}, { 0.568258952670131549790548489}, - {-0.568258952670131549790548489}, { 0.822849781375826332046780034}, - { 0.180022901405699522679906590}, { 0.983662419211730274396237776}, - {-0.983662419211730274396237776}, { 0.180022901405699522679906590}, - { 0.993564135520595333782021697}, { 0.113270952177564349018228733}, - {-0.113270952177564349018228733}, { 0.993564135520595333782021697}, - { 0.622461279374149972519166721}, { 0.782650596166575738458949301}, - {-0.782650596166575738458949301}, { 0.622461279374149972519166721}, - { 0.874586652278176112634431897}, { 0.484869248000791101822951699}, - {-0.484869248000791101822951699}, { 0.874586652278176112634431897}, - { 0.275571819310958163076425168}, { 0.961280485811320641748659653}, - {-0.961280485811320641748659653}, { 0.275571819310958163076425168}, - { 0.952375012719765858529893608}, { 0.304929229735402406490728633}, - {-0.304929229735402406490728633}, { 
0.952375012719765858529893608}, - { 0.457813303598877221904961155}, { 0.889048355854664562540777729}, - {-0.889048355854664562540777729}, { 0.457813303598877221904961155}, - { 0.763188417263381271704838297}, { 0.646176012983316364832802220}, - {-0.646176012983316364832802220}, { 0.763188417263381271704838297}, - { 0.082740264549375693111987083}, { 0.996571145790554847093566910}, - {-0.996571145790554847093566910}, { 0.082740264549375693111987083}, - { 0.997925286198596012623025462}, { 0.064382630929857460819324537}, - {-0.064382630929857460819324537}, { 0.997925286198596012623025462}, - { 0.660114342067420478559490747}, { 0.751165131909686411205819422}, - {-0.751165131909686411205819422}, { 0.660114342067420478559490747}, - { 0.897324580705418281231391836}, { 0.441371268731716692879988968}, - {-0.441371268731716692879988968}, { 0.897324580705418281231391836}, - { 0.322407678801069848384807478}, { 0.946600913083283570044599823}, - {-0.946600913083283570044599823}, { 0.322407678801069848384807478}, - { 0.966190003445412555433832961}, { 0.257831102162159005614471295}, - {-0.257831102162159005614471295}, { 0.966190003445412555433832961}, - { 0.500885382611240786241285004}, { 0.865513624090569082825488358}, - {-0.865513624090569082825488358}, { 0.500885382611240786241285004}, - { 0.793975477554337164895083757}, { 0.607949784967773667243642671}, - {-0.607949784967773667243642671}, { 0.793975477554337164895083757}, - { 0.131540028702883111103387493}, { 0.991310859846115418957349799}, - {-0.991310859846115418957349799}, { 0.131540028702883111103387493}, - { 0.986809401814185476970235952}, { 0.161886393780111837641387995}, - {-0.161886393780111837641387995}, { 0.986809401814185476970235952}, - { 0.583308652937698294392830961}, { 0.812250586585203913049744181}, - {-0.812250586585203913049744181}, { 0.583308652937698294392830961}, - { 0.849741768000852489471268395}, { 0.527199134781901348464274575}, - {-0.527199134781901348464274575}, { 0.849741768000852489471268395}, - { 
0.228072083170885739254457379}, { 0.973644249650811925318383912}, - {-0.973644249650811925318383912}, { 0.228072083170885739254457379}, - { 0.936265667170278246576310996}, { 0.351292756085567125601307623}, - {-0.351292756085567125601307623}, { 0.936265667170278246576310996}, - { 0.413638312238434547471944324}, { 0.910441292258067196934095369}, - {-0.910441292258067196934095369}, { 0.413638312238434547471944324}, - { 0.730562769227827561177758850}, { 0.682845546385248068164596123}, - {-0.682845546385248068164596123}, { 0.730562769227827561177758850}, - { 0.033741171851377584833716112}, { 0.999430604555461772019008327}, - {-0.999430604555461772019008327}, { 0.033741171851377584833716112}, - { 0.999204758618363895492950001}, { 0.039872927587739811128578738}, - {-0.039872927587739811128578738}, { 0.999204758618363895492950001}, - { 0.678350043129861486873655042}, { 0.734738878095963464563223604}, - {-0.734738878095963464563223604}, { 0.678350043129861486873655042}, - { 0.907886116487666212038681480}, { 0.419216888363223956433010020}, - {-0.419216888363223956433010020}, { 0.907886116487666212038681480}, - { 0.345541324963989065539191723}, { 0.938403534063108112192420774}, - {-0.938403534063108112192420774}, { 0.345541324963989065539191723}, - { 0.972226497078936305708321144}, { 0.234041958583543423191242045}, - {-0.234041958583543423191242045}, { 0.972226497078936305708321144}, - { 0.521975292937154342694258318}, { 0.852960604930363657746588082}, - {-0.852960604930363657746588082}, { 0.521975292937154342694258318}, - { 0.808656181588174991946968128}, { 0.588281548222645304786439813}, - {-0.588281548222645304786439813}, { 0.808656181588174991946968128}, - { 0.155828397654265235743101486}, { 0.987784141644572154230969032}, - {-0.987784141644572154230969032}, { 0.155828397654265235743101486}, - { 0.990485084256457037998682243}, { 0.137620121586486044948441663}, - {-0.137620121586486044948441663}, { 0.990485084256457037998682243}, - { 0.603066598540348201693430617}, { 
0.797690840943391108362662755}, - {-0.797690840943391108362662755}, { 0.603066598540348201693430617}, - { 0.862423956111040538690933878}, { 0.506186645345155291048942344}, - {-0.506186645345155291048942344}, { 0.862423956111040538690933878}, - { 0.251897818154216950498106628}, { 0.967753837093475465243391912}, - {-0.967753837093475465243391912}, { 0.251897818154216950498106628}, - { 0.944604837261480265659265493}, { 0.328209843579092526107916817}, - {-0.328209843579092526107916817}, { 0.944604837261480265659265493}, - { 0.435857079922255491032544080}, { 0.900015892016160228714535267}, - {-0.900015892016160228714535267}, { 0.435857079922255491032544080}, - { 0.747100605980180144323078847}, { 0.664710978203344868130324985}, - {-0.664710978203344868130324985}, { 0.747100605980180144323078847}, - { 0.058258264500435759613979782}, { 0.998301544933892840738782163}, - {-0.998301544933892840738782163}, { 0.058258264500435759613979782}, - { 0.996044700901251989887944810}, { 0.088853552582524596561586535}, - {-0.088853552582524596561586535}, { 0.996044700901251989887944810}, - { 0.641481012808583151988739898}, { 0.767138911935820381181694573}, - {-0.767138911935820381181694573}, { 0.641481012808583151988739898}, - { 0.886222530148880631647990821}, { 0.463259783551860197390719637}, - {-0.463259783551860197390719637}, { 0.886222530148880631647990821}, - { 0.299079826308040476750336973}, { 0.954228095109105629780430732}, - {-0.954228095109105629780430732}, { 0.299079826308040476750336973}, - { 0.959571513081984528335528181}, { 0.281464937925757984095231007}, - {-0.281464937925757984095231007}, { 0.959571513081984528335528181}, - { 0.479493757660153026679839798}, { 0.877545290207261291668470750}, - {-0.877545290207261291668470750}, { 0.479493757660153026679839798}, - { 0.778816512381475953374724325}, { 0.627251815495144113509622565}, - {-0.627251815495144113509622565}, { 0.778816512381475953374724325}, - { 0.107172424956808849175529148}, { 0.994240449453187946358413442}, - 
{-0.994240449453187946358413442}, { 0.107172424956808849175529148}, - { 0.982539302287441255907040396}, { 0.186055151663446648105438304}, - {-0.186055151663446648105438304}, { 0.982539302287441255907040396}, - { 0.563199344013834115007363772}, { 0.826321062845663480311195452}, - {-0.826321062845663480311195452}, { 0.563199344013834115007363772}, - { 0.836547727223511984524285790}, { 0.547894059173100165608820571}, - {-0.547894059173100165608820571}, { 0.836547727223511984524285790}, - { 0.204108966092816874181696950}, { 0.978948175319062194715480124}, - {-0.978948175319062194715480124}, { 0.204108966092816874181696950}, - { 0.927362525650401087274536959}, { 0.374164062971457997104393020}, - {-0.374164062971457997104393020}, { 0.927362525650401087274536959}, - { 0.391170384302253888687512949}, { 0.920318276709110566440076541}, - {-0.920318276709110566440076541}, { 0.391170384302253888687512949}, - { 0.713584868780793592903125099}, { 0.700568793943248366792866380}, - {-0.700568793943248366792866380}, { 0.713584868780793592903125099}, - { 0.009203754782059819315102378}, { 0.999957644551963866333120920}, - {-0.999957644551963866333120920}, { 0.009203754782059819315102378}, - { 0.999957644551963866333120920}, { 0.009203754782059819315102378}, - {-0.009203754782059819315102378}, { 0.999957644551963866333120920}, - { 0.700568793943248366792866380}, { 0.713584868780793592903125099}, - {-0.713584868780793592903125099}, { 0.700568793943248366792866380}, - { 0.920318276709110566440076541}, { 0.391170384302253888687512949}, - {-0.391170384302253888687512949}, { 0.920318276709110566440076541}, - { 0.374164062971457997104393020}, { 0.927362525650401087274536959}, - {-0.927362525650401087274536959}, { 0.374164062971457997104393020}, - { 0.978948175319062194715480124}, { 0.204108966092816874181696950}, - {-0.204108966092816874181696950}, { 0.978948175319062194715480124}, - { 0.547894059173100165608820571}, { 0.836547727223511984524285790}, - {-0.836547727223511984524285790}, { 
0.547894059173100165608820571}, - { 0.826321062845663480311195452}, { 0.563199344013834115007363772}, - {-0.563199344013834115007363772}, { 0.826321062845663480311195452}, - { 0.186055151663446648105438304}, { 0.982539302287441255907040396}, - {-0.982539302287441255907040396}, { 0.186055151663446648105438304}, - { 0.994240449453187946358413442}, { 0.107172424956808849175529148}, - {-0.107172424956808849175529148}, { 0.994240449453187946358413442}, - { 0.627251815495144113509622565}, { 0.778816512381475953374724325}, - {-0.778816512381475953374724325}, { 0.627251815495144113509622565}, - { 0.877545290207261291668470750}, { 0.479493757660153026679839798}, - {-0.479493757660153026679839798}, { 0.877545290207261291668470750}, - { 0.281464937925757984095231007}, { 0.959571513081984528335528181}, - {-0.959571513081984528335528181}, { 0.281464937925757984095231007}, - { 0.954228095109105629780430732}, { 0.299079826308040476750336973}, - {-0.299079826308040476750336973}, { 0.954228095109105629780430732}, - { 0.463259783551860197390719637}, { 0.886222530148880631647990821}, - {-0.886222530148880631647990821}, { 0.463259783551860197390719637}, - { 0.767138911935820381181694573}, { 0.641481012808583151988739898}, - {-0.641481012808583151988739898}, { 0.767138911935820381181694573}, - { 0.088853552582524596561586535}, { 0.996044700901251989887944810}, - {-0.996044700901251989887944810}, { 0.088853552582524596561586535}, - { 0.998301544933892840738782163}, { 0.058258264500435759613979782}, - {-0.058258264500435759613979782}, { 0.998301544933892840738782163}, - { 0.664710978203344868130324985}, { 0.747100605980180144323078847}, - {-0.747100605980180144323078847}, { 0.664710978203344868130324985}, - { 0.900015892016160228714535267}, { 0.435857079922255491032544080}, - {-0.435857079922255491032544080}, { 0.900015892016160228714535267}, - { 0.328209843579092526107916817}, { 0.944604837261480265659265493}, - {-0.944604837261480265659265493}, { 0.328209843579092526107916817}, - { 
0.967753837093475465243391912}, { 0.251897818154216950498106628}, - {-0.251897818154216950498106628}, { 0.967753837093475465243391912}, - { 0.506186645345155291048942344}, { 0.862423956111040538690933878}, - {-0.862423956111040538690933878}, { 0.506186645345155291048942344}, - { 0.797690840943391108362662755}, { 0.603066598540348201693430617}, - {-0.603066598540348201693430617}, { 0.797690840943391108362662755}, - { 0.137620121586486044948441663}, { 0.990485084256457037998682243}, - {-0.990485084256457037998682243}, { 0.137620121586486044948441663}, - { 0.987784141644572154230969032}, { 0.155828397654265235743101486}, - {-0.155828397654265235743101486}, { 0.987784141644572154230969032}, - { 0.588281548222645304786439813}, { 0.808656181588174991946968128}, - {-0.808656181588174991946968128}, { 0.588281548222645304786439813}, - { 0.852960604930363657746588082}, { 0.521975292937154342694258318}, - {-0.521975292937154342694258318}, { 0.852960604930363657746588082}, - { 0.234041958583543423191242045}, { 0.972226497078936305708321144}, - {-0.972226497078936305708321144}, { 0.234041958583543423191242045}, - { 0.938403534063108112192420774}, { 0.345541324963989065539191723}, - {-0.345541324963989065539191723}, { 0.938403534063108112192420774}, - { 0.419216888363223956433010020}, { 0.907886116487666212038681480}, - {-0.907886116487666212038681480}, { 0.419216888363223956433010020}, - { 0.734738878095963464563223604}, { 0.678350043129861486873655042}, - {-0.678350043129861486873655042}, { 0.734738878095963464563223604}, - { 0.039872927587739811128578738}, { 0.999204758618363895492950001}, - {-0.999204758618363895492950001}, { 0.039872927587739811128578738}, - { 0.999430604555461772019008327}, { 0.033741171851377584833716112}, - {-0.033741171851377584833716112}, { 0.999430604555461772019008327}, - { 0.682845546385248068164596123}, { 0.730562769227827561177758850}, - {-0.730562769227827561177758850}, { 0.682845546385248068164596123}, - { 0.910441292258067196934095369}, { 
0.413638312238434547471944324}, - {-0.413638312238434547471944324}, { 0.910441292258067196934095369}, - { 0.351292756085567125601307623}, { 0.936265667170278246576310996}, - {-0.936265667170278246576310996}, { 0.351292756085567125601307623}, - { 0.973644249650811925318383912}, { 0.228072083170885739254457379}, - {-0.228072083170885739254457379}, { 0.973644249650811925318383912}, - { 0.527199134781901348464274575}, { 0.849741768000852489471268395}, - {-0.849741768000852489471268395}, { 0.527199134781901348464274575}, - { 0.812250586585203913049744181}, { 0.583308652937698294392830961}, - {-0.583308652937698294392830961}, { 0.812250586585203913049744181}, - { 0.161886393780111837641387995}, { 0.986809401814185476970235952}, - {-0.986809401814185476970235952}, { 0.161886393780111837641387995}, - { 0.991310859846115418957349799}, { 0.131540028702883111103387493}, - {-0.131540028702883111103387493}, { 0.991310859846115418957349799}, - { 0.607949784967773667243642671}, { 0.793975477554337164895083757}, - {-0.793975477554337164895083757}, { 0.607949784967773667243642671}, - { 0.865513624090569082825488358}, { 0.500885382611240786241285004}, - {-0.500885382611240786241285004}, { 0.865513624090569082825488358}, - { 0.257831102162159005614471295}, { 0.966190003445412555433832961}, - {-0.966190003445412555433832961}, { 0.257831102162159005614471295}, - { 0.946600913083283570044599823}, { 0.322407678801069848384807478}, - {-0.322407678801069848384807478}, { 0.946600913083283570044599823}, - { 0.441371268731716692879988968}, { 0.897324580705418281231391836}, - {-0.897324580705418281231391836}, { 0.441371268731716692879988968}, - { 0.751165131909686411205819422}, { 0.660114342067420478559490747}, - {-0.660114342067420478559490747}, { 0.751165131909686411205819422}, - { 0.064382630929857460819324537}, { 0.997925286198596012623025462}, - {-0.997925286198596012623025462}, { 0.064382630929857460819324537}, - { 0.996571145790554847093566910}, { 0.082740264549375693111987083}, - 
{-0.082740264549375693111987083}, { 0.996571145790554847093566910}, - { 0.646176012983316364832802220}, { 0.763188417263381271704838297}, - {-0.763188417263381271704838297}, { 0.646176012983316364832802220}, - { 0.889048355854664562540777729}, { 0.457813303598877221904961155}, - {-0.457813303598877221904961155}, { 0.889048355854664562540777729}, - { 0.304929229735402406490728633}, { 0.952375012719765858529893608}, - {-0.952375012719765858529893608}, { 0.304929229735402406490728633}, - { 0.961280485811320641748659653}, { 0.275571819310958163076425168}, - {-0.275571819310958163076425168}, { 0.961280485811320641748659653}, - { 0.484869248000791101822951699}, { 0.874586652278176112634431897}, - {-0.874586652278176112634431897}, { 0.484869248000791101822951699}, - { 0.782650596166575738458949301}, { 0.622461279374149972519166721}, - {-0.622461279374149972519166721}, { 0.782650596166575738458949301}, - { 0.113270952177564349018228733}, { 0.993564135520595333782021697}, - {-0.993564135520595333782021697}, { 0.113270952177564349018228733}, - { 0.983662419211730274396237776}, { 0.180022901405699522679906590}, - {-0.180022901405699522679906590}, { 0.983662419211730274396237776}, - { 0.568258952670131549790548489}, { 0.822849781375826332046780034}, - {-0.822849781375826332046780034}, { 0.568258952670131549790548489}, - { 0.839893794195999504583383987}, { 0.542750784864515906586768661}, - {-0.542750784864515906586768661}, { 0.839893794195999504583383987}, - { 0.210111836880469621717489972}, { 0.977677357824509979943404762}, - {-0.977677357824509979943404762}, { 0.210111836880469621717489972}, - { 0.929640895843181265457918066}, { 0.368466829953372331712746222}, - {-0.368466829953372331712746222}, { 0.929640895843181265457918066}, - { 0.396809987416710328595290911}, { 0.917900775621390457642276297}, - {-0.917900775621390457642276297}, { 0.396809987416710328595290911}, - { 0.717870045055731736211325329}, { 0.696177131491462944788582591}, - {-0.696177131491462944788582591}, { 
0.717870045055731736211325329}, - { 0.015339206284988101044151868}, { 0.999882347454212525633049627}, - {-0.999882347454212525633049627}, { 0.015339206284988101044151868}, - { 0.999769405351215321657617036}, { 0.021474080275469507418374898}, - {-0.021474080275469507418374898}, { 0.999769405351215321657617036}, - { 0.691759258364157774906734132}, { 0.722128193929215321243607198}, - {-0.722128193929215321243607198}, { 0.691759258364157774906734132}, - { 0.915448716088267819566431292}, { 0.402434650859418441082533934}, - {-0.402434650859418441082533934}, { 0.915448716088267819566431292}, - { 0.362755724367397216204854462}, { 0.931884265581668106718557199}, - {-0.931884265581668106718557199}, { 0.362755724367397216204854462}, - { 0.976369731330021149312732194}, { 0.216106797076219509948385131}, - {-0.216106797076219509948385131}, { 0.976369731330021149312732194}, - { 0.537587076295645482502214932}, { 0.843208239641845437161743865}, - {-0.843208239641845437161743865}, { 0.537587076295645482502214932}, - { 0.819347520076796960824689637}, { 0.573297166698042212820171239}, - {-0.573297166698042212820171239}, { 0.819347520076796960824689637}, - { 0.173983873387463827950700807}, { 0.984748501801904218556553176}, - {-0.984748501801904218556553176}, { 0.173983873387463827950700807}, - { 0.992850414459865090793563344}, { 0.119365214810991364593637790}, - {-0.119365214810991364593637790}, { 0.992850414459865090793563344}, - { 0.617647307937803932403979402}, { 0.786455213599085757522319464}, - {-0.786455213599085757522319464}, { 0.617647307937803932403979402}, - { 0.871595086655951034842481435}, { 0.490226483288291154229598449}, - {-0.490226483288291154229598449}, { 0.871595086655951034842481435}, - { 0.269668325572915106525464462}, { 0.962953266873683886347921481}, - {-0.962953266873683886347921481}, { 0.269668325572915106525464462}, - { 0.950486073949481721759926101}, { 0.310767152749611495835997250}, - {-0.310767152749611495835997250}, { 0.950486073949481721759926101}, - { 
0.452349587233770874133026703}, { 0.891840709392342727796478697}, - {-0.891840709392342727796478697}, { 0.452349587233770874133026703}, - { 0.759209188978388033485525443}, { 0.650846684996380915068975573}, - {-0.650846684996380915068975573}, { 0.759209188978388033485525443}, - { 0.076623861392031492278332463}, { 0.997060070339482978987989949}, - {-0.997060070339482978987989949}, { 0.076623861392031492278332463}, - { 0.997511456140303459699448390}, { 0.070504573389613863027351471}, - {-0.070504573389613863027351471}, { 0.997511456140303459699448390}, - { 0.655492852999615385312679701}, { 0.755201376896536527598710756}, - {-0.755201376896536527598710756}, { 0.655492852999615385312679701}, - { 0.894599485631382678433072126}, { 0.446868840162374195353044389}, - {-0.446868840162374195353044389}, { 0.894599485631382678433072126}, - { 0.316593375556165867243047035}, { 0.948561349915730288158494826}, - {-0.948561349915730288158494826}, { 0.316593375556165867243047035}, - { 0.964589793289812723836432159}, { 0.263754678974831383611349322}, - {-0.263754678974831383611349322}, { 0.964589793289812723836432159}, - { 0.495565261825772531150266670}, { 0.868570705971340895340449876}, - {-0.868570705971340895340449876}, { 0.495565261825772531150266670}, - { 0.790230221437310055030217152}, { 0.612810082429409703935211936}, - {-0.612810082429409703935211936}, { 0.790230221437310055030217152}, - { 0.125454983411546238542336453}, { 0.992099313142191757112085445}, - {-0.992099313142191757112085445}, { 0.125454983411546238542336453}, - { 0.985797509167567424700995000}, { 0.167938294974731178054745536}, - {-0.167938294974731178054745536}, { 0.985797509167567424700995000}, - { 0.578313796411655563342245019}, { 0.815814410806733789010772660}, - {-0.815814410806733789010772660}, { 0.578313796411655563342245019}, - { 0.846490938774052078300544488}, { 0.532403127877197971442805218}, - {-0.532403127877197971442805218}, { 0.846490938774052078300544488}, - { 0.222093620973203534094094721}, { 
0.975025345066994146844913468}, - {-0.975025345066994146844913468}, { 0.222093620973203534094094721}, - { 0.934092550404258914729877883}, { 0.357030961233430032614954036}, - {-0.357030961233430032614954036}, { 0.934092550404258914729877883}, - { 0.408044162864978680820747499}, { 0.912962190428398164628018233}, - {-0.912962190428398164628018233}, { 0.408044162864978680820747499}, - { 0.726359155084345976817494315}, { 0.687315340891759108199186948}, - {-0.687315340891759108199186948}, { 0.726359155084345976817494315}, - { 0.027608145778965741612354872}, { 0.999618822495178597116830637}, - {-0.999618822495178597116830637}, { 0.027608145778965741612354872}, - { 0.998941293186856850633930266}, { 0.046003182130914628814301788}, - {-0.046003182130914628814301788}, { 0.998941293186856850633930266}, - { 0.673829000378756060917568372}, { 0.738887324460615147933116508}, - {-0.738887324460615147933116508}, { 0.673829000378756060917568372}, - { 0.905296759318118774354048329}, { 0.424779681209108833357226189}, - {-0.424779681209108833357226189}, { 0.905296759318118774354048329}, - { 0.339776884406826857828825803}, { 0.940506070593268323787291309}, - {-0.940506070593268323787291309}, { 0.339776884406826857828825803}, - { 0.970772140728950302138169611}, { 0.240003022448741486568922365}, - {-0.240003022448741486568922365}, { 0.970772140728950302138169611}, - { 0.516731799017649881508753876}, { 0.856147328375194481019630732}, - {-0.856147328375194481019630732}, { 0.516731799017649881508753876}, - { 0.805031331142963597922659282}, { 0.593232295039799808047809426}, - {-0.593232295039799808047809426}, { 0.805031331142963597922659282}, - { 0.149764534677321517229695737}, { 0.988721691960323767604516485}, - {-0.988721691960323767604516485}, { 0.149764534677321517229695737}, - { 0.989622017463200834623694454}, { 0.143695033150294454819773349}, - {-0.143695033150294454819773349}, { 0.989622017463200834623694454}, - { 0.598160706996342311724958652}, { 0.801376171723140219430247777}, - 
{-0.801376171723140219430247777}, { 0.598160706996342311724958652}, - { 0.859301818357008404783582139}, { 0.511468850437970399504391001}, - {-0.511468850437970399504391001}, { 0.859301818357008404783582139}, - { 0.245955050335794611599924709}, { 0.969281235356548486048290738}, - {-0.969281235356548486048290738}, { 0.245955050335794611599924709}, - { 0.942573197601446879280758735}, { 0.333999651442009404650865481}, - {-0.333999651442009404650865481}, { 0.942573197601446879280758735}, - { 0.430326481340082633908199031}, { 0.902673318237258806751502391}, - {-0.902673318237258806751502391}, { 0.430326481340082633908199031}, - { 0.743007952135121693517362293}, { 0.669282588346636065720696366}, - {-0.669282588346636065720696366}, { 0.743007952135121693517362293}, - { 0.052131704680283321236358216}, { 0.998640218180265222418199049}, - {-0.998640218180265222418199049}, { 0.052131704680283321236358216}, - { 0.995480755491926941769171600}, { 0.094963495329638998938034312}, - {-0.094963495329638998938034312}, { 0.995480755491926941769171600}, - { 0.636761861236284230413943435}, { 0.771060524261813773200605759}, - {-0.771060524261813773200605759}, { 0.636761861236284230413943435}, - { 0.883363338665731594736308015}, { 0.468688822035827933697617870}, - {-0.468688822035827933697617870}, { 0.883363338665731594736308015}, - { 0.293219162694258650606608599}, { 0.956045251349996443270479823}, - {-0.956045251349996443270479823}, { 0.293219162694258650606608599}, - { 0.957826413027532890321037029}, { 0.287347459544729526477331841}, - {-0.287347459544729526477331841}, { 0.957826413027532890321037029}, - { 0.474100214650550014398580015}, { 0.880470889052160770806542929}, - {-0.880470889052160770806542929}, { 0.474100214650550014398580015}, - { 0.774953106594873878359129282}, { 0.632018735939809021909403706}, - {-0.632018735939809021909403706}, { 0.774953106594873878359129282}, - { 0.101069862754827824987887585}, { 0.994879330794805620591166107}, - {-0.994879330794805620591166107}, { 
0.101069862754827824987887585}, - { 0.981379193313754574318224190}, { 0.192080397049892441679288205}, - {-0.192080397049892441679288205}, { 0.981379193313754574318224190}, - { 0.558118531220556115693702964}, { 0.829761233794523042469023765}, - {-0.829761233794523042469023765}, { 0.558118531220556115693702964}, - { 0.833170164701913186439915922}, { 0.553016705580027531764226988}, - {-0.553016705580027531764226988}, { 0.833170164701913186439915922}, - { 0.198098410717953586179324918}, { 0.980182135968117392690210009}, - {-0.980182135968117392690210009}, { 0.198098410717953586179324918}, - { 0.925049240782677590302371869}, { 0.379847208924051170576281147}, - {-0.379847208924051170576281147}, { 0.925049240782677590302371869}, - { 0.385516053843918864075607949}, { 0.922701128333878570437264227}, - {-0.922701128333878570437264227}, { 0.385516053843918864075607949}, - { 0.709272826438865651316533772}, { 0.704934080375904908852523758}, - {-0.704934080375904908852523758}, { 0.709272826438865651316533772}, - { 0.003067956762965976270145365}, { 0.999995293809576171511580126}, - {-0.999995293809576171511580126}, { 0.003067956762965976270145365} -}; - -const fpr fpr_p2_tab[] = { - { 2.00000000000 }, - { 1.00000000000 }, - { 0.50000000000 }, - { 0.25000000000 }, - { 0.12500000000 }, - { 0.06250000000 }, - { 0.03125000000 }, - { 0.01562500000 }, - { 0.00781250000 }, - { 0.00390625000 }, - { 0.00195312500 } -}; - -#else // yyyFPNATIVE+0 yyyFPEMU+0 - -#error No FP implementation selected - -#endif // yyyFPNATIVE- yyyFPEMU- diff --git a/crypto_sign/falcon-512/m4-ct/fpr.h b/crypto_sign/falcon-512/m4-ct/fpr.h deleted file mode 100644 index 8176212d..00000000 --- a/crypto_sign/falcon-512/m4-ct/fpr.h +++ /dev/null @@ -1,893 +0,0 @@ -/* - * Floating-point operations. 
- * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#if FALCON_FPEMU // yyyFPEMU+1 yyyFPNATIVE+0 - -/* ====================================================================== */ -/* - * Custom floating-point implementation with integer arithmetics. We - * use IEEE-754 "binary64" format, with some simplifications: - * - * - Top bit is s = 1 for negative, 0 for positive. - * - * - Exponent e uses the next 11 bits (bits 52 to 62, inclusive). - * - * - Mantissa m uses the 52 low bits. - * - * Encoded value is, in general: (-1)^s * 2^(e-1023) * (1 + m*2^(-52)) - * i.e. 
the mantissa really is a 53-bit number (less than 2.0, but not - * less than 1.0), but the top bit (equal to 1 by definition) is omitted - * in the encoding. - * - * In IEEE-754, there are some special values: - * - * - If e = 2047, then the value is either an infinite (m = 0) or - * a NaN (m != 0). - * - * - If e = 0, then the value is either a zero (m = 0) or a subnormal, - * aka "denormalized number" (m != 0). - * - * Of these, we only need the zeros. The caller is responsible for not - * providing operands that would lead to infinites, NaNs or subnormals. - * If inputs are such that values go out of range, then indeterminate - * values are returned (it would still be deterministic, but no specific - * value may be relied upon). - * - * At the C level, the three parts are stored in a 64-bit unsigned - * word. - * - * One may note that a property of the IEEE-754 format is that order - * is preserved for positive values: if two positive floating-point - * values x and y are such that x < y, then their respective encodings - * as _signed_ 64-bit integers i64(x) and i64(y) will be such that - * i64(x) < i64(y). For negative values, order is reversed: if x < 0, - * y < 0, and x < y, then ia64(x) > ia64(y). - * - * IMPORTANT ASSUMPTIONS: - * ====================== - * - * For proper computations, and constant-time behaviour, we assume the - * following: - * - * - 32x32->64 multiplication (unsigned) has an execution time that - * is independent of its operands. This is true of most modern - * x86 and ARM cores. Notable exceptions are the ARM Cortex M0, M0+ - * and M3 (in the M0 and M0+, this is done in software, so it depends - * on that routine), and the PowerPC cores from the G3/G4 lines. - * For more info, see: https://www.bearssl.org/ctmul.html - * - * - Left-shifts and right-shifts of 32-bit values have an execution - * time which does not depend on the shifted value nor on the - * shift count. 
An historical exception is the Pentium IV, but most - * modern CPU have barrel shifters. Some small microcontrollers - * might have varying-time shifts (not the ARM Cortex M*, though). - * - * - Right-shift of a signed negative value performs a sign extension. - * As per the C standard, this operation returns an - * implementation-defined result (this is NOT an "undefined - * behaviour"). On most/all systems, an arithmetic shift is - * performed, because this is what makes most sense. - */ - -/* - * Normally we should declare the 'fpr' type to be a struct or union - * around the internal 64-bit value; however, we want to use the - * direct 64-bit integer type to enable a lighter call convention on - * ARM platforms. This means that direct (invalid) use of operators - * such as '*' or '+' will not be caught by the compiler. We rely on - * the "normal" (non-emulated) code to detect such instances. - */ -typedef uint64_t fpr; - -/* - * For computations, we split values into an integral mantissa in the - * 2^54..2^55 range, and an (adjusted) exponent. The lowest bit is - * "sticky" (it is set to 1 if any of the bits below it is 1); when - * re-encoding, the low two bits are dropped, but may induce an - * increment in the value for proper rounding. - */ - -/* - * Right-shift a 64-bit unsigned value by a possibly secret shift count. - * We assumed that the underlying architecture had a barrel shifter for - * 32-bit shifts, but for 64-bit shifts on a 32-bit system, this will - * typically invoke a software routine that is not necessarily - * constant-time; hence the function below. - * - * Shift count n MUST be in the 0..63 range. - */ -static inline uint64_t -fpr_ursh(uint64_t x, int n) -{ - x ^= (x ^ (x >> 32)) & -(uint64_t)(n >> 5); - return x >> (n & 31); -} - -/* - * Right-shift a 64-bit signed value by a possibly secret shift count - * (see fpr_ursh() for the rationale). - * - * Shift count n MUST be in the 0..63 range. 
- */ -static inline int64_t -fpr_irsh(int64_t x, int n) -{ - x ^= (x ^ (x >> 32)) & -(int64_t)(n >> 5); - return x >> (n & 31); -} - -/* - * Left-shift a 64-bit unsigned value by a possibly secret shift count - * (see fpr_ursh() for the rationale). - * - * Shift count n MUST be in the 0..63 range. - */ -static inline uint64_t -fpr_ulsh(uint64_t x, int n) -{ - x ^= (x ^ (x << 32)) & -(uint64_t)(n >> 5); - return x << (n & 31); -} - -/* - * Expectations: - * s = 0 or 1 - * exponent e is "arbitrary" and unbiased - * 2^54 <= m < 2^55 - * Numerical value is (-1)^2 * m * 2^e - * - * Exponents which are too low lead to value zero. If the exponent is - * too large, the returned value is indeterminate. - * - * If m = 0, then a zero is returned (using the provided sign). - * If e < -1076, then a zero is returned (regardless of the value of m). - * If e >= -1076 and e != 0, m must be within the expected range - * (2^54 to 2^55-1). - */ -static inline fpr -FPR(int s, int e, uint64_t m) -{ - fpr x; - uint32_t t; - unsigned f; - - /* - * If e >= -1076, then the value is "normal"; otherwise, it - * should be a subnormal, which we clamp down to zero. - */ - e += 1076; - t = (uint32_t)e >> 31; - m &= (uint64_t)t - 1; - - /* - * If m = 0 then we want a zero; make e = 0 too, but conserve - * the sign. - */ - t = (uint32_t)(m >> 54); - e &= -(int)t; - - /* - * The 52 mantissa bits come from m. Value m has its top bit set - * (unless it is a zero); we leave it "as is": the top bit will - * increment the exponent by 1, except when m = 0, which is - * exactly what we want. - */ - x = (((uint64_t)s << 63) | (m >> 2)) + ((uint64_t)(uint32_t)e << 52); - - /* - * Rounding: if the low three bits of m are 011, 110 or 111, - * then the value should be incremented to get the next - * representable value. This implements the usual - * round-to-nearest rule (with preference to even values in case - * of a tie). 
Note that the increment may make a carry spill - * into the exponent field, which is again exactly what we want - * in that case. - */ - f = (unsigned)m & 7U; - x += (0xC8U >> f) & 1; - return x; -} - -#define fpr_scaled Zf(fpr_scaled) -fpr fpr_scaled(int64_t i, int sc); - -static inline fpr -fpr_of(int64_t i) -{ - return fpr_scaled(i, 0); -} - -static const fpr fpr_q = 4667981563525332992; -static const fpr fpr_inverse_of_q = 4545632735260551042; -static const fpr fpr_inv_2sqrsigma0 = 4594603506513722306; -static const fpr fpr_inv_sigma = 4573359825155195350; -static const fpr fpr_sigma_min_9 = 4608495221497168882; -static const fpr fpr_sigma_min_10 = 4608586345619182117; -static const fpr fpr_log2 = 4604418534313441775; -static const fpr fpr_inv_log2 = 4609176140021203710; -static const fpr fpr_bnorm_max = 4670353323383631276; -static const fpr fpr_zero = 0; -static const fpr fpr_one = 4607182418800017408; -static const fpr fpr_two = 4611686018427387904; -static const fpr fpr_onehalf = 4602678819172646912; -static const fpr fpr_invsqrt2 = 4604544271217802189; -static const fpr fpr_invsqrt8 = 4600040671590431693; -static const fpr fpr_ptwo31 = 4746794007248502784; -static const fpr fpr_ptwo31m1 = 4746794007244308480; -static const fpr fpr_mtwo31m1 = 13970166044099084288U; -static const fpr fpr_ptwo63m1 = 4890909195324358656; -static const fpr fpr_mtwo63m1 = 14114281232179134464U; -static const fpr fpr_ptwo63 = 4890909195324358656; - -static inline int64_t -fpr_rint(fpr x) -{ - uint64_t m, d; - int e; - uint32_t s, dd, f; - - /* - * We assume that the value fits in -(2^63-1)..+(2^63-1). We can - * thus extract the mantissa as a 63-bit integer, then right-shift - * it as needed. - */ - m = ((x << 10) | ((uint64_t)1 << 62)) & (((uint64_t)1 << 63) - 1); - e = 1085 - ((int)(x >> 52) & 0x7FF); - - /* - * If a shift of more than 63 bits is needed, then simply set m - * to zero. This also covers the case of an input operand equal - * to zero. 
- */ - m &= -(uint64_t)((uint32_t)(e - 64) >> 31); - e &= 63; - - /* - * Right-shift m as needed. Shift count is e. Proper rounding - * mandates that: - * - If the highest dropped bit is zero, then round low. - * - If the highest dropped bit is one, and at least one of the - * other dropped bits is one, then round up. - * - If the highest dropped bit is one, and all other dropped - * bits are zero, then round up if the lowest kept bit is 1, - * or low otherwise (i.e. ties are broken by "rounding to even"). - * - * We thus first extract a word consisting of all the dropped bit - * AND the lowest kept bit; then we shrink it down to three bits, - * the lowest being "sticky". - */ - d = fpr_ulsh(m, 63 - e); - dd = (uint32_t)d | ((uint32_t)(d >> 32) & 0x1FFFFFFF); - f = (uint32_t)(d >> 61) | ((dd | -dd) >> 31); - m = fpr_ursh(m, e) + (uint64_t)((0xC8U >> f) & 1U); - - /* - * Apply the sign bit. - */ - s = (uint32_t)(x >> 63); - return ((int64_t)m ^ -(int64_t)s) + (int64_t)s; -} - -static inline int64_t -fpr_floor(fpr x) -{ - uint64_t t; - int64_t xi; - int e, cc; - - /* - * We extract the integer as a _signed_ 64-bit integer with - * a scaling factor. Since we assume that the value fits - * in the -(2^63-1)..+(2^63-1) range, we can left-shift the - * absolute value to make it in the 2^62..2^63-1 range: we - * will only need a right-shift afterwards. - */ - e = (int)(x >> 52) & 0x7FF; - t = x >> 63; - xi = (int64_t)(((x << 10) | ((uint64_t)1 << 62)) - & (((uint64_t)1 << 63) - 1)); - xi = (xi ^ -(int64_t)t) + (int64_t)t; - cc = 1085 - e; - - /* - * We perform an arithmetic right-shift on the value. This - * applies floor() semantics on both positive and negative values - * (rounding toward minus infinity). - */ - xi = fpr_irsh(xi, cc & 63); - - /* - * If the true shift count was 64 or more, then we should instead - * replace xi with 0 (if nonnegative) or -1 (if negative). 
Edge - * case: -0 will be floored to -1, not 0 (whether this is correct - * is debatable; in any case, the other functions normalize zero - * to +0). - * - * For an input of zero, the non-shifted xi was incorrect (we used - * a top implicit bit of value 1, not 0), but this does not matter - * since this operation will clamp it down. - */ - xi ^= (xi ^ -(int64_t)t) & -(int64_t)((uint32_t)(63 - cc) >> 31); - return xi; -} - -static inline int64_t -fpr_trunc(fpr x) -{ - uint64_t t, xu; - int e, cc; - - /* - * Extract the absolute value. Since we assume that the value - * fits in the -(2^63-1)..+(2^63-1) range, we can left-shift - * the absolute value into the 2^62..2^63-1 range, and then - * do a right shift afterwards. - */ - e = (int)(x >> 52) & 0x7FF; - xu = ((x << 10) | ((uint64_t)1 << 62)) & (((uint64_t)1 << 63) - 1); - cc = 1085 - e; - xu = fpr_ursh(xu, cc & 63); - - /* - * If the exponent is too low (cc > 63), then the shift was wrong - * and we must clamp the value to 0. This also covers the case - * of an input equal to zero. - */ - xu &= -(uint64_t)((uint32_t)(cc - 64) >> 31); - - /* - * Apply back the sign, if the source value is negative. - */ - t = x >> 63; - xu = (xu ^ -t) + t; - return *(int64_t *)&xu; -} - -#define fpr_add Zf(fpr_add) -fpr fpr_add(fpr x, fpr y); - -static inline fpr -fpr_sub(fpr x, fpr y) -{ - y ^= (uint64_t)1 << 63; - return fpr_add(x, y); -} - -static inline fpr -fpr_neg(fpr x) -{ - x ^= (uint64_t)1 << 63; - return x; -} - -static inline fpr -fpr_half(fpr x) -{ - /* - * To divide a value by 2, we just have to subtract 1 from its - * exponent, but we have to take care of zero. - */ - uint32_t t; - - x -= (uint64_t)1 << 52; - t = (((uint32_t)(x >> 52) & 0x7FF) + 1) >> 11; - x &= (uint64_t)t - 1; - return x; -} - -static inline fpr -fpr_double(fpr x) -{ - /* - * To double a value, we just increment by one the exponent. We - * don't care about infinites or NaNs; however, 0 is a - * special case. 
- */ - x += (uint64_t)((((unsigned)(x >> 52) & 0x7FFU) + 0x7FFU) >> 11) << 52; - return x; -} - -#define fpr_mul Zf(fpr_mul) -fpr fpr_mul(fpr x, fpr y); - -static inline fpr -fpr_sqr(fpr x) -{ - return fpr_mul(x, x); -} - -#define fpr_div Zf(fpr_div) -fpr fpr_div(fpr x, fpr y); - -static inline fpr -fpr_inv(fpr x) -{ - return fpr_div(4607182418800017408u, x); -} - -#define fpr_sqrt Zf(fpr_sqrt) -fpr fpr_sqrt(fpr x); - -static inline int -fpr_lt(fpr x, fpr y) -{ - /* - * If x >= 0 or y >= 0, a signed comparison yields the proper - * result: - * - For positive values, the order is preserved. - * - The sign bit is at the same place as in integers, so - * sign is preserved. - * - * If both x and y are negative, then the order is reversed. - * We cannot simply invert the comparison result in that case - * because it would not handle the edge case x = y properly. - */ - int cc0, cc1; - - cc0 = *(int64_t *)&x < *(int64_t *)&y; - cc1 = *(int64_t *)&x > *(int64_t *)&y; - return cc0 ^ ((cc0 ^ cc1) & (int)((x & y) >> 63)); -} - -/* - * Compute exp(x) for x such that |x| <= ln 2. We want a precision of 50 - * bits or so. - */ -#define fpr_expm_p63 Zf(fpr_expm_p63) -uint64_t fpr_expm_p63(fpr x, fpr ccs); - -#define fpr_gm_tab Zf(fpr_gm_tab) -extern const fpr fpr_gm_tab[]; - -#define fpr_p2_tab Zf(fpr_p2_tab) -extern const fpr fpr_p2_tab[]; - -/* ====================================================================== */ - -#elif FALCON_FPNATIVE // yyyFPEMU+0 yyyFPNATIVE+1 - -/* ====================================================================== */ - -#include - -/* - * We wrap the native 'double' type into a structure so that the C compiler - * complains if we inadvertently use raw arithmetic operators on the 'fpr' - * type instead of using the inline functions below. This should have no - * extra runtime cost, since all the functions below are 'inline'. 
- */ -typedef struct { double v; } fpr; - -static inline fpr -FPR(double v) -{ - fpr x; - - x.v = v; - return x; -} - -static inline fpr -fpr_of(int64_t i) -{ - return FPR((double)i); -} - -static const fpr fpr_q = { 12289.0 }; -static const fpr fpr_inverse_of_q = { 1.0 / 12289.0 }; -static const fpr fpr_inv_2sqrsigma0 = { .150865048875372721532312163019 }; -static const fpr fpr_inv_sigma = { .005819826392951607426919370871 }; -static const fpr fpr_sigma_min_9 = { 1.291500756233514568549480827642 }; -static const fpr fpr_sigma_min_10 = { 1.311734375905083682667395805765 }; -static const fpr fpr_log2 = { 0.69314718055994530941723212146 }; -static const fpr fpr_inv_log2 = { 1.4426950408889634073599246810 }; -static const fpr fpr_bnorm_max = { 16822.4121 }; -static const fpr fpr_zero = { 0.0 }; -static const fpr fpr_one = { 1.0 }; -static const fpr fpr_two = { 2.0 }; -static const fpr fpr_onehalf = { 0.5 }; -static const fpr fpr_invsqrt2 = { 0.707106781186547524400844362105 }; -static const fpr fpr_invsqrt8 = { 0.353553390593273762200422181052 }; -static const fpr fpr_ptwo31 = { 2147483648.0 }; -static const fpr fpr_ptwo31m1 = { 2147483647.0 }; -static const fpr fpr_mtwo31m1 = { -2147483647.0 }; -static const fpr fpr_ptwo63m1 = { 9223372036854775807.0 }; -static const fpr fpr_mtwo63m1 = { -9223372036854775807.0 }; -static const fpr fpr_ptwo63 = { 9223372036854775808.0 }; - -static inline int64_t -fpr_rint(fpr x) -{ - /* - * We do not want to use llrint() since it might be not - * constant-time. - * - * Suppose that x >= 0. If x >= 2^52, then it is already an - * integer. Otherwise, if x < 2^52, then computing x+2^52 will - * yield a value that will be rounded to the nearest integer - * with exactly the right rules (round-to-nearest-even). - * - * In order to have constant-time processing, we must do the - * computation for both x >= 0 and x < 0 cases, and use a - * cast to an integer to access the sign and select the proper - * value. 
Such casts also allow us to find out if |x| < 2^52. - */ - int64_t sx, tx, rp, rn, m; - uint32_t ub; - - sx = (int64_t)(x.v - 1.0); - tx = (int64_t)x.v; - rp = (int64_t)(x.v + 4503599627370496.0) - 4503599627370496; - rn = (int64_t)(x.v - 4503599627370496.0) + 4503599627370496; - - /* - * If tx >= 2^52 or tx < -2^52, then result is tx. - * Otherwise, if sx >= 0, then result is rp. - * Otherwise, result is rn. We use the fact that when x is - * close to 0 (|x| <= 0.25) then both rp and rn are correct; - * and if x is not close to 0, then trunc(x-1.0) yields the - * appropriate sign. - */ - - /* - * Clamp rp to zero if tx < 0. - * Clamp rn to zero if tx >= 0. - */ - m = sx >> 63; - rn &= m; - rp &= ~m; - - /* - * Get the 12 upper bits of tx; if they are not all zeros or - * all ones, then tx >= 2^52 or tx < -2^52, and we clamp both - * rp and rn to zero. Otherwise, we clamp tx to zero. - */ - ub = (uint32_t)((uint64_t)tx >> 52); - m = -(int64_t)((((ub + 1) & 0xFFF) - 2) >> 31); - rp &= m; - rn &= m; - tx &= ~m; - - /* - * Only one of tx, rn or rp (at most) can be non-zero at this - * point. - */ - return tx | rn | rp; -} - -static inline int64_t -fpr_floor(fpr x) -{ - int64_t r; - - /* - * The cast performs a trunc() (rounding toward 0) and thus is - * wrong by 1 for most negative values. The correction below is - * constant-time as long as the compiler turns the - * floating-point conversion result into a 0/1 integer without a - * conditional branch or another non-constant-time construction. - * This should hold on all modern architectures with an FPU (and - * if it is false on a given arch, then chances are that the FPU - * itself is not constant-time, making the point moot). 
- */ - r = (int64_t)x.v; - return r - (x.v < (double)r); -} - -static inline int64_t -fpr_trunc(fpr x) -{ - return (int64_t)x.v; -} - -static inline fpr -fpr_add(fpr x, fpr y) -{ - return FPR(x.v + y.v); -} - -static inline fpr -fpr_sub(fpr x, fpr y) -{ - return FPR(x.v - y.v); -} - -static inline fpr -fpr_neg(fpr x) -{ - return FPR(-x.v); -} - -static inline fpr -fpr_half(fpr x) -{ - return FPR(x.v * 0.5); -} - -static inline fpr -fpr_double(fpr x) -{ - return FPR(x.v + x.v); -} - -static inline fpr -fpr_mul(fpr x, fpr y) -{ - return FPR(x.v * y.v); -} - -static inline fpr -fpr_sqr(fpr x) -{ - return FPR(x.v * x.v); -} - -static inline fpr -fpr_inv(fpr x) -{ - return FPR(1.0 / x.v); -} - -static inline fpr -fpr_div(fpr x, fpr y) -{ - return FPR(x.v / y.v); -} - -#if FALCON_AVX2 // yyyAVX2+1 -TARGET_AVX2 -static inline void -fpr_sqrt_avx2(double *t) -{ - __m128d x; - - x = _mm_load1_pd(t); - x = _mm_sqrt_pd(x); - _mm_storel_pd(t, x); -} -#endif // yyyAVX2- - -static inline fpr -fpr_sqrt(fpr x) -{ - /* - * We prefer not to have a dependency on libm when it can be - * avoided. On x86, calling the sqrt() libm function inlines - * the relevant opcode (fsqrt or sqrtsd, depending on whether - * the 387 FPU or SSE2 is used for floating-point operations) - * but then makes an optional call to the library function - * for proper error handling, in case the operand is negative. - * - * To avoid this dependency, we use intrinsics or inline assembly - * on recognized platforms: - * - * - If AVX2 is explicitly enabled, then we use SSE2 intrinsics. - * - * - On GCC/Clang with SSE maths, we use SSE2 intrinsics. - * - * - On GCC/Clang on i386, or MSVC on i386, we use inline assembly - * to call the 387 FPU fsqrt opcode. - * - * - On GCC/Clang/XLC on PowerPC, we use inline assembly to call - * the fsqrt opcode (Clang needs a special hack). - * - * - On GCC/Clang on ARM with hardware floating-point, we use - * inline assembly to call the vqsrt.f64 opcode. 
Due to a - * complex ecosystem of compilers and assembly syntaxes, we - * have to call it "fsqrt" or "fsqrtd", depending on case. - * - * If the platform is not recognized, a call to the system - * library function sqrt() is performed. On some compilers, this - * may actually inline the relevant opcode, and call the library - * function only when the input is invalid (e.g. negative); - * Falcon never actually calls sqrt() on a negative value, but - * the dependency to libm will still be there. - */ - -#if FALCON_AVX2 // yyyAVX2+1 - fpr_sqrt_avx2(&x.v); - return x; -#else // yyyAVX2+0 -#if defined __GNUC__ && defined __SSE2_MATH__ - return FPR(_mm_cvtsd_f64(_mm_sqrt_pd(_mm_set1_pd(x.v)))); -#elif defined __GNUC__ && defined __i386__ - __asm__ __volatile__ ( - "fldl %0\n\t" - "fsqrt\n\t" - "fstpl %0\n\t" - : "+m" (x.v) : : ); - return x; -#elif defined _M_IX86 - __asm { - fld x.v - fsqrt - fstp x.v - } - return x; -#elif defined __PPC__ && defined __GNUC__ - fpr y; - -#if defined __clang__ - /* - * Normally we should use a 'd' constraint (register that contains - * a 'double' value) but Clang 3.8.1 chokes on it. Instead we use - * an 'f' constraint, counting on the fact that 'float' values - * are managed in double-precision registers anyway, and the - * compiler will not add extra rounding steps. - */ - __asm__ ( "fsqrt %0, %1" : "=f" (y.v) : "f" (x.v) : ); -#else - __asm__ ( "fsqrt %0, %1" : "=d" (y.v) : "d" (x.v) : ); -#endif - return y; -#elif (defined __ARM_FP && ((__ARM_FP & 0x08) == 0x08)) \ - || (!defined __ARM_FP && defined __ARM_VFPV2__) - /* - * On ARM, assembly syntaxes are a bit of a mess, depending on - * whether GCC or Clang is used, and the binutils version, and - * whether this is 32-bit or 64-bit mode. 
The code below appears - * to work on: - * 32-bit GCC-4.9.2 Clang-3.5 Binutils-2.25 - * 64-bit GCC-6.3.0 Clang-3.9 Binutils-2.28 - */ -#if defined __aarch64__ && __aarch64__ - __asm__ ( "fsqrt %d0, %d0" : "+w" (x.v) : : ); -#else - __asm__ ( "fsqrtd %P0, %P0" : "+w" (x.v) : : ); -#endif - return x; -#else - return FPR(sqrt(x.v)); -#endif -#endif // yyyAVX2- -} - -static inline int -fpr_lt(fpr x, fpr y) -{ - return x.v < y.v; -} - -TARGET_AVX2 -static inline uint64_t -fpr_expm_p63(fpr x, fpr ccs) -{ - /* - * Polynomial approximation of exp(-x) is taken from FACCT: - * https://eprint.iacr.org/2018/1234 - * Specifically, values are extracted from the implementation - * referenced from the FACCT article, and available at: - * https://github.com/raykzhao/gaussian - * Tests over more than 24 billions of random inputs in the - * 0..log(2) range have never shown a deviation larger than - * 2^(-50) from the true mathematical value. - */ - -#if FALCON_AVX2 // yyyAVX2+1 - - /* - * AVX2 implementation uses more operations than Horner's method, - * but with a lower expression tree depth. This helps because - * additions and multiplications have a latency of 4 cycles on - * a Skylake, but the CPU can issue two of them per cycle. 
- */ - - static const union { - double d[12]; - __m256d v[3]; - } c = { - { - 0.999999999999994892974086724280, - 0.500000000000019206858326015208, - 0.166666666666984014666397229121, - 0.041666666666110491190622155955, - 0.008333333327800835146903501993, - 0.001388888894063186997887560103, - 0.000198412739277311890541063977, - 0.000024801566833585381209939524, - 0.000002755586350219122514855659, - 0.000000275607356160477811864927, - 0.000000025299506379442070029551, - 0.000000002073772366009083061987 - } - }; - - double d1, d2, d4, d8, y; - __m256d d14, d58, d9c; - - d1 = -x.v; - d2 = d1 * d1; - d4 = d2 * d2; - d8 = d4 * d4; - d14 = _mm256_set_pd(d4, d2 * d1, d2, d1); - d58 = _mm256_mul_pd(d14, _mm256_set1_pd(d4)); - d9c = _mm256_mul_pd(d14, _mm256_set1_pd(d8)); - d14 = _mm256_mul_pd(d14, _mm256_loadu_pd(&c.d[0])); - d58 = FMADD(d58, _mm256_loadu_pd(&c.d[4]), d14); - d9c = FMADD(d9c, _mm256_loadu_pd(&c.d[8]), d58); - d9c = _mm256_hadd_pd(d9c, d9c); - y = 1.0 + _mm_cvtsd_f64(_mm256_castpd256_pd128(d9c)) // _mm256_cvtsd_f64(d9c) - + _mm_cvtsd_f64(_mm256_extractf128_pd(d9c, 1)); - y *= ccs.v; - - /* - * Final conversion goes through int64_t first, because that's what - * the underlying opcode (vcvttsd2si) will do, and we know that the - * result will fit, since x >= 0 and ccs < 1. If we did the - * conversion directly to uint64_t, then the compiler would add some - * extra code to cover the case of a source value of 2^63 or more, - * and though the alternate path would never be exercised, the - * extra comparison would cost us some cycles. - */ - return (uint64_t)(int64_t)(y * fpr_ptwo63.v); - -#else // yyyAVX2+0 - - /* - * Normal implementation uses Horner's method, which minimizes - * the number of operations. 
- */ - - double d, y; - - d = x.v; - y = 0.000000002073772366009083061987; - y = 0.000000025299506379442070029551 - y * d; - y = 0.000000275607356160477811864927 - y * d; - y = 0.000002755586350219122514855659 - y * d; - y = 0.000024801566833585381209939524 - y * d; - y = 0.000198412739277311890541063977 - y * d; - y = 0.001388888894063186997887560103 - y * d; - y = 0.008333333327800835146903501993 - y * d; - y = 0.041666666666110491190622155955 - y * d; - y = 0.166666666666984014666397229121 - y * d; - y = 0.500000000000019206858326015208 - y * d; - y = 0.999999999999994892974086724280 - y * d; - y = 1.000000000000000000000000000000 - y * d; - y *= ccs.v; - return (uint64_t)(y * fpr_ptwo63.v); - -#endif // yyyAVX2- -} - -#define fpr_gm_tab Zf(fpr_gm_tab) -extern const fpr fpr_gm_tab[]; - -#define fpr_p2_tab Zf(fpr_p2_tab) -extern const fpr fpr_p2_tab[]; - -/* ====================================================================== */ - -#else // yyyFPEMU+0 yyyFPNATIVE+0 - -#error No FP implementation selected - -#endif // yyyFPEMU- yyyFPNATIVE- diff --git a/crypto_sign/falcon-512/m4-ct/inner.h b/crypto_sign/falcon-512/m4-ct/inner.h deleted file mode 100644 index 1f7d0819..00000000 --- a/crypto_sign/falcon-512/m4-ct/inner.h +++ /dev/null @@ -1,1168 +0,0 @@ -#ifndef FALCON_INNER_H__ -#define FALCON_INNER_H__ - -/* - * Internal functions for Falcon. This is not the API intended to be - * used by applications; instead, this internal API provides all the - * primitives on which wrappers build to provide external APIs. 
- * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -/* - * IMPORTANT API RULES - * ------------------- - * - * This API has some non-trivial usage rules: - * - * - * - All public functions (i.e. the non-static ones) must be referenced - * with the Zf() macro (e.g. Zf(verify_raw) for the verify_raw() - * function). That macro adds a prefix to the name, which is - * configurable with the FALCON_PREFIX macro. This allows compiling - * the code into a specific "namespace" and potentially including - * several versions of this code into a single application (e.g. to - * have an AVX2 and a non-AVX2 variants and select the one to use at - * runtime based on availability of AVX2 opcodes). 
- * - * - Functions that need temporary buffers expects them as a final - * tmp[] array of type uint8_t*, with a size which is documented for - * each function. However, most have some alignment requirements, - * because they will use the array to store 16-bit, 32-bit or 64-bit - * values (e.g. uint64_t or double). The caller must ensure proper - * alignment. What happens on unaligned access depends on the - * underlying architecture, ranging from a slight time penalty - * to immediate termination of the process. - * - * - Some functions rely on specific rounding rules and precision for - * floating-point numbers. On some systems (in particular 32-bit x86 - * with the 387 FPU), this requires setting an hardware control - * word. The caller MUST use set_fpu_cw() to ensure proper precision: - * - * oldcw = set_fpu_cw(2); - * Zf(sign_dyn)(...); - * set_fpu_cw(oldcw); - * - * On systems where the native floating-point precision is already - * proper, or integer-based emulation is used, the set_fpu_cw() - * function does nothing, so it can be called systematically. - */ - -// yyyPQCLEAN+0 yyyNIST+0 yyySUPERCOP+0 -#include "config.h" -// yyyPQCLEAN- yyyNIST- yyySUPERCOP- -// yyySUPERCOP+1 -// yyyCONF* -// yyySUPERCOP- - -#include -#include -#include - -#if defined FALCON_AVX2 && FALCON_AVX2 // yyyAVX2+1 -/* - * This implementation uses AVX2 and optionally FMA intrinsics. 
- */ -#include -#ifndef FALCON_LE -#define FALCON_LE 1 -#endif -#ifndef FALCON_UNALIGNED -#define FALCON_UNALIGNED 1 -#endif -#if defined __GNUC__ -#if defined FALCON_FMA && FALCON_FMA -#define TARGET_AVX2 __attribute__((target("avx2,fma"))) -#else -#define TARGET_AVX2 __attribute__((target("avx2"))) -#endif -#elif defined _MSC_VER && _MSC_VER -#pragma warning( disable : 4752 ) -#endif -#if defined FALCON_FMA && FALCON_FMA -#define FMADD(a, b, c) _mm256_fmadd_pd(a, b, c) -#define FMSUB(a, b, c) _mm256_fmsub_pd(a, b, c) -#else -#define FMADD(a, b, c) _mm256_add_pd(_mm256_mul_pd(a, b), c) -#define FMSUB(a, b, c) _mm256_sub_pd(_mm256_mul_pd(a, b), c) -#endif -#endif // yyyAVX2- - -// yyyNIST+0 yyyPQCLEAN+0 -/* - * On MSVC, disable warning about applying unary minus on an unsigned - * type: this is perfectly defined standard behaviour and we do it - * quite often. - */ -#if defined _MSC_VER && _MSC_VER -#pragma warning( disable : 4146 ) -#endif - -// yyySUPERCOP+0 -/* - * Enable ARM assembly on any ARMv7m platform (if it was not done before). 
- */ -#ifndef FALCON_ASM_CORTEXM4 -#if (defined __ARM_ARCH_7EM__ && __ARM_ARCH_7EM__) \ - && (defined __ARM_FEATURE_DSP && __ARM_FEATURE_DSP) -#define FALCON_ASM_CORTEXM4 1 -#else -#define FALCON_ASM_CORTEXM4 0 -#endif -#endif -// yyySUPERCOP- - -#if defined __i386__ || defined _M_IX86 \ - || defined __x86_64__ || defined _M_X64 || \ - (defined _ARCH_PWR8 && \ - (defined __LITTLE_ENDIAN || defined __LITTLE_ENDIAN__)) - -#ifndef FALCON_LE -#define FALCON_LE 1 -#endif -#ifndef FALCON_UNALIGNED -#define FALCON_UNALIGNED 1 -#endif - -#elif defined FALCON_ASM_CORTEXM4 && FALCON_ASM_CORTEXM4 - -#ifndef FALCON_LE -#define FALCON_LE 1 -#endif -#ifndef FALCON_UNALIGNED -#define FALCON_UNALIGNED 0 -#endif - -#elif (defined __LITTLE_ENDIAN__ && __LITTLE_ENDIAN__) \ - || (defined __BYTE_ORDER__ && defined __ORDER_LITTLE_ENDIAN__ \ - && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) - -#ifndef FALCON_LE -#define FALCON_LE 1 -#endif -#ifndef FALCON_UNALIGNED -#define FALCON_UNALIGNED 0 -#endif - -#else - -#ifndef FALCON_LE -#define FALCON_LE 0 -#endif -#ifndef FALCON_UNALIGNED -#define FALCON_UNALIGNED 0 -#endif - -#endif - -/* - * We ensure that both FALCON_FPEMU and FALCON_FPNATIVE are defined, - * with compatible values (exactly one of them must be non-zero). - * If none is defined, then default FP implementation is 'native' - * except on ARM Cortex M4. 
- */ -#if !defined FALCON_FPEMU && !defined FALCON_FPNATIVE - -#if (defined __ARM_FP && ((__ARM_FP & 0x08) == 0x08)) \ - || (!defined __ARM_FP && defined __ARM_VFPV2__) -#define FALCON_FPEMU 0 -#define FALCON_FPNATIVE 1 -#elif defined FALCON_ASM_CORTEXM4 && FALCON_ASM_CORTEXM4 -#define FALCON_FPEMU 1 -#define FALCON_FPNATIVE 0 -#else -#define FALCON_FPEMU 0 -#define FALCON_FPNATIVE 1 -#endif - -#elif defined FALCON_FPEMU && !defined FALCON_FPNATIVE - -#if FALCON_FPEMU -#define FALCON_FPNATIVE 0 -#else -#define FALCON_FPNATIVE 1 -#endif - -#elif defined FALCON_FPNATIVE && !defined FALCON_FPEMU - -#if FALCON_FPNATIVE -#define FALCON_FPEMU 0 -#else -#define FALCON_FPEMU 1 -#endif - -#endif - -#if (FALCON_FPEMU && FALCON_FPNATIVE) || (!FALCON_FPEMU && !FALCON_FPNATIVE) -#error Exactly one of FALCON_FPEMU and FALCON_FPNATIVE must be selected -#endif - -// yyySUPERCOP+0 -/* - * For seed generation from the operating system: - * - On Linux and glibc-2.25+, FreeBSD 12+ and OpenBSD, use getentropy(). - * - On Unix-like systems, use /dev/urandom (including as a fallback - * for failed getentropy() calls). - * - On Windows, use CryptGenRandom(). 
- */ - -#ifndef FALCON_RAND_GETENTROPY -#if (defined __linux__ && defined __GLIBC__ \ - && (__GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 25))) \ - || (defined __FreeBSD__ && __FreeBSD__ >= 12) \ - || defined __OpenBSD__ -#define FALCON_RAND_GETENTROPY 1 -#else -#define FALCON_RAND_GETENTROPY 0 -#endif -#endif - -#ifndef FALCON_RAND_URANDOM -#if defined _AIX \ - || defined __ANDROID__ \ - || defined __FreeBSD__ \ - || defined __NetBSD__ \ - || defined __OpenBSD__ \ - || defined __DragonFly__ \ - || defined __linux__ \ - || (defined __sun && (defined __SVR4 || defined __svr4__)) \ - || (defined __APPLE__ && defined __MACH__) -#define FALCON_RAND_URANDOM 1 -#else -#define FALCON_RAND_URANDOM 0 -#endif -#endif - -#ifndef FALCON_RAND_WIN32 -#if defined _WIN32 || defined _WIN64 -#define FALCON_RAND_WIN32 1 -#else -#define FALCON_RAND_WIN32 0 -#endif -#endif -// yyySUPERCOP- - -/* - * For still undefined compile-time macros, define them to 0 to avoid - * warnings with -Wundef. - */ -#ifndef FALCON_AVX2 -#define FALCON_AVX2 0 -#endif -#ifndef FALCON_FMA -#define FALCON_FMA 0 -#endif -#ifndef FALCON_KG_CHACHA20 -#define FALCON_KG_CHACHA20 0 -#endif -// yyyNIST- yyyPQCLEAN- - -// yyyPQCLEAN+0 yyySUPERCOP+0 -/* - * "Naming" macro used to apply a consistent prefix over all global - * symbols. - */ -#ifndef FALCON_PREFIX -#define FALCON_PREFIX falcon_inner -#endif -#define Zf(name) Zf_(FALCON_PREFIX, name) -#define Zf_(prefix, name) Zf__(prefix, name) -#define Zf__(prefix, name) prefix ## _ ## name -// yyyPQCLEAN- yyySUPERCOP- - -// yyyAVX2+1 -/* - * We use the TARGET_AVX2 macro to tag some functions which, in some - * configurations, may use AVX2 and FMA intrinsics; this depends on - * the compiler. In all other cases, we just define it to emptiness - * (i.e. it will have no effect). 
- */ -#ifndef TARGET_AVX2 -#define TARGET_AVX2 -#endif -// yyyAVX2- - -/* - * Some computations with floating-point elements, in particular - * rounding to the nearest integer, rely on operations using _exactly_ - * the precision of IEEE-754 binary64 type (i.e. 52 bits). On 32-bit - * x86, the 387 FPU may be used (depending on the target OS) and, in - * that case, may use more precision bits (i.e. 64 bits, for an 80-bit - * total type length); to prevent miscomputations, we define an explicit - * function that modifies the precision in the FPU control word. - * - * set_fpu_cw() sets the precision to the provided value, and returns - * the previously set precision; callers are supposed to restore the - * previous precision on exit. The correct (52-bit) precision is - * configured with the value "2". On unsupported compilers, or on - * targets other than 32-bit x86, or when the native 'double' type is - * not used, the set_fpu_cw() function does nothing at all. - */ -#if FALCON_FPNATIVE // yyyFPNATIVE+1 -#if defined __GNUC__ && defined __i386__ -static inline unsigned -set_fpu_cw(unsigned x) -{ - unsigned short t; - unsigned old; - - __asm__ __volatile__ ("fstcw %0" : "=m" (t) : : ); - old = (t & 0x0300u) >> 8; - t = (unsigned short)((t & ~0x0300u) | (x << 8)); - __asm__ __volatile__ ("fldcw %0" : : "m" (t) : ); - return old; -} -#elif defined _M_IX86 -static inline unsigned -set_fpu_cw(unsigned x) -{ - unsigned short t; - unsigned old; - - __asm { fstcw t } - old = (t & 0x0300u) >> 8; - t = (unsigned short)((t & ~0x0300u) | (x << 8)); - __asm { fldcw t } - return old; -} -#else -static inline unsigned -set_fpu_cw(unsigned x) -{ - return x; -} -#endif -#else // yyyFPNATIVE+0 -static inline unsigned -set_fpu_cw(unsigned x) -{ - return x; -} -#endif // yyyFPNATIVE- - -#if FALCON_FPNATIVE && !FALCON_AVX2 // yyyFPNATIVE+1 yyyAVX2+0 -/* - * If using the native 'double' type but not AVX2 code, on an x86 - * machine with SSE2 activated for maths, then we will use the - * 
SSE2 intrinsics. - */ -#if defined __GNUC__ && defined __SSE2_MATH__ -#include -#endif -#endif // yyyFPNATIVE- yyyAVX2- - -#if FALCON_FPNATIVE // yyyFPNATIVE+1 -/* - * For optimal reproducibility of values, we need to disable contraction - * of floating-point expressions; otherwise, on some architectures (e.g. - * PowerPC), the compiler may generate fused-multiply-add opcodes that - * may round differently than two successive separate opcodes. C99 defines - * a standard pragma for that, but GCC-6.2.2 appears to ignore it, - * hence the GCC-specific pragma (that Clang does not support). - */ -#if defined __clang__ -#pragma STDC FP_CONTRACT OFF -#elif defined __GNUC__ -#pragma GCC optimize ("fp-contract=off") -#endif -#endif // yyyFPNATIVE- - -// yyyPQCLEAN+0 -/* - * MSVC 2015 does not know the C99 keyword 'restrict'. - */ -#if defined _MSC_VER && _MSC_VER -#ifndef restrict -#define restrict __restrict -#endif -#endif -// yyyPQCLEAN- - -/* ==================================================================== */ -/* - * SHAKE256 implementation (shake.c). - * - * API is defined to be easily replaced with the fips202.h API defined - * as part of PQClean. 
- */ - -// yyyPQCLEAN+0 -/* -typedef struct { - union { - uint64_t A[25]; - uint8_t dbuf[200]; - } st; - uint64_t dptr; -} inner_shake256_context; - -#define inner_shake256_init Zf(i_shake256_init) -#define inner_shake256_inject Zf(i_shake256_inject) -#define inner_shake256_flip Zf(i_shake256_flip) -#define inner_shake256_extract Zf(i_shake256_extract) - -void Zf(i_shake256_init)( - inner_shake256_context *sc); -void Zf(i_shake256_inject)( - inner_shake256_context *sc, const uint8_t *in, size_t len); -void Zf(i_shake256_flip)( - inner_shake256_context *sc); -void Zf(i_shake256_extract)( - inner_shake256_context *sc, uint8_t *out, size_t len); -*/ - -// yyyPQCLEAN+1 - -#include "fips202.h" - -#define inner_shake256_context shake256incctx -#define inner_shake256_init(sc) shake256_inc_init(sc) -#define inner_shake256_inject(sc, in, len) shake256_inc_absorb(sc, in, len) -#define inner_shake256_flip(sc) shake256_inc_finalize(sc) -#define inner_shake256_extract(sc, out, len) shake256_inc_squeeze(out, len, sc) - -// yyyPQCLEAN+0 - -// yyyPQCLEAN- - -/* ==================================================================== */ -/* - * Encoding/decoding functions (codec.c). - * - * Encoding functions take as parameters an output buffer (out) with - * a given maximum length (max_out_len); returned value is the actual - * number of bytes which have been written. If the output buffer is - * not large enough, then 0 is returned (some bytes may have been - * written to the buffer). If 'out' is NULL, then 'max_out_len' is - * ignored; instead, the function computes and returns the actual - * required output length (in bytes). - * - * Decoding functions take as parameters an input buffer (in) with - * its maximum length (max_in_len); returned value is the actual number - * of bytes that have been read from the buffer. If the provided length - * is too short, then 0 is returned. - * - * Values to encode or decode are vectors of integers, with N = 2^logn - * elements. 
- * - * Three encoding formats are defined: - * - * - modq: sequence of values modulo 12289, each encoded over exactly - * 14 bits. The encoder and decoder verify that integers are within - * the valid range (0..12288). Values are arrays of uint16. - * - * - trim: sequence of signed integers, a specified number of bits - * each. The number of bits is provided as parameter and includes - * the sign bit. Each integer x must be such that |x| < 2^(bits-1) - * (which means that the -2^(bits-1) value is forbidden); encode and - * decode functions check that property. Values are arrays of - * int16_t or int8_t, corresponding to names 'trim_i16' and - * 'trim_i8', respectively. - * - * - comp: variable-length encoding for signed integers; each integer - * uses a minimum of 9 bits, possibly more. This is normally used - * only for signatures. - * - */ - -size_t Zf(modq_encode)(void *out, size_t max_out_len, - const uint16_t *x, unsigned logn); -size_t Zf(trim_i16_encode)(void *out, size_t max_out_len, - const int16_t *x, unsigned logn, unsigned bits); -size_t Zf(trim_i8_encode)(void *out, size_t max_out_len, - const int8_t *x, unsigned logn, unsigned bits); -size_t Zf(comp_encode)(void *out, size_t max_out_len, - const int16_t *x, unsigned logn); - -size_t Zf(modq_decode)(uint16_t *x, unsigned logn, - const void *in, size_t max_in_len); -size_t Zf(trim_i16_decode)(int16_t *x, unsigned logn, unsigned bits, - const void *in, size_t max_in_len); -size_t Zf(trim_i8_decode)(int8_t *x, unsigned logn, unsigned bits, - const void *in, size_t max_in_len); -size_t Zf(comp_decode)(int16_t *x, unsigned logn, - const void *in, size_t max_in_len); - -/* - * Number of bits for key elements, indexed by logn (1 to 10). This - * is at most 8 bits for all degrees, but some degrees may have shorter - * elements. - */ -extern const uint8_t Zf(max_fg_bits)[]; -extern const uint8_t Zf(max_FG_bits)[]; - -/* - * Maximum size, in bits, of elements in a signature, indexed by logn - * (1 to 10). 
The size includes the sign bit. - */ -extern const uint8_t Zf(max_sig_bits)[]; - -/* ==================================================================== */ -/* - * Support functions used for both signature generation and signature - * verification (common.c). - */ - -/* - * From a SHAKE256 context (must be already flipped), produce a new - * point. This is the non-constant-time version, which may leak enough - * information to serve as a stop condition on a brute force attack on - * the hashed message (provided that the nonce value is known). - */ -void Zf(hash_to_point_vartime)(inner_shake256_context *sc, - uint16_t *x, unsigned logn); - -/* - * From a SHAKE256 context (must be already flipped), produce a new - * point. The temporary buffer (tmp) must have room for 2*2^logn bytes. - * This function is constant-time but is typically more expensive than - * Zf(hash_to_point_vartime)(). - * - * tmp[] must have 16-bit alignment. - */ -void Zf(hash_to_point_ct)(inner_shake256_context *sc, - uint16_t *x, unsigned logn, uint8_t *tmp); - -/* - * Tell whether a given vector (2N coordinates, in two halves) is - * acceptable as a signature. This compares the appropriate norm of the - * vector with the acceptance bound. Returned value is 1 on success - * (vector is short enough to be acceptable), 0 otherwise. - */ -int Zf(is_short)(const int16_t *s1, const int16_t *s2, unsigned logn); - -/* - * Tell whether a given vector (2N coordinates, in two halves) is - * acceptable as a signature. Instead of the first half s1, this - * function receives the "saturated squared norm" of s1, i.e. the - * sum of the squares of the coordinates of s1 (saturated at 2^32-1 - * if the sum exceeds 2^31-1). - * - * Returned value is 1 on success (vector is short enough to be - * acceptable), 0 otherwise. 
- */ -int Zf(is_short_half)(uint32_t sqn, const int16_t *s2, unsigned logn); - -/* ==================================================================== */ -/* - * Signature verification functions (vrfy.c). - */ - -/* - * Convert a public key to NTT + Montgomery format. Conversion is done - * in place. - */ -void Zf(to_ntt_monty)(uint16_t *h, unsigned logn); - -/* - * Internal signature verification code: - * c0[] contains the hashed nonce+message - * s2[] is the decoded signature - * h[] contains the public key, in NTT + Montgomery format - * logn is the degree log - * tmp[] temporary, must have at least 2*2^logn bytes - * Returned value is 1 on success, 0 on error. - * - * tmp[] must have 16-bit alignment. - */ -int Zf(verify_raw)(const uint16_t *c0, const int16_t *s2, - const uint16_t *h, unsigned logn, uint8_t *tmp); - -/* - * Compute the public key h[], given the private key elements f[] and - * g[]. This computes h = g/f mod phi mod q, where phi is the polynomial - * modulus. This function returns 1 on success, 0 on error (an error is - * reported if f is not invertible mod phi mod q). - * - * The tmp[] array must have room for at least 2*2^logn elements. - * tmp[] must have 16-bit alignment. - */ -int Zf(compute_public)(uint16_t *h, - const int8_t *f, const int8_t *g, unsigned logn, uint8_t *tmp); - -/* - * Recompute the fourth private key element. Private key consists in - * four polynomials with small coefficients f, g, F and G, which are - * such that fG - gF = q mod phi; furthermore, f is invertible modulo - * phi and modulo q. This function recomputes G from f, g and F. - * - * The tmp[] array must have room for at least 4*2^logn bytes. - * - * Returned value is 1 in success, 0 on error (f not invertible). - * tmp[] must have 16-bit alignment. - */ -int Zf(complete_private)(int8_t *G, - const int8_t *f, const int8_t *g, const int8_t *F, - unsigned logn, uint8_t *tmp); - -/* - * Test whether a given polynomial is invertible modulo phi and q. 
- * Polynomial coefficients are small integers. - * - * tmp[] must have 16-bit alignment. - */ -int Zf(is_invertible)( - const int16_t *s2, unsigned logn, uint8_t *tmp); - -/* - * Count the number of elements of value zero in the NTT representation - * of the given polynomial: this is the number of primitive 2n-th roots - * of unity (modulo q = 12289) that are roots of the provided polynomial - * (taken modulo q). - * - * tmp[] must have 16-bit alignment. - */ -int Zf(count_nttzero)(const int16_t *sig, unsigned logn, uint8_t *tmp); - -/* - * Internal signature verification with public key recovery: - * h[] receives the public key (NOT in NTT/Montgomery format) - * c0[] contains the hashed nonce+message - * s1[] is the first signature half - * s2[] is the second signature half - * logn is the degree log - * tmp[] temporary, must have at least 2*2^logn bytes - * Returned value is 1 on success, 0 on error. Success is returned if - * the signature is a short enough vector; in that case, the public - * key has been written to h[]. However, the caller must still - * verify that h[] is the correct value (e.g. with regards to a known - * hash of the public key). - * - * h[] may not overlap with any of the other arrays. - * - * tmp[] must have 16-bit alignment. - */ -int Zf(verify_recover)(uint16_t *h, - const uint16_t *c0, const int16_t *s1, const int16_t *s2, - unsigned logn, uint8_t *tmp); - -/* ==================================================================== */ -/* - * Implementation of floating-point real numbers (fpr.h, fpr.c). - */ - -/* - * Real numbers are implemented by an extra header file, included below. - * This is meant to support pluggable implementations. The default - * implementation relies on the C type 'double'. 
- * - * The included file must define the following types, functions and - * constants: - * - * fpr - * type for a real number - * - * fpr fpr_of(int64_t i) - * cast an integer into a real number; source must be in the - * -(2^63-1)..+(2^63-1) range - * - * fpr fpr_scaled(int64_t i, int sc) - * compute i*2^sc as a real number; source 'i' must be in the - * -(2^63-1)..+(2^63-1) range - * - * fpr fpr_ldexp(fpr x, int e) - * compute x*2^e - * - * int64_t fpr_rint(fpr x) - * round x to the nearest integer; x must be in the -(2^63-1) - * to +(2^63-1) range - * - * int64_t fpr_trunc(fpr x) - * round to an integer; this rounds towards zero; value must - * be in the -(2^63-1) to +(2^63-1) range - * - * fpr fpr_add(fpr x, fpr y) - * compute x + y - * - * fpr fpr_sub(fpr x, fpr y) - * compute x - y - * - * fpr fpr_neg(fpr x) - * compute -x - * - * fpr fpr_half(fpr x) - * compute x/2 - * - * fpr fpr_double(fpr x) - * compute x*2 - * - * fpr fpr_mul(fpr x, fpr y) - * compute x * y - * - * fpr fpr_sqr(fpr x) - * compute x * x - * - * fpr fpr_inv(fpr x) - * compute 1/x - * - * fpr fpr_div(fpr x, fpr y) - * compute x/y - * - * fpr fpr_sqrt(fpr x) - * compute the square root of x - * - * int fpr_lt(fpr x, fpr y) - * return 1 if x < y, 0 otherwise - * - * uint64_t fpr_expm_p63(fpr x) - * return exp(x), assuming that 0 <= x < log(2). Returned value - * is scaled to 63 bits (i.e. it really returns 2^63*exp(-x), - * rounded to the nearest integer). Computation should have a - * precision of at least 45 bits. 
- * - * const fpr fpr_gm_tab[] - * array of constants for FFT / iFFT - * - * const fpr fpr_p2_tab[] - * precomputed powers of 2 (by index, 0 to 10) - * - * Constants of type 'fpr': - * - * fpr fpr_q 12289 - * fpr fpr_inverse_of_q 1/12289 - * fpr fpr_inv_2sqrsigma0 1/(2*(1.8205^2)) - * fpr fpr_inv_sigma 1/(1.55*sqrt(12289)) - * fpr fpr_sigma_min_9 1.291500756233514568549480827642 - * fpr fpr_sigma_min_10 1.311734375905083682667395805765 - * fpr fpr_log2 log(2) - * fpr fpr_inv_log2 1/log(2) - * fpr fpr_bnorm_max 16822.4121 - * fpr fpr_zero 0 - * fpr fpr_one 1 - * fpr fpr_two 2 - * fpr fpr_onehalf 0.5 - * fpr fpr_ptwo31 2^31 - * fpr fpr_ptwo31m1 2^31-1 - * fpr fpr_mtwo31m1 -(2^31-1) - * fpr fpr_ptwo63m1 2^63-1 - * fpr fpr_mtwo63m1 -(2^63-1) - * fpr fpr_ptwo63 2^63 - */ -#include "fpr.h" - -/* ==================================================================== */ -/* - * RNG (rng.c). - * - * A PRNG based on ChaCha20 is implemented; it is seeded from a SHAKE256 - * context (flipped) and is used for bulk pseudorandom generation. - * A system-dependent seed generator is also provided. - */ - -/* - * Obtain a random seed from the system RNG. - * - * Returned value is 1 on success, 0 on error. - */ -int Zf(get_seed)(void *seed, size_t seed_len); - -/* - * Structure for a PRNG. This includes a large buffer so that values - * get generated in advance. The 'state' is used to keep the current - * PRNG algorithm state (contents depend on the selected algorithm). - * - * The unions with 'dummy_u64' are there to ensure proper alignment for - * 64-bit direct access. - */ -typedef struct { - union { - uint8_t d[512]; /* MUST be 512, exactly */ - uint64_t dummy_u64; - } buf; - size_t ptr; - union { - uint8_t d[256]; - uint64_t dummy_u64; - } state; - int type; -} prng; - -/* - * Instantiate a PRNG. That PRNG will feed over the provided SHAKE256 - * context (in "flipped" state) to obtain its initial state. 
- */ -void Zf(prng_init)(prng *p, inner_shake256_context *src); - -/* - * Refill the PRNG buffer. This is normally invoked automatically, and - * is declared here only so that prng_get_u64() may be inlined. - */ -void Zf(prng_refill)(prng *p); - -/* - * Get some bytes from a PRNG. - */ -void Zf(prng_get_bytes)(prng *p, void *dst, size_t len); - -/* - * Get a 64-bit random value from a PRNG. - */ -static inline uint64_t -prng_get_u64(prng *p) -{ - size_t u; - - /* - * If there are less than 9 bytes in the buffer, we refill it. - * This means that we may drop the last few bytes, but this allows - * for faster extraction code. Also, it means that we never leave - * an empty buffer. - */ - u = p->ptr; - if (u >= (sizeof p->buf.d) - 9) { - Zf(prng_refill)(p); - u = 0; - } - p->ptr = u + 8; - - /* - * On systems that use little-endian encoding and allow - * unaligned accesses, we can simply read the data where it is. - */ -#if FALCON_LE && FALCON_UNALIGNED // yyyLEU+1 - return *(uint64_t *)(p->buf.d + u); -#else // yyyLEU+0 - return (uint64_t)p->buf.d[u + 0] - | ((uint64_t)p->buf.d[u + 1] << 8) - | ((uint64_t)p->buf.d[u + 2] << 16) - | ((uint64_t)p->buf.d[u + 3] << 24) - | ((uint64_t)p->buf.d[u + 4] << 32) - | ((uint64_t)p->buf.d[u + 5] << 40) - | ((uint64_t)p->buf.d[u + 6] << 48) - | ((uint64_t)p->buf.d[u + 7] << 56); -#endif // yyyLEU- -} - -/* - * Get an 8-bit random value from a PRNG. - */ -static inline unsigned -prng_get_u8(prng *p) -{ - unsigned v; - - v = p->buf.d[p->ptr ++]; - if (p->ptr == sizeof p->buf.d) { - Zf(prng_refill)(p); - } - return v; -} - -/* ==================================================================== */ -/* - * FFT (falcon-fft.c). - * - * A real polynomial is represented as an array of N 'fpr' elements. - * The FFT representation of a real polynomial contains N/2 complex - * elements; each is stored as two real numbers, for the real and - * imaginary parts, respectively. See falcon-fft.c for details on the - * internal representation. 
- */ - -/* - * Compute FFT in-place: the source array should contain a real - * polynomial (N coefficients); its storage area is reused to store - * the FFT representation of that polynomial (N/2 complex numbers). - * - * 'logn' MUST lie between 1 and 10 (inclusive). - */ -void Zf(FFT)(fpr *f, unsigned logn); - -/* - * Compute the inverse FFT in-place: the source array should contain the - * FFT representation of a real polynomial (N/2 elements); the resulting - * real polynomial (N coefficients of type 'fpr') is written over the - * array. - * - * 'logn' MUST lie between 1 and 10 (inclusive). - */ -void Zf(iFFT)(fpr *f, unsigned logn); - -/* - * Add polynomial b to polynomial a. a and b MUST NOT overlap. This - * function works in both normal and FFT representations. - */ -void Zf(poly_add)(fpr *restrict a, const fpr *restrict b, unsigned logn); - -/* - * Subtract polynomial b from polynomial a. a and b MUST NOT overlap. This - * function works in both normal and FFT representations. - */ -void Zf(poly_sub)(fpr *restrict a, const fpr *restrict b, unsigned logn); - -/* - * Negate polynomial a. This function works in both normal and FFT - * representations. - */ -void Zf(poly_neg)(fpr *a, unsigned logn); - -/* - * Compute adjoint of polynomial a. This function works only in FFT - * representation. - */ -void Zf(poly_adj_fft)(fpr *a, unsigned logn); - -/* - * Multiply polynomial a with polynomial b. a and b MUST NOT overlap. - * This function works only in FFT representation. - */ -void Zf(poly_mul_fft)(fpr *restrict a, const fpr *restrict b, unsigned logn); - -/* - * Multiply polynomial a with the adjoint of polynomial b. a and b MUST NOT - * overlap. This function works only in FFT representation. - */ -void Zf(poly_muladj_fft)(fpr *restrict a, const fpr *restrict b, unsigned logn); - -/* - * Multiply polynomial with its own adjoint. This function works only in FFT - * representation. 
- */ -void Zf(poly_mulselfadj_fft)(fpr *a, unsigned logn); - -/* - * Multiply polynomial with a real constant. This function works in both - * normal and FFT representations. - */ -void Zf(poly_mulconst)(fpr *a, fpr x, unsigned logn); - -/* - * Divide polynomial a by polynomial b, modulo X^N+1 (FFT representation). - * a and b MUST NOT overlap. - */ -void Zf(poly_div_fft)(fpr *restrict a, const fpr *restrict b, unsigned logn); - -/* - * Given f and g (in FFT representation), compute 1/(f*adj(f)+g*adj(g)) - * (also in FFT representation). Since the result is auto-adjoint, all its - * coordinates in FFT representation are real; as such, only the first N/2 - * values of d[] are filled (the imaginary parts are skipped). - * - * Array d MUST NOT overlap with either a or b. - */ -void Zf(poly_invnorm2_fft)(fpr *restrict d, - const fpr *restrict a, const fpr *restrict b, unsigned logn); - -/* - * Given F, G, f and g (in FFT representation), compute F*adj(f)+G*adj(g) - * (also in FFT representation). Destination d MUST NOT overlap with - * any of the source arrays. - */ -void Zf(poly_add_muladj_fft)(fpr *restrict d, - const fpr *restrict F, const fpr *restrict G, - const fpr *restrict f, const fpr *restrict g, unsigned logn); - -/* - * Multiply polynomial a by polynomial b, where b is autoadjoint. Both - * a and b are in FFT representation. Since b is autoadjoint, all its - * FFT coefficients are real, and the array b contains only N/2 elements. - * a and b MUST NOT overlap. - */ -void Zf(poly_mul_autoadj_fft)(fpr *restrict a, - const fpr *restrict b, unsigned logn); - -/* - * Divide polynomial a by polynomial b, where b is autoadjoint. Both - * a and b are in FFT representation. Since b is autoadjoint, all its - * FFT coefficients are real, and the array b contains only N/2 elements. - * a and b MUST NOT overlap. 
- */ -void Zf(poly_div_autoadj_fft)(fpr *restrict a, - const fpr *restrict b, unsigned logn); - -/* - * Perform an LDL decomposition of an auto-adjoint matrix G, in FFT - * representation. On input, g00, g01 and g11 are provided (where the - * matrix G = [[g00, g01], [adj(g01), g11]]). On output, the d00, l10 - * and d11 values are written in g00, g01 and g11, respectively - * (with D = [[d00, 0], [0, d11]] and L = [[1, 0], [l10, 1]]). - * (In fact, d00 = g00, so the g00 operand is left unmodified.) - */ -void Zf(poly_LDL_fft)(const fpr *restrict g00, - fpr *restrict g01, fpr *restrict g11, unsigned logn); - -/* - * Perform an LDL decomposition of an auto-adjoint matrix G, in FFT - * representation. This is identical to poly_LDL_fft() except that - * g00, g01 and g11 are unmodified; the outputs d11 and l10 are written - * in two other separate buffers provided as extra parameters. - */ -void Zf(poly_LDLmv_fft)(fpr *restrict d11, fpr *restrict l10, - const fpr *restrict g00, const fpr *restrict g01, - const fpr *restrict g11, unsigned logn); - -/* - * Apply "split" operation on a polynomial in FFT representation: - * f = f0(x^2) + x*f1(x^2), for half-size polynomials f0 and f1 - * (polynomials modulo X^(N/2)+1). f0, f1 and f MUST NOT overlap. - */ -void Zf(poly_split_fft)(fpr *restrict f0, fpr *restrict f1, - const fpr *restrict f, unsigned logn); - -/* - * Apply "merge" operation on two polynomials in FFT representation: - * given f0 and f1, polynomials moduo X^(N/2)+1, this function computes - * f = f0(x^2) + x*f1(x^2), in FFT representation modulo X^N+1. - * f MUST NOT overlap with either f0 or f1. - */ -void Zf(poly_merge_fft)(fpr *restrict f, - const fpr *restrict f0, const fpr *restrict f1, unsigned logn); - -/* ==================================================================== */ -/* - * Key pair generation. - */ - -/* - * Required sizes of the temporary buffer (in bytes). 
- * - * This size is 28*2^logn bytes, except for degrees 2 and 4 (logn = 1 - * or 2) where it is slightly greater. - */ -#define FALCON_KEYGEN_TEMP_1 136 -#define FALCON_KEYGEN_TEMP_2 272 -#define FALCON_KEYGEN_TEMP_3 224 -#define FALCON_KEYGEN_TEMP_4 448 -#define FALCON_KEYGEN_TEMP_5 896 -#define FALCON_KEYGEN_TEMP_6 1792 -#define FALCON_KEYGEN_TEMP_7 3584 -#define FALCON_KEYGEN_TEMP_8 7168 -#define FALCON_KEYGEN_TEMP_9 14336 -#define FALCON_KEYGEN_TEMP_10 28672 - -/* - * Generate a new key pair. Randomness is extracted from the provided - * SHAKE256 context, which must have already been seeded and flipped. - * The tmp[] array must have suitable size (see FALCON_KEYGEN_TEMP_* - * macros) and be aligned for the uint32_t, uint64_t and fpr types. - * - * The private key elements are written in f, g, F and G, and the - * public key is written in h. Either or both of G and h may be NULL, - * in which case the corresponding element is not returned (they can - * be recomputed from f, g and F). - * - * tmp[] must have 64-bit alignment. - * This function uses floating-point rounding (see set_fpu_cw()). - */ -void Zf(keygen)(inner_shake256_context *rng, - int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h, - unsigned logn, uint8_t *tmp); - -/* ==================================================================== */ -/* - * Signature generation. - */ - -/* - * Expand a private key into the B0 matrix in FFT representation and - * the LDL tree. All the values are written in 'expanded_key', for - * a total of (8*logn+40)*2^logn bytes. - * - * The tmp[] array must have room for at least 48*2^logn bytes. - * - * tmp[] must have 64-bit alignment. - * This function uses floating-point rounding (see set_fpu_cw()). 
- */ -void Zf(expand_privkey)(fpr *restrict expanded_key, - const int8_t *f, const int8_t *g, const int8_t *F, const int8_t *G, - unsigned logn, uint8_t *restrict tmp); - -/* - * Compute a signature over the provided hashed message (hm); the - * signature value is one short vector. This function uses an - * expanded key (as generated by Zf(expand_privkey)()). - * - * The sig[] and hm[] buffers may overlap. - * - * On successful output, the start of the tmp[] buffer contains the s1 - * vector (as int16_t elements). - * - * The minimal size (in bytes) of tmp[] is 48*2^logn bytes. - * - * tmp[] must have 64-bit alignment. - * This function uses floating-point rounding (see set_fpu_cw()). - */ -void Zf(sign_tree)(int16_t *sig, inner_shake256_context *rng, - const fpr *restrict expanded_key, - const uint16_t *hm, unsigned logn, uint8_t *tmp); - -/* - * Compute a signature over the provided hashed message (hm); the - * signature value is one short vector. This function uses a raw - * key and dynamically recompute the B0 matrix and LDL tree; this - * saves RAM since there is no needed for an expanded key, but - * increases the signature cost. - * - * The sig[] and hm[] buffers may overlap. - * - * On successful output, the start of the tmp[] buffer contains the s1 - * vector (as int16_t elements). - * - * The minimal size (in bytes) of tmp[] is 72*2^logn bytes. - * - * tmp[] must have 64-bit alignment. - * This function uses floating-point rounding (see set_fpu_cw()). - */ -void Zf(sign_dyn)(int16_t *sig, inner_shake256_context *rng, - const int8_t *restrict f, const int8_t *restrict g, - const int8_t *restrict F, const int8_t *restrict G, - const uint16_t *hm, unsigned logn, uint8_t *tmp); - -/* - * Internal sampler engine. Exported for tests. - * - * sampler_context wraps around a source of random numbers (PRNG) and - * the sigma_min value (nominally dependent on the degree). 
- * - * sampler() takes as parameters: - * ctx pointer to the sampler_context structure - * mu center for the distribution - * isigma inverse of the distribution standard deviation - * It returns an integer sampled along the Gaussian distribution centered - * on mu and of standard deviation sigma = 1/isigma. - * - * gaussian0_sampler() takes as parameter a pointer to a PRNG, and - * returns an integer sampled along a half-Gaussian with standard - * deviation sigma0 = 1.8205 (center is 0, returned value is - * nonnegative). - */ - -typedef struct { - prng p; - fpr sigma_min; -} sampler_context; - -TARGET_AVX2 -int Zf(sampler)(void *ctx, fpr mu, fpr isigma); - -TARGET_AVX2 -int Zf(gaussian0_sampler)(prng *p); - -/* ==================================================================== */ - -#endif diff --git a/crypto_sign/falcon-512/m4-ct/keygen.c b/crypto_sign/falcon-512/m4-ct/keygen.c deleted file mode 100644 index cf7de008..00000000 --- a/crypto_sign/falcon-512/m4-ct/keygen.c +++ /dev/null @@ -1,4301 +0,0 @@ -/* - * Falcon key pair generation. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include "inner.h" - -#define MKN(logn) ((size_t)1 << (logn)) - -/* ==================================================================== */ -/* - * Modular arithmetics. - * - * We implement a few functions for computing modulo a small integer p. - * - * All functions require that 2^30 < p < 2^31. Moreover, operands must - * be in the 0..p-1 range. - * - * Modular addition and subtraction work for all such p. - * - * Montgomery multiplication requires that p is odd, and must be provided - * with an additional value p0i = -1/p mod 2^31. See below for some basics - * on Montgomery multiplication. - * - * Division computes an inverse modulo p by an exponentiation (with - * exponent p-2): this works only if p is prime. Multiplication - * requirements also apply, i.e. p must be odd and p0i must be provided. - * - * The NTT and inverse NTT need all of the above, and also that - * p = 1 mod 2048. - * - * ----------------------------------------------------------------------- - * - * We use Montgomery representation with 31-bit values: - * - * Let R = 2^31 mod p. When 2^30 < p < 2^31, R = 2^31 - p. - * Montgomery representation of an integer x modulo p is x*R mod p. - * - * Montgomery multiplication computes (x*y)/R mod p for - * operands x and y. 
Therefore: - * - * - if operands are x*R and y*R (Montgomery representations of x and - * y), then Montgomery multiplication computes (x*R*y*R)/R = (x*y)*R - * mod p, which is the Montgomery representation of the product x*y; - * - * - if operands are x*R and y (or x and y*R), then Montgomery - * multiplication returns x*y mod p: mixed-representation - * multiplications yield results in normal representation. - * - * To convert to Montgomery representation, we multiply by R, which is done - * by Montgomery-multiplying by R^2. Stand-alone conversion back from - * Montgomery representation is Montgomery-multiplication by 1. - */ - -/* - * Precomputed small primes. Each element contains the following: - * - * p The prime itself. - * - * g A primitive root of phi = X^N+1 (in field Z_p). - * - * s The inverse of the product of all previous primes in the array, - * computed modulo p and in Montgomery representation. - * - * All primes are such that p = 1 mod 2048, and are lower than 2^31. They - * are listed in decreasing order. 
- */ - -typedef struct { - uint32_t p; - uint32_t g; - uint32_t s; -} small_prime; - -static const small_prime PRIMES[] = { - { 2147473409, 383167813, 10239 }, - { 2147389441, 211808905, 471403745 }, - { 2147387393, 37672282, 1329335065 }, - { 2147377153, 1977035326, 968223422 }, - { 2147358721, 1067163706, 132460015 }, - { 2147352577, 1606082042, 598693809 }, - { 2147346433, 2033915641, 1056257184 }, - { 2147338241, 1653770625, 421286710 }, - { 2147309569, 631200819, 1111201074 }, - { 2147297281, 2038364663, 1042003613 }, - { 2147295233, 1962540515, 19440033 }, - { 2147239937, 2100082663, 353296760 }, - { 2147235841, 1991153006, 1703918027 }, - { 2147217409, 516405114, 1258919613 }, - { 2147205121, 409347988, 1089726929 }, - { 2147196929, 927788991, 1946238668 }, - { 2147178497, 1136922411, 1347028164 }, - { 2147100673, 868626236, 701164723 }, - { 2147082241, 1897279176, 617820870 }, - { 2147074049, 1888819123, 158382189 }, - { 2147051521, 25006327, 522758543 }, - { 2147043329, 327546255, 37227845 }, - { 2147039233, 766324424, 1133356428 }, - { 2146988033, 1862817362, 73861329 }, - { 2146963457, 404622040, 653019435 }, - { 2146959361, 1936581214, 995143093 }, - { 2146938881, 1559770096, 634921513 }, - { 2146908161, 422623708, 1985060172 }, - { 2146885633, 1751189170, 298238186 }, - { 2146871297, 578919515, 291810829 }, - { 2146846721, 1114060353, 915902322 }, - { 2146834433, 2069565474, 47859524 }, - { 2146818049, 1552824584, 646281055 }, - { 2146775041, 1906267847, 1597832891 }, - { 2146756609, 1847414714, 1228090888 }, - { 2146744321, 1818792070, 1176377637 }, - { 2146738177, 1118066398, 1054971214 }, - { 2146736129, 52057278, 933422153 }, - { 2146713601, 592259376, 1406621510 }, - { 2146695169, 263161877, 1514178701 }, - { 2146656257, 685363115, 384505091 }, - { 2146650113, 927727032, 537575289 }, - { 2146646017, 52575506, 1799464037 }, - { 2146643969, 1276803876, 1348954416 }, - { 2146603009, 814028633, 1521547704 }, - { 2146572289, 1846678872, 1310832121 }, - 
{ 2146547713, 919368090, 1019041349 }, - { 2146508801, 671847612, 38582496 }, - { 2146492417, 283911680, 532424562 }, - { 2146490369, 1780044827, 896447978 }, - { 2146459649, 327980850, 1327906900 }, - { 2146447361, 1310561493, 958645253 }, - { 2146441217, 412148926, 287271128 }, - { 2146437121, 293186449, 2009822534 }, - { 2146430977, 179034356, 1359155584 }, - { 2146418689, 1517345488, 1790248672 }, - { 2146406401, 1615820390, 1584833571 }, - { 2146404353, 826651445, 607120498 }, - { 2146379777, 3816988, 1897049071 }, - { 2146363393, 1221409784, 1986921567 }, - { 2146355201, 1388081168, 849968120 }, - { 2146336769, 1803473237, 1655544036 }, - { 2146312193, 1023484977, 273671831 }, - { 2146293761, 1074591448, 467406983 }, - { 2146283521, 831604668, 1523950494 }, - { 2146203649, 712865423, 1170834574 }, - { 2146154497, 1764991362, 1064856763 }, - { 2146142209, 627386213, 1406840151 }, - { 2146127873, 1638674429, 2088393537 }, - { 2146099201, 1516001018, 690673370 }, - { 2146093057, 1294931393, 315136610 }, - { 2146091009, 1942399533, 973539425 }, - { 2146078721, 1843461814, 2132275436 }, - { 2146060289, 1098740778, 360423481 }, - { 2146048001, 1617213232, 1951981294 }, - { 2146041857, 1805783169, 2075683489 }, - { 2146019329, 272027909, 1753219918 }, - { 2145986561, 1206530344, 2034028118 }, - { 2145976321, 1243769360, 1173377644 }, - { 2145964033, 887200839, 1281344586 }, - { 2145906689, 1651026455, 906178216 }, - { 2145875969, 1673238256, 1043521212 }, - { 2145871873, 1226591210, 1399796492 }, - { 2145841153, 1465353397, 1324527802 }, - { 2145832961, 1150638905, 554084759 }, - { 2145816577, 221601706, 427340863 }, - { 2145785857, 608896761, 316590738 }, - { 2145755137, 1712054942, 1684294304 }, - { 2145742849, 1302302867, 724873116 }, - { 2145728513, 516717693, 431671476 }, - { 2145699841, 524575579, 1619722537 }, - { 2145691649, 1925625239, 982974435 }, - { 2145687553, 463795662, 1293154300 }, - { 2145673217, 771716636, 881778029 }, - { 2145630209, 1509556977, 
837364988 }, - { 2145595393, 229091856, 851648427 }, - { 2145587201, 1796903241, 635342424 }, - { 2145525761, 715310882, 1677228081 }, - { 2145495041, 1040930522, 200685896 }, - { 2145466369, 949804237, 1809146322 }, - { 2145445889, 1673903706, 95316881 }, - { 2145390593, 806941852, 1428671135 }, - { 2145372161, 1402525292, 159350694 }, - { 2145361921, 2124760298, 1589134749 }, - { 2145359873, 1217503067, 1561543010 }, - { 2145355777, 338341402, 83865711 }, - { 2145343489, 1381532164, 641430002 }, - { 2145325057, 1883895478, 1528469895 }, - { 2145318913, 1335370424, 65809740 }, - { 2145312769, 2000008042, 1919775760 }, - { 2145300481, 961450962, 1229540578 }, - { 2145282049, 910466767, 1964062701 }, - { 2145232897, 816527501, 450152063 }, - { 2145218561, 1435128058, 1794509700 }, - { 2145187841, 33505311, 1272467582 }, - { 2145181697, 269767433, 1380363849 }, - { 2145175553, 56386299, 1316870546 }, - { 2145079297, 2106880293, 1391797340 }, - { 2145021953, 1347906152, 720510798 }, - { 2145015809, 206769262, 1651459955 }, - { 2145003521, 1885513236, 1393381284 }, - { 2144960513, 1810381315, 31937275 }, - { 2144944129, 1306487838, 2019419520 }, - { 2144935937, 37304730, 1841489054 }, - { 2144894977, 1601434616, 157985831 }, - { 2144888833, 98749330, 2128592228 }, - { 2144880641, 1772327002, 2076128344 }, - { 2144864257, 1404514762, 2029969964 }, - { 2144827393, 801236594, 406627220 }, - { 2144806913, 349217443, 1501080290 }, - { 2144796673, 1542656776, 2084736519 }, - { 2144778241, 1210734884, 1746416203 }, - { 2144759809, 1146598851, 716464489 }, - { 2144757761, 286328400, 1823728177 }, - { 2144729089, 1347555695, 1836644881 }, - { 2144727041, 1795703790, 520296412 }, - { 2144696321, 1302475157, 852964281 }, - { 2144667649, 1075877614, 504992927 }, - { 2144573441, 198765808, 1617144982 }, - { 2144555009, 321528767, 155821259 }, - { 2144550913, 814139516, 1819937644 }, - { 2144536577, 571143206, 962942255 }, - { 2144524289, 1746733766, 2471321 }, - { 2144512001, 
1821415077, 124190939 }, - { 2144468993, 917871546, 1260072806 }, - { 2144458753, 378417981, 1569240563 }, - { 2144421889, 175229668, 1825620763 }, - { 2144409601, 1699216963, 351648117 }, - { 2144370689, 1071885991, 958186029 }, - { 2144348161, 1763151227, 540353574 }, - { 2144335873, 1060214804, 919598847 }, - { 2144329729, 663515846, 1448552668 }, - { 2144327681, 1057776305, 590222840 }, - { 2144309249, 1705149168, 1459294624 }, - { 2144296961, 325823721, 1649016934 }, - { 2144290817, 738775789, 447427206 }, - { 2144243713, 962347618, 893050215 }, - { 2144237569, 1655257077, 900860862 }, - { 2144161793, 242206694, 1567868672 }, - { 2144155649, 769415308, 1247993134 }, - { 2144137217, 320492023, 515841070 }, - { 2144120833, 1639388522, 770877302 }, - { 2144071681, 1761785233, 964296120 }, - { 2144065537, 419817825, 204564472 }, - { 2144028673, 666050597, 2091019760 }, - { 2144010241, 1413657615, 1518702610 }, - { 2143952897, 1238327946, 475672271 }, - { 2143940609, 307063413, 1176750846 }, - { 2143918081, 2062905559, 786785803 }, - { 2143899649, 1338112849, 1562292083 }, - { 2143891457, 68149545, 87166451 }, - { 2143885313, 921750778, 394460854 }, - { 2143854593, 719766593, 133877196 }, - { 2143836161, 1149399850, 1861591875 }, - { 2143762433, 1848739366, 1335934145 }, - { 2143756289, 1326674710, 102999236 }, - { 2143713281, 808061791, 1156900308 }, - { 2143690753, 388399459, 1926468019 }, - { 2143670273, 1427891374, 1756689401 }, - { 2143666177, 1912173949, 986629565 }, - { 2143645697, 2041160111, 371842865 }, - { 2143641601, 1279906897, 2023974350 }, - { 2143635457, 720473174, 1389027526 }, - { 2143621121, 1298309455, 1732632006 }, - { 2143598593, 1548762216, 1825417506 }, - { 2143567873, 620475784, 1073787233 }, - { 2143561729, 1932954575, 949167309 }, - { 2143553537, 354315656, 1652037534 }, - { 2143541249, 577424288, 1097027618 }, - { 2143531009, 357862822, 478640055 }, - { 2143522817, 2017706025, 1550531668 }, - { 2143506433, 2078127419, 1824320165 }, - { 
2143488001, 613475285, 1604011510 }, - { 2143469569, 1466594987, 502095196 }, - { 2143426561, 1115430331, 1044637111 }, - { 2143383553, 9778045, 1902463734 }, - { 2143377409, 1557401276, 2056861771 }, - { 2143363073, 652036455, 1965915971 }, - { 2143260673, 1464581171, 1523257541 }, - { 2143246337, 1876119649, 764541916 }, - { 2143209473, 1614992673, 1920672844 }, - { 2143203329, 981052047, 2049774209 }, - { 2143160321, 1847355533, 728535665 }, - { 2143129601, 965558457, 603052992 }, - { 2143123457, 2140817191, 8348679 }, - { 2143100929, 1547263683, 694209023 }, - { 2143092737, 643459066, 1979934533 }, - { 2143082497, 188603778, 2026175670 }, - { 2143062017, 1657329695, 377451099 }, - { 2143051777, 114967950, 979255473 }, - { 2143025153, 1698431342, 1449196896 }, - { 2143006721, 1862741675, 1739650365 }, - { 2142996481, 756660457, 996160050 }, - { 2142976001, 927864010, 1166847574 }, - { 2142965761, 905070557, 661974566 }, - { 2142916609, 40932754, 1787161127 }, - { 2142892033, 1987985648, 675335382 }, - { 2142885889, 797497211, 1323096997 }, - { 2142871553, 2068025830, 1411877159 }, - { 2142861313, 1217177090, 1438410687 }, - { 2142830593, 409906375, 1767860634 }, - { 2142803969, 1197788993, 359782919 }, - { 2142785537, 643817365, 513932862 }, - { 2142779393, 1717046338, 218943121 }, - { 2142724097, 89336830, 416687049 }, - { 2142707713, 5944581, 1356813523 }, - { 2142658561, 887942135, 2074011722 }, - { 2142638081, 151851972, 1647339939 }, - { 2142564353, 1691505537, 1483107336 }, - { 2142533633, 1989920200, 1135938817 }, - { 2142529537, 959263126, 1531961857 }, - { 2142527489, 453251129, 1725566162 }, - { 2142502913, 1536028102, 182053257 }, - { 2142498817, 570138730, 701443447 }, - { 2142416897, 326965800, 411931819 }, - { 2142363649, 1675665410, 1517191733 }, - { 2142351361, 968529566, 1575712703 }, - { 2142330881, 1384953238, 1769087884 }, - { 2142314497, 1977173242, 1833745524 }, - { 2142289921, 95082313, 1714775493 }, - { 2142283777, 109377615, 1070584533 
}, - { 2142277633, 16960510, 702157145 }, - { 2142263297, 553850819, 431364395 }, - { 2142208001, 241466367, 2053967982 }, - { 2142164993, 1795661326, 1031836848 }, - { 2142097409, 1212530046, 712772031 }, - { 2142087169, 1763869720, 822276067 }, - { 2142078977, 644065713, 1765268066 }, - { 2142074881, 112671944, 643204925 }, - { 2142044161, 1387785471, 1297890174 }, - { 2142025729, 783885537, 1000425730 }, - { 2142011393, 905662232, 1679401033 }, - { 2141974529, 799788433, 468119557 }, - { 2141943809, 1932544124, 449305555 }, - { 2141933569, 1527403256, 841867925 }, - { 2141931521, 1247076451, 743823916 }, - { 2141902849, 1199660531, 401687910 }, - { 2141890561, 150132350, 1720336972 }, - { 2141857793, 1287438162, 663880489 }, - { 2141833217, 618017731, 1819208266 }, - { 2141820929, 999578638, 1403090096 }, - { 2141786113, 81834325, 1523542501 }, - { 2141771777, 120001928, 463556492 }, - { 2141759489, 122455485, 2124928282 }, - { 2141749249, 141986041, 940339153 }, - { 2141685761, 889088734, 477141499 }, - { 2141673473, 324212681, 1122558298 }, - { 2141669377, 1175806187, 1373818177 }, - { 2141655041, 1113654822, 296887082 }, - { 2141587457, 991103258, 1585913875 }, - { 2141583361, 1401451409, 1802457360 }, - { 2141575169, 1571977166, 712760980 }, - { 2141546497, 1107849376, 1250270109 }, - { 2141515777, 196544219, 356001130 }, - { 2141495297, 1733571506, 1060744866 }, - { 2141483009, 321552363, 1168297026 }, - { 2141458433, 505818251, 733225819 }, - { 2141360129, 1026840098, 948342276 }, - { 2141325313, 945133744, 2129965998 }, - { 2141317121, 1871100260, 1843844634 }, - { 2141286401, 1790639498, 1750465696 }, - { 2141267969, 1376858592, 186160720 }, - { 2141255681, 2129698296, 1876677959 }, - { 2141243393, 2138900688, 1340009628 }, - { 2141214721, 1933049835, 1087819477 }, - { 2141212673, 1898664939, 1786328049 }, - { 2141202433, 990234828, 940682169 }, - { 2141175809, 1406392421, 993089586 }, - { 2141165569, 1263518371, 289019479 }, - { 2141073409, 1485624211, 
507864514 }, - { 2141052929, 1885134788, 311252465 }, - { 2141040641, 1285021247, 280941862 }, - { 2141028353, 1527610374, 375035110 }, - { 2141011969, 1400626168, 164696620 }, - { 2140999681, 632959608, 966175067 }, - { 2140997633, 2045628978, 1290889438 }, - { 2140993537, 1412755491, 375366253 }, - { 2140942337, 719477232, 785367828 }, - { 2140925953, 45224252, 836552317 }, - { 2140917761, 1157376588, 1001839569 }, - { 2140887041, 278480752, 2098732796 }, - { 2140837889, 1663139953, 924094810 }, - { 2140788737, 802501511, 2045368990 }, - { 2140766209, 1820083885, 1800295504 }, - { 2140764161, 1169561905, 2106792035 }, - { 2140696577, 127781498, 1885987531 }, - { 2140684289, 16014477, 1098116827 }, - { 2140653569, 665960598, 1796728247 }, - { 2140594177, 1043085491, 377310938 }, - { 2140579841, 1732838211, 1504505945 }, - { 2140569601, 302071939, 358291016 }, - { 2140567553, 192393733, 1909137143 }, - { 2140557313, 406595731, 1175330270 }, - { 2140549121, 1748850918, 525007007 }, - { 2140477441, 499436566, 1031159814 }, - { 2140469249, 1886004401, 1029951320 }, - { 2140426241, 1483168100, 1676273461 }, - { 2140420097, 1779917297, 846024476 }, - { 2140413953, 522948893, 1816354149 }, - { 2140383233, 1931364473, 1296921241 }, - { 2140366849, 1917356555, 147196204 }, - { 2140354561, 16466177, 1349052107 }, - { 2140348417, 1875366972, 1860485634 }, - { 2140323841, 456498717, 1790256483 }, - { 2140321793, 1629493973, 150031888 }, - { 2140315649, 1904063898, 395510935 }, - { 2140280833, 1784104328, 831417909 }, - { 2140250113, 256087139, 697349101 }, - { 2140229633, 388553070, 243875754 }, - { 2140223489, 747459608, 1396270850 }, - { 2140200961, 507423743, 1895572209 }, - { 2140162049, 580106016, 2045297469 }, - { 2140149761, 712426444, 785217995 }, - { 2140137473, 1441607584, 536866543 }, - { 2140119041, 346538902, 1740434653 }, - { 2140090369, 282642885, 21051094 }, - { 2140076033, 1407456228, 319910029 }, - { 2140047361, 1619330500, 1488632070 }, - { 2140041217, 
2089408064, 2012026134 }, - { 2140008449, 1705524800, 1613440760 }, - { 2139924481, 1846208233, 1280649481 }, - { 2139906049, 989438755, 1185646076 }, - { 2139867137, 1522314850, 372783595 }, - { 2139842561, 1681587377, 216848235 }, - { 2139826177, 2066284988, 1784999464 }, - { 2139824129, 480888214, 1513323027 }, - { 2139789313, 847937200, 858192859 }, - { 2139783169, 1642000434, 1583261448 }, - { 2139770881, 940699589, 179702100 }, - { 2139768833, 315623242, 964612676 }, - { 2139666433, 331649203, 764666914 }, - { 2139641857, 2118730799, 1313764644 }, - { 2139635713, 519149027, 519212449 }, - { 2139598849, 1526413634, 1769667104 }, - { 2139574273, 551148610, 820739925 }, - { 2139568129, 1386800242, 472447405 }, - { 2139549697, 813760130, 1412328531 }, - { 2139537409, 1615286260, 1609362979 }, - { 2139475969, 1352559299, 1696720421 }, - { 2139455489, 1048691649, 1584935400 }, - { 2139432961, 836025845, 950121150 }, - { 2139424769, 1558281165, 1635486858 }, - { 2139406337, 1728402143, 1674423301 }, - { 2139396097, 1727715782, 1483470544 }, - { 2139383809, 1092853491, 1741699084 }, - { 2139369473, 690776899, 1242798709 }, - { 2139351041, 1768782380, 2120712049 }, - { 2139334657, 1739968247, 1427249225 }, - { 2139332609, 1547189119, 623011170 }, - { 2139310081, 1346827917, 1605466350 }, - { 2139303937, 369317948, 828392831 }, - { 2139301889, 1560417239, 1788073219 }, - { 2139283457, 1303121623, 595079358 }, - { 2139248641, 1354555286, 573424177 }, - { 2139240449, 60974056, 885781403 }, - { 2139222017, 355573421, 1221054839 }, - { 2139215873, 566477826, 1724006500 }, - { 2139150337, 871437673, 1609133294 }, - { 2139144193, 1478130914, 1137491905 }, - { 2139117569, 1854880922, 964728507 }, - { 2139076609, 202405335, 756508944 }, - { 2139062273, 1399715741, 884826059 }, - { 2139045889, 1051045798, 1202295476 }, - { 2139033601, 1707715206, 632234634 }, - { 2139006977, 2035853139, 231626690 }, - { 2138951681, 183867876, 838350879 }, - { 2138945537, 1403254661, 404460202 
}, - { 2138920961, 310865011, 1282911681 }, - { 2138910721, 1328496553, 103472415 }, - { 2138904577, 78831681, 993513549 }, - { 2138902529, 1319697451, 1055904361 }, - { 2138816513, 384338872, 1706202469 }, - { 2138810369, 1084868275, 405677177 }, - { 2138787841, 401181788, 1964773901 }, - { 2138775553, 1850532988, 1247087473 }, - { 2138767361, 874261901, 1576073565 }, - { 2138757121, 1187474742, 993541415 }, - { 2138748929, 1782458888, 1043206483 }, - { 2138744833, 1221500487, 800141243 }, - { 2138738689, 413465368, 1450660558 }, - { 2138695681, 739045140, 342611472 }, - { 2138658817, 1355845756, 672674190 }, - { 2138644481, 608379162, 1538874380 }, - { 2138632193, 1444914034, 686911254 }, - { 2138607617, 484707818, 1435142134 }, - { 2138591233, 539460669, 1290458549 }, - { 2138572801, 2093538990, 2011138646 }, - { 2138552321, 1149786988, 1076414907 }, - { 2138546177, 840688206, 2108985273 }, - { 2138533889, 209669619, 198172413 }, - { 2138523649, 1975879426, 1277003968 }, - { 2138490881, 1351891144, 1976858109 }, - { 2138460161, 1817321013, 1979278293 }, - { 2138429441, 1950077177, 203441928 }, - { 2138400769, 908970113, 628395069 }, - { 2138398721, 219890864, 758486760 }, - { 2138376193, 1306654379, 977554090 }, - { 2138351617, 298822498, 2004708503 }, - { 2138337281, 441457816, 1049002108 }, - { 2138320897, 1517731724, 1442269609 }, - { 2138290177, 1355911197, 1647139103 }, - { 2138234881, 531313247, 1746591962 }, - { 2138214401, 1899410930, 781416444 }, - { 2138202113, 1813477173, 1622508515 }, - { 2138191873, 1086458299, 1025408615 }, - { 2138183681, 1998800427, 827063290 }, - { 2138173441, 1921308898, 749670117 }, - { 2138103809, 1620902804, 2126787647 }, - { 2138099713, 828647069, 1892961817 }, - { 2138085377, 179405355, 1525506535 }, - { 2138060801, 615683235, 1259580138 }, - { 2138044417, 2030277840, 1731266562 }, - { 2138042369, 2087222316, 1627902259 }, - { 2138032129, 126388712, 1108640984 }, - { 2138011649, 715026550, 1017980050 }, - { 2137993217, 
1693714349, 1351778704 }, - { 2137888769, 1289762259, 1053090405 }, - { 2137853953, 199991890, 1254192789 }, - { 2137833473, 941421685, 896995556 }, - { 2137817089, 750416446, 1251031181 }, - { 2137792513, 798075119, 368077456 }, - { 2137786369, 878543495, 1035375025 }, - { 2137767937, 9351178, 1156563902 }, - { 2137755649, 1382297614, 1686559583 }, - { 2137724929, 1345472850, 1681096331 }, - { 2137704449, 834666929, 630551727 }, - { 2137673729, 1646165729, 1892091571 }, - { 2137620481, 778943821, 48456461 }, - { 2137618433, 1730837875, 1713336725 }, - { 2137581569, 805610339, 1378891359 }, - { 2137538561, 204342388, 1950165220 }, - { 2137526273, 1947629754, 1500789441 }, - { 2137516033, 719902645, 1499525372 }, - { 2137491457, 230451261, 556382829 }, - { 2137440257, 979573541, 412760291 }, - { 2137374721, 927841248, 1954137185 }, - { 2137362433, 1243778559, 861024672 }, - { 2137313281, 1341338501, 980638386 }, - { 2137311233, 937415182, 1793212117 }, - { 2137255937, 795331324, 1410253405 }, - { 2137243649, 150756339, 1966999887 }, - { 2137182209, 163346914, 1939301431 }, - { 2137171969, 1952552395, 758913141 }, - { 2137159681, 570788721, 218668666 }, - { 2137147393, 1896656810, 2045670345 }, - { 2137141249, 358493842, 518199643 }, - { 2137139201, 1505023029, 674695848 }, - { 2137133057, 27911103, 830956306 }, - { 2137122817, 439771337, 1555268614 }, - { 2137116673, 790988579, 1871449599 }, - { 2137110529, 432109234, 811805080 }, - { 2137102337, 1357900653, 1184997641 }, - { 2137098241, 515119035, 1715693095 }, - { 2137090049, 408575203, 2085660657 }, - { 2137085953, 2097793407, 1349626963 }, - { 2137055233, 1556739954, 1449960883 }, - { 2137030657, 1545758650, 1369303716 }, - { 2136987649, 332602570, 103875114 }, - { 2136969217, 1499989506, 1662964115 }, - { 2136924161, 857040753, 4738842 }, - { 2136895489, 1948872712, 570436091 }, - { 2136893441, 58969960, 1568349634 }, - { 2136887297, 2127193379, 273612548 }, - { 2136850433, 111208983, 1181257116 }, - { 
2136809473, 1627275942, 1680317971 }, - { 2136764417, 1574888217, 14011331 }, - { 2136741889, 14011055, 1129154251 }, - { 2136727553, 35862563, 1838555253 }, - { 2136721409, 310235666, 1363928244 }, - { 2136698881, 1612429202, 1560383828 }, - { 2136649729, 1138540131, 800014364 }, - { 2136606721, 602323503, 1433096652 }, - { 2136563713, 182209265, 1919611038 }, - { 2136555521, 324156477, 165591039 }, - { 2136549377, 195513113, 217165345 }, - { 2136526849, 1050768046, 939647887 }, - { 2136508417, 1886286237, 1619926572 }, - { 2136477697, 609647664, 35065157 }, - { 2136471553, 679352216, 1452259468 }, - { 2136457217, 128630031, 824816521 }, - { 2136422401, 19787464, 1526049830 }, - { 2136420353, 698316836, 1530623527 }, - { 2136371201, 1651862373, 1804812805 }, - { 2136334337, 326596005, 336977082 }, - { 2136322049, 63253370, 1904972151 }, - { 2136297473, 312176076, 172182411 }, - { 2136248321, 381261841, 369032670 }, - { 2136242177, 358688773, 1640007994 }, - { 2136229889, 512677188, 75585225 }, - { 2136219649, 2095003250, 1970086149 }, - { 2136207361, 1909650722, 537760675 }, - { 2136176641, 1334616195, 1533487619 }, - { 2136158209, 2096285632, 1793285210 }, - { 2136143873, 1897347517, 293843959 }, - { 2136133633, 923586222, 1022655978 }, - { 2136096769, 1464868191, 1515074410 }, - { 2136094721, 2020679520, 2061636104 }, - { 2136076289, 290798503, 1814726809 }, - { 2136041473, 156415894, 1250757633 }, - { 2135996417, 297459940, 1132158924 }, - { 2135955457, 538755304, 1688831340 }, - { 0, 0, 0 } -}; - -/* - * Reduce a small signed integer modulo a small prime. The source - * value x MUST be such that -p < x < p. - */ -static inline uint32_t -modp_set(int32_t x, uint32_t p) -{ - uint32_t w; - - w = (uint32_t)x; - w += p & -(w >> 31); - return w; -} - -/* - * Normalize a modular integer around 0. - */ -static inline int32_t -modp_norm(uint32_t x, uint32_t p) -{ - return (int32_t)(x - (p & (((x - ((p + 1) >> 1)) >> 31) - 1))); -} - -/* - * Compute -1/p mod 2^31. 
This works for all odd integers p that fit - * on 31 bits. - */ -static uint32_t -modp_ninv31(uint32_t p) -{ - uint32_t y; - - y = 2 - p; - y *= 2 - p * y; - y *= 2 - p * y; - y *= 2 - p * y; - y *= 2 - p * y; - return (uint32_t)0x7FFFFFFF & -y; -} - -/* - * Compute R = 2^31 mod p. - */ -static inline uint32_t -modp_R(uint32_t p) -{ - /* - * Since 2^30 < p < 2^31, we know that 2^31 mod p is simply - * 2^31 - p. - */ - return ((uint32_t)1 << 31) - p; -} - -/* - * Addition modulo p. - */ -static inline uint32_t -modp_add(uint32_t a, uint32_t b, uint32_t p) -{ - uint32_t d; - - d = a + b - p; - d += p & -(d >> 31); - return d; -} - -/* - * Subtraction modulo p. - */ -static inline uint32_t -modp_sub(uint32_t a, uint32_t b, uint32_t p) -{ - uint32_t d; - - d = a - b; - d += p & -(d >> 31); - return d; -} - -/* - * Halving modulo p. - */ -/* unused -static inline uint32_t -modp_half(uint32_t a, uint32_t p) -{ - a += p & -(a & 1); - return a >> 1; -} -*/ - -/* - * Montgomery multiplication modulo p. The 'p0i' value is -1/p mod 2^31. - * It is required that p is an odd integer. - */ -static inline uint32_t -modp_montymul(uint32_t a, uint32_t b, uint32_t p, uint32_t p0i) -{ - uint64_t z, w; - uint32_t d; - - z = (uint64_t)a * (uint64_t)b; - w = ((z * p0i) & (uint64_t)0x7FFFFFFF) * p; - d = (uint32_t)((z + w) >> 31) - p; - d += p & -(d >> 31); - return d; -} - -/* - * Compute R2 = 2^62 mod p. - */ -static uint32_t -modp_R2(uint32_t p, uint32_t p0i) -{ - uint32_t z; - - /* - * Compute z = 2^31 mod p (this is the value 1 in Montgomery - * representation), then double it with an addition. - */ - z = modp_R(p); - z = modp_add(z, z, p); - - /* - * Square it five times to obtain 2^32 in Montgomery representation - * (i.e. 2^63 mod p). - */ - z = modp_montymul(z, z, p, p0i); - z = modp_montymul(z, z, p, p0i); - z = modp_montymul(z, z, p, p0i); - z = modp_montymul(z, z, p, p0i); - z = modp_montymul(z, z, p, p0i); - - /* - * Halve the value mod p to get 2^62. 
- */ - z = (z + (p & -(z & 1))) >> 1; - return z; -} - -/* - * Compute 2^(31*x) modulo p. This works for integers x up to 2^11. - * p must be prime such that 2^30 < p < 2^31; p0i must be equal to - * -1/p mod 2^31; R2 must be equal to 2^62 mod p. - */ -static inline uint32_t -modp_Rx(unsigned x, uint32_t p, uint32_t p0i, uint32_t R2) -{ - int i; - uint32_t r, z; - - /* - * 2^(31*x) = (2^31)*(2^(31*(x-1))); i.e. we want the Montgomery - * representation of (2^31)^e mod p, where e = x-1. - * R2 is 2^31 in Montgomery representation. - */ - x --; - r = R2; - z = modp_R(p); - for (i = 0; (1U << i) <= x; i ++) { - if ((x & (1U << i)) != 0) { - z = modp_montymul(z, r, p, p0i); - } - r = modp_montymul(r, r, p, p0i); - } - return z; -} - -/* - * Division modulo p. If the divisor (b) is 0, then 0 is returned. - * This function computes proper results only when p is prime. - * Parameters: - * a dividend - * b divisor - * p odd prime modulus - * p0i -1/p mod 2^31 - * R 2^31 mod R - */ -static uint32_t -modp_div(uint32_t a, uint32_t b, uint32_t p, uint32_t p0i, uint32_t R) -{ - uint32_t z, e; - int i; - - e = p - 2; - z = R; - for (i = 30; i >= 0; i --) { - uint32_t z2; - - z = modp_montymul(z, z, p, p0i); - z2 = modp_montymul(z, b, p, p0i); - z ^= (z ^ z2) & -(uint32_t)((e >> i) & 1); - } - - /* - * The loop above just assumed that b was in Montgomery - * representation, i.e. really contained b*R; under that - * assumption, it returns 1/b in Montgomery representation, - * which is R/b. But we gave it b in normal representation, - * so the loop really returned R/(b/R) = R^2/b. - * - * We want a/b, so we need one Montgomery multiplication with a, - * which also remove one of the R factors, and another such - * multiplication to remove the second R factor. - */ - z = modp_montymul(z, 1, p, p0i); - return modp_montymul(a, z, p, p0i); -} - -/* - * Bit-reversal index table. 
- */ -static const uint16_t REV10[] = { - 0, 512, 256, 768, 128, 640, 384, 896, 64, 576, 320, 832, - 192, 704, 448, 960, 32, 544, 288, 800, 160, 672, 416, 928, - 96, 608, 352, 864, 224, 736, 480, 992, 16, 528, 272, 784, - 144, 656, 400, 912, 80, 592, 336, 848, 208, 720, 464, 976, - 48, 560, 304, 816, 176, 688, 432, 944, 112, 624, 368, 880, - 240, 752, 496, 1008, 8, 520, 264, 776, 136, 648, 392, 904, - 72, 584, 328, 840, 200, 712, 456, 968, 40, 552, 296, 808, - 168, 680, 424, 936, 104, 616, 360, 872, 232, 744, 488, 1000, - 24, 536, 280, 792, 152, 664, 408, 920, 88, 600, 344, 856, - 216, 728, 472, 984, 56, 568, 312, 824, 184, 696, 440, 952, - 120, 632, 376, 888, 248, 760, 504, 1016, 4, 516, 260, 772, - 132, 644, 388, 900, 68, 580, 324, 836, 196, 708, 452, 964, - 36, 548, 292, 804, 164, 676, 420, 932, 100, 612, 356, 868, - 228, 740, 484, 996, 20, 532, 276, 788, 148, 660, 404, 916, - 84, 596, 340, 852, 212, 724, 468, 980, 52, 564, 308, 820, - 180, 692, 436, 948, 116, 628, 372, 884, 244, 756, 500, 1012, - 12, 524, 268, 780, 140, 652, 396, 908, 76, 588, 332, 844, - 204, 716, 460, 972, 44, 556, 300, 812, 172, 684, 428, 940, - 108, 620, 364, 876, 236, 748, 492, 1004, 28, 540, 284, 796, - 156, 668, 412, 924, 92, 604, 348, 860, 220, 732, 476, 988, - 60, 572, 316, 828, 188, 700, 444, 956, 124, 636, 380, 892, - 252, 764, 508, 1020, 2, 514, 258, 770, 130, 642, 386, 898, - 66, 578, 322, 834, 194, 706, 450, 962, 34, 546, 290, 802, - 162, 674, 418, 930, 98, 610, 354, 866, 226, 738, 482, 994, - 18, 530, 274, 786, 146, 658, 402, 914, 82, 594, 338, 850, - 210, 722, 466, 978, 50, 562, 306, 818, 178, 690, 434, 946, - 114, 626, 370, 882, 242, 754, 498, 1010, 10, 522, 266, 778, - 138, 650, 394, 906, 74, 586, 330, 842, 202, 714, 458, 970, - 42, 554, 298, 810, 170, 682, 426, 938, 106, 618, 362, 874, - 234, 746, 490, 1002, 26, 538, 282, 794, 154, 666, 410, 922, - 90, 602, 346, 858, 218, 730, 474, 986, 58, 570, 314, 826, - 186, 698, 442, 954, 122, 634, 378, 890, 250, 762, 506, 1018, - 6, 
518, 262, 774, 134, 646, 390, 902, 70, 582, 326, 838, - 198, 710, 454, 966, 38, 550, 294, 806, 166, 678, 422, 934, - 102, 614, 358, 870, 230, 742, 486, 998, 22, 534, 278, 790, - 150, 662, 406, 918, 86, 598, 342, 854, 214, 726, 470, 982, - 54, 566, 310, 822, 182, 694, 438, 950, 118, 630, 374, 886, - 246, 758, 502, 1014, 14, 526, 270, 782, 142, 654, 398, 910, - 78, 590, 334, 846, 206, 718, 462, 974, 46, 558, 302, 814, - 174, 686, 430, 942, 110, 622, 366, 878, 238, 750, 494, 1006, - 30, 542, 286, 798, 158, 670, 414, 926, 94, 606, 350, 862, - 222, 734, 478, 990, 62, 574, 318, 830, 190, 702, 446, 958, - 126, 638, 382, 894, 254, 766, 510, 1022, 1, 513, 257, 769, - 129, 641, 385, 897, 65, 577, 321, 833, 193, 705, 449, 961, - 33, 545, 289, 801, 161, 673, 417, 929, 97, 609, 353, 865, - 225, 737, 481, 993, 17, 529, 273, 785, 145, 657, 401, 913, - 81, 593, 337, 849, 209, 721, 465, 977, 49, 561, 305, 817, - 177, 689, 433, 945, 113, 625, 369, 881, 241, 753, 497, 1009, - 9, 521, 265, 777, 137, 649, 393, 905, 73, 585, 329, 841, - 201, 713, 457, 969, 41, 553, 297, 809, 169, 681, 425, 937, - 105, 617, 361, 873, 233, 745, 489, 1001, 25, 537, 281, 793, - 153, 665, 409, 921, 89, 601, 345, 857, 217, 729, 473, 985, - 57, 569, 313, 825, 185, 697, 441, 953, 121, 633, 377, 889, - 249, 761, 505, 1017, 5, 517, 261, 773, 133, 645, 389, 901, - 69, 581, 325, 837, 197, 709, 453, 965, 37, 549, 293, 805, - 165, 677, 421, 933, 101, 613, 357, 869, 229, 741, 485, 997, - 21, 533, 277, 789, 149, 661, 405, 917, 85, 597, 341, 853, - 213, 725, 469, 981, 53, 565, 309, 821, 181, 693, 437, 949, - 117, 629, 373, 885, 245, 757, 501, 1013, 13, 525, 269, 781, - 141, 653, 397, 909, 77, 589, 333, 845, 205, 717, 461, 973, - 45, 557, 301, 813, 173, 685, 429, 941, 109, 621, 365, 877, - 237, 749, 493, 1005, 29, 541, 285, 797, 157, 669, 413, 925, - 93, 605, 349, 861, 221, 733, 477, 989, 61, 573, 317, 829, - 189, 701, 445, 957, 125, 637, 381, 893, 253, 765, 509, 1021, - 3, 515, 259, 771, 131, 643, 387, 899, 67, 579, 
323, 835, - 195, 707, 451, 963, 35, 547, 291, 803, 163, 675, 419, 931, - 99, 611, 355, 867, 227, 739, 483, 995, 19, 531, 275, 787, - 147, 659, 403, 915, 83, 595, 339, 851, 211, 723, 467, 979, - 51, 563, 307, 819, 179, 691, 435, 947, 115, 627, 371, 883, - 243, 755, 499, 1011, 11, 523, 267, 779, 139, 651, 395, 907, - 75, 587, 331, 843, 203, 715, 459, 971, 43, 555, 299, 811, - 171, 683, 427, 939, 107, 619, 363, 875, 235, 747, 491, 1003, - 27, 539, 283, 795, 155, 667, 411, 923, 91, 603, 347, 859, - 219, 731, 475, 987, 59, 571, 315, 827, 187, 699, 443, 955, - 123, 635, 379, 891, 251, 763, 507, 1019, 7, 519, 263, 775, - 135, 647, 391, 903, 71, 583, 327, 839, 199, 711, 455, 967, - 39, 551, 295, 807, 167, 679, 423, 935, 103, 615, 359, 871, - 231, 743, 487, 999, 23, 535, 279, 791, 151, 663, 407, 919, - 87, 599, 343, 855, 215, 727, 471, 983, 55, 567, 311, 823, - 183, 695, 439, 951, 119, 631, 375, 887, 247, 759, 503, 1015, - 15, 527, 271, 783, 143, 655, 399, 911, 79, 591, 335, 847, - 207, 719, 463, 975, 47, 559, 303, 815, 175, 687, 431, 943, - 111, 623, 367, 879, 239, 751, 495, 1007, 31, 543, 287, 799, - 159, 671, 415, 927, 95, 607, 351, 863, 223, 735, 479, 991, - 63, 575, 319, 831, 191, 703, 447, 959, 127, 639, 383, 895, - 255, 767, 511, 1023 -}; - -/* - * Compute the roots for NTT and inverse NTT (binary case). Input - * parameter g is a primitive 2048-th root of 1 modulo p (i.e. g^1024 = - * -1 mod p). This fills gm[] and igm[] with powers of g and 1/g: - * gm[rev(i)] = g^i mod p - * igm[rev(i)] = (1/g)^i mod p - * where rev() is the "bit reversal" function over 10 bits. It fills - * the arrays only up to N = 2^logn values. - * - * The values stored in gm[] and igm[] are in Montgomery representation. - * - * p must be a prime such that p = 1 mod 2048. 
- */ -static void -modp_mkgm2(uint32_t *restrict gm, uint32_t *restrict igm, unsigned logn, - uint32_t g, uint32_t p, uint32_t p0i) -{ - size_t u, n; - unsigned k; - uint32_t ig, x1, x2, R2; - - n = (size_t)1 << logn; - - /* - * We want g such that g^(2N) = 1 mod p, but the provided - * generator has order 2048. We must square it a few times. - */ - R2 = modp_R2(p, p0i); - g = modp_montymul(g, R2, p, p0i); - for (k = logn; k < 10; k ++) { - g = modp_montymul(g, g, p, p0i); - } - - ig = modp_div(R2, g, p, p0i, modp_R(p)); - k = 10 - logn; - x1 = x2 = modp_R(p); - for (u = 0; u < n; u ++) { - size_t v; - - v = REV10[u << k]; - gm[v] = x1; - igm[v] = x2; - x1 = modp_montymul(x1, g, p, p0i); - x2 = modp_montymul(x2, ig, p, p0i); - } -} - -/* - * Compute the NTT over a polynomial (binary case). Polynomial elements - * are a[0], a[stride], a[2 * stride]... - */ -static void -modp_NTT2_ext(uint32_t *a, size_t stride, const uint32_t *gm, unsigned logn, - uint32_t p, uint32_t p0i) -{ - size_t t, m, n; - - if (logn == 0) { - return; - } - n = (size_t)1 << logn; - t = n; - for (m = 1; m < n; m <<= 1) { - size_t ht, u, v1; - - ht = t >> 1; - for (u = 0, v1 = 0; u < m; u ++, v1 += t) { - uint32_t s; - size_t v; - uint32_t *r1, *r2; - - s = gm[m + u]; - r1 = a + v1 * stride; - r2 = r1 + ht * stride; - for (v = 0; v < ht; v ++, r1 += stride, r2 += stride) { - uint32_t x, y; - - x = *r1; - y = modp_montymul(*r2, s, p, p0i); - *r1 = modp_add(x, y, p); - *r2 = modp_sub(x, y, p); - } - } - t = ht; - } -} - -/* - * Compute the inverse NTT over a polynomial (binary case). 
- */ -static void -modp_iNTT2_ext(uint32_t *a, size_t stride, const uint32_t *igm, unsigned logn, - uint32_t p, uint32_t p0i) -{ - size_t t, m, n, k; - uint32_t ni; - uint32_t *r; - - if (logn == 0) { - return; - } - n = (size_t)1 << logn; - t = 1; - for (m = n; m > 1; m >>= 1) { - size_t hm, dt, u, v1; - - hm = m >> 1; - dt = t << 1; - for (u = 0, v1 = 0; u < hm; u ++, v1 += dt) { - uint32_t s; - size_t v; - uint32_t *r1, *r2; - - s = igm[hm + u]; - r1 = a + v1 * stride; - r2 = r1 + t * stride; - for (v = 0; v < t; v ++, r1 += stride, r2 += stride) { - uint32_t x, y; - - x = *r1; - y = *r2; - *r1 = modp_add(x, y, p); - *r2 = modp_montymul( - modp_sub(x, y, p), s, p, p0i);; - } - } - t = dt; - } - - /* - * We need 1/n in Montgomery representation, i.e. R/n. Since - * 1 <= logn <= 10, R/n is an integer; morever, R/n <= 2^30 < p, - * thus a simple shift will do. - */ - ni = (uint32_t)1 << (31 - logn); - for (k = 0, r = a; k < n; k ++, r += stride) { - *r = modp_montymul(*r, ni, p, p0i); - } -} - -/* - * Simplified macros for NTT and iNTT (binary case) when the elements - * are consecutive in RAM. - */ -#define modp_NTT2(a, gm, logn, p, p0i) modp_NTT2_ext(a, 1, gm, logn, p, p0i) -#define modp_iNTT2(a, igm, logn, p, p0i) modp_iNTT2_ext(a, 1, igm, logn, p, p0i) - -/* - * Given polynomial f in NTT representation modulo p, compute f' of degree - * less than N/2 such that f' = f0^2 - X*f1^2, where f0 and f1 are - * polynomials of degree less than N/2 such that f = f0(X^2) + X*f1(X^2). - * - * The new polynomial is written "in place" over the first N/2 elements - * of f. - * - * If applied logn times successively on a given polynomial, the resulting - * degree-0 polynomial is the resultant of f and X^N+1 modulo p. - * - * This function applies only to the binary case; it is invoked from - * solve_NTRU_binary_depth1(). 
- */ -static void -modp_poly_rec_res(uint32_t *f, unsigned logn, - uint32_t p, uint32_t p0i, uint32_t R2) -{ - size_t hn, u; - - hn = (size_t)1 << (logn - 1); - for (u = 0; u < hn; u ++) { - uint32_t w0, w1; - - w0 = f[(u << 1) + 0]; - w1 = f[(u << 1) + 1]; - f[u] = modp_montymul(modp_montymul(w0, w1, p, p0i), R2, p, p0i); - } -} - -/* ==================================================================== */ -/* - * Custom bignum implementation. - * - * This is a very reduced set of functionalities. We need to do the - * following operations: - * - * - Rebuild the resultant and the polynomial coefficients from their - * values modulo small primes (of length 31 bits each). - * - * - Compute an extended GCD between the two computed resultants. - * - * - Extract top bits and add scaled values during the successive steps - * of Babai rounding. - * - * When rebuilding values using CRT, we must also recompute the product - * of the small prime factors. We always do it one small factor at a - * time, so the "complicated" operations can be done modulo the small - * prime with the modp_* functions. CRT coefficients (inverses) are - * precomputed. - * - * All values are positive until the last step: when the polynomial - * coefficients have been rebuilt, we normalize them around 0. But then, - * only additions and subtractions on the upper few bits are needed - * afterwards. - * - * We keep big integers as arrays of 31-bit words (in uint32_t values); - * the top bit of each uint32_t is kept equal to 0. Using 31-bit words - * makes it easier to keep track of carries. When negative values are - * used, two's complement is used. - */ - -/* - * Subtract integer b from integer a. Both integers are supposed to have - * the same size. The carry (0 or 1) is returned. Source arrays a and b - * MUST be distinct. - * - * The operation is performed as described above if ctr = 1. 
If - * ctl = 0, the value a[] is unmodified, but all memory accesses are - * still performed, and the carry is computed and returned. - */ -static uint32_t -zint_sub(uint32_t *restrict a, const uint32_t *restrict b, size_t len, - uint32_t ctl) -{ - size_t u; - uint32_t cc, m; - - cc = 0; - m = -ctl; - for (u = 0; u < len; u ++) { - uint32_t aw, w; - - aw = a[u]; - w = aw - b[u] - cc; - cc = w >> 31; - aw ^= ((w & 0x7FFFFFFF) ^ aw) & m; - a[u] = aw; - } - return cc; -} - -/* - * Mutiply the provided big integer m with a small value x. - * This function assumes that x < 2^31. The carry word is returned. - */ -static uint32_t -zint_mul_small(uint32_t *m, size_t mlen, uint32_t x) -{ - size_t u; - uint32_t cc; - - cc = 0; - for (u = 0; u < mlen; u ++) { - uint64_t z; - - z = (uint64_t)m[u] * (uint64_t)x + cc; - m[u] = (uint32_t)z & 0x7FFFFFFF; - cc = (uint32_t)(z >> 31); - } - return cc; -} - -/* - * Reduce a big integer d modulo a small integer p. - * Rules: - * d is unsigned - * p is prime - * 2^30 < p < 2^31 - * p0i = -(1/p) mod 2^31 - * R2 = 2^62 mod p - */ -static uint32_t -zint_mod_small_unsigned(const uint32_t *d, size_t dlen, - uint32_t p, uint32_t p0i, uint32_t R2) -{ - uint32_t x; - size_t u; - - /* - * Algorithm: we inject words one by one, starting with the high - * word. Each step is: - * - multiply x by 2^31 - * - add new word - */ - x = 0; - u = dlen; - while (u -- > 0) { - uint32_t w; - - x = modp_montymul(x, R2, p, p0i); - w = d[u] - p; - w += p & -(w >> 31); - x = modp_add(x, w, p); - } - return x; -} - -/* - * Similar to zint_mod_small_unsigned(), except that d may be signed. - * Extra parameter is Rx = 2^(31*dlen) mod p. - */ -static uint32_t -zint_mod_small_signed(const uint32_t *d, size_t dlen, - uint32_t p, uint32_t p0i, uint32_t R2, uint32_t Rx) -{ - uint32_t z; - - if (dlen == 0) { - return 0; - } - z = zint_mod_small_unsigned(d, dlen, p, p0i, R2); - z = modp_sub(z, Rx & -(d[dlen - 1] >> 30), p); - return z; -} - -/* - * Add y*s to x. 
x and y initially have length 'len' words; the new x - * has length 'len+1' words. 's' must fit on 31 bits. x[] and y[] must - * not overlap. - */ -static void -zint_add_mul_small(uint32_t *restrict x, - const uint32_t *restrict y, size_t len, uint32_t s) -{ - size_t u; - uint32_t cc; - - cc = 0; - for (u = 0; u < len; u ++) { - uint32_t xw, yw; - uint64_t z; - - xw = x[u]; - yw = y[u]; - z = (uint64_t)yw * (uint64_t)s + (uint64_t)xw + (uint64_t)cc; - x[u] = (uint32_t)z & 0x7FFFFFFF; - cc = (uint32_t)(z >> 31); - } - x[len] = cc; -} - -/* - * Normalize a modular integer around 0: if x > p/2, then x is replaced - * with x - p (signed encoding with two's complement); otherwise, x is - * untouched. The two integers x and p are encoded over the same length. - */ -static void -zint_norm_zero(uint32_t *restrict x, const uint32_t *restrict p, size_t len) -{ - size_t u; - uint32_t r, bb; - - /* - * Compare x with p/2. We use the shifted version of p, and p - * is odd, so we really compare with (p-1)/2; we want to perform - * the subtraction if and only if x > (p-1)/2. - */ - r = 0; - bb = 0; - u = len; - while (u -- > 0) { - uint32_t wx, wp, cc; - - /* - * Get the two words to compare in wx and wp (both over - * 31 bits exactly). - */ - wx = x[u]; - wp = (p[u] >> 1) | (bb << 30); - bb = p[u] & 1; - - /* - * We set cc to -1, 0 or 1, depending on whether wp is - * lower than, equal to, or greater than wx. - */ - cc = wp - wx; - cc = ((-cc) >> 31) | -(cc >> 31); - - /* - * If r != 0 then it is either 1 or -1, and we keep its - * value. Otherwise, if r = 0, then we replace it with cc. - */ - r |= cc & ((r & 1) - 1); - } - - /* - * At this point, r = -1, 0 or 1, depending on whether (p-1)/2 - * is lower than, equal to, or greater than x. We thus want to - * do the subtraction only if r = -1. - */ - zint_sub(x, p, len, r >> 31); -} - -/* - * Rebuild integers from their RNS representation. There are 'num' - * integers, and each consists in 'xlen' words. 
'xx' points at that - * first word of the first integer; subsequent integers are accessed - * by adding 'xstride' repeatedly. - * - * The words of an integer are the RNS representation of that integer, - * using the provided 'primes' are moduli. This function replaces - * each integer with its multi-word value (little-endian order). - * - * If "normalize_signed" is non-zero, then the returned value is - * normalized to the -m/2..m/2 interval (where m is the product of all - * small prime moduli); two's complement is used for negative values. - */ -static void -zint_rebuild_CRT(uint32_t *restrict xx, size_t xlen, size_t xstride, - size_t num, const small_prime *primes, int normalize_signed, - uint32_t *restrict tmp) -{ - size_t u; - uint32_t *x; - - tmp[0] = primes[0].p; - for (u = 1; u < xlen; u ++) { - /* - * At the entry of each loop iteration: - * - the first u words of each array have been - * reassembled; - * - the first u words of tmp[] contains the - * product of the prime moduli processed so far. - * - * We call 'q' the product of all previous primes. - */ - uint32_t p, p0i, s, R2; - size_t v; - - p = primes[u].p; - s = primes[u].s; - p0i = modp_ninv31(p); - R2 = modp_R2(p, p0i); - - for (v = 0, x = xx; v < num; v ++, x += xstride) { - uint32_t xp, xq, xr; - /* - * xp = the integer x modulo the prime p for this - * iteration - * xq = (x mod q) mod p - */ - xp = x[u]; - xq = zint_mod_small_unsigned(x, u, p, p0i, R2); - - /* - * New value is (x mod q) + q * (s * (xp - xq) mod p) - */ - xr = modp_montymul(s, modp_sub(xp, xq, p), p, p0i); - zint_add_mul_small(x, tmp, u, xr); - } - - /* - * Update product of primes in tmp[]. - */ - tmp[u] = zint_mul_small(tmp, u, p); - } - - /* - * Normalize the reconstructed values around 0. - */ - if (normalize_signed) { - for (u = 0, x = xx; u < num; u ++, x += xstride) { - zint_norm_zero(x, tmp, xlen); - } - } -} - -/* - * Negate a big integer conditionally: value a is replaced with -a if - * and only if ctl = 1. 
Control value ctl must be 0 or 1. - */ -static void -zint_negate(uint32_t *a, size_t len, uint32_t ctl) -{ - size_t u; - uint32_t cc, m; - - /* - * If ctl = 1 then we flip the bits of a by XORing with - * 0x7FFFFFFF, and we add 1 to the value. If ctl = 0 then we XOR - * with 0 and add 0, which leaves the value unchanged. - */ - cc = ctl; - m = -ctl >> 1; - for (u = 0; u < len; u ++) { - uint32_t aw; - - aw = a[u]; - aw = (aw ^ m) + cc; - a[u] = aw & 0x7FFFFFFF; - cc = aw >> 31; - } -} - -/* - * Replace a with (a*xa+b*xb)/(2^31) and b with (a*ya+b*yb)/(2^31). - * The low bits are dropped (the caller should compute the coefficients - * such that these dropped bits are all zeros). If either or both - * yields a negative value, then the value is negated. - * - * Returned value is: - * 0 both values were positive - * 1 new a had to be negated - * 2 new b had to be negated - * 3 both new a and new b had to be negated - * - * Coefficients xa, xb, ya and yb may use the full signed 32-bit range. - */ -static uint32_t -zint_co_reduce(uint32_t *a, uint32_t *b, size_t len, - int64_t xa, int64_t xb, int64_t ya, int64_t yb) -{ - size_t u; - int64_t cca, ccb; - uint32_t nega, negb; - - cca = 0; - ccb = 0; - for (u = 0; u < len; u ++) { - uint32_t wa, wb; - uint64_t za, zb; - - wa = a[u]; - wb = b[u]; - za = wa * (uint64_t)xa + wb * (uint64_t)xb + (uint64_t)cca; - zb = wa * (uint64_t)ya + wb * (uint64_t)yb + (uint64_t)ccb; - if (u > 0) { - a[u - 1] = (uint32_t)za & 0x7FFFFFFF; - b[u - 1] = (uint32_t)zb & 0x7FFFFFFF; - } - cca = *(int64_t *)&za >> 31; - ccb = *(int64_t *)&zb >> 31; - } - a[len - 1] = (uint32_t)cca; - b[len - 1] = (uint32_t)ccb; - - nega = (uint32_t)((uint64_t)cca >> 63); - negb = (uint32_t)((uint64_t)ccb >> 63); - zint_negate(a, len, nega); - zint_negate(b, len, negb); - return nega | (negb << 1); -} - -/* - * Finish modular reduction. 
Rules on input parameters: - * - * if neg = 1, then -m <= a < 0 - * if neg = 0, then 0 <= a < 2*m - * - * If neg = 0, then the top word of a[] is allowed to use 32 bits. - * - * Modulus m must be odd. - */ -static void -zint_finish_mod(uint32_t *a, size_t len, const uint32_t *m, uint32_t neg) -{ - size_t u; - uint32_t cc, xm, ym; - - /* - * First pass: compare a (assumed nonnegative) with m. Note that - * if the top word uses 32 bits, subtracting m must yield a - * value less than 2^31 since a < 2*m. - */ - cc = 0; - for (u = 0; u < len; u ++) { - cc = (a[u] - m[u] - cc) >> 31; - } - - /* - * If neg = 1 then we must add m (regardless of cc) - * If neg = 0 and cc = 0 then we must subtract m - * If neg = 0 and cc = 1 then we must do nothing - * - * In the loop below, we conditionally subtract either m or -m - * from a. Word xm is a word of m (if neg = 0) or -m (if neg = 1); - * but if neg = 0 and cc = 1, then ym = 0 and it forces mw to 0. - */ - xm = -neg >> 1; - ym = -(neg | (1 - cc)); - cc = neg; - for (u = 0; u < len; u ++) { - uint32_t aw, mw; - - aw = a[u]; - mw = (m[u] ^ xm) & ym; - aw = aw - mw - cc; - a[u] = aw & 0x7FFFFFFF; - cc = aw >> 31; - } -} - -/* - * Replace a with (a*xa+b*xb)/(2^31) mod m, and b with - * (a*ya+b*yb)/(2^31) mod m. Modulus m must be odd; m0i = -1/m[0] mod 2^31. - */ -static void -zint_co_reduce_mod(uint32_t *a, uint32_t *b, const uint32_t *m, size_t len, - uint32_t m0i, int64_t xa, int64_t xb, int64_t ya, int64_t yb) -{ - size_t u; - int64_t cca, ccb; - uint32_t fa, fb; - - /* - * These are actually four combined Montgomery multiplications. 
- */ - cca = 0; - ccb = 0; - fa = ((a[0] * (uint32_t)xa + b[0] * (uint32_t)xb) * m0i) & 0x7FFFFFFF; - fb = ((a[0] * (uint32_t)ya + b[0] * (uint32_t)yb) * m0i) & 0x7FFFFFFF; - for (u = 0; u < len; u ++) { - uint32_t wa, wb; - uint64_t za, zb; - - wa = a[u]; - wb = b[u]; - za = wa * (uint64_t)xa + wb * (uint64_t)xb - + m[u] * (uint64_t)fa + (uint64_t)cca; - zb = wa * (uint64_t)ya + wb * (uint64_t)yb - + m[u] * (uint64_t)fb + (uint64_t)ccb; - if (u > 0) { - a[u - 1] = (uint32_t)za & 0x7FFFFFFF; - b[u - 1] = (uint32_t)zb & 0x7FFFFFFF; - } - cca = *(int64_t *)&za >> 31; - ccb = *(int64_t *)&zb >> 31; - } - a[len - 1] = (uint32_t)cca; - b[len - 1] = (uint32_t)ccb; - - /* - * At this point: - * -m <= a < 2*m - * -m <= b < 2*m - * (this is a case of Montgomery reduction) - * The top words of 'a' and 'b' may have a 32-th bit set. - * We want to add or subtract the modulus, as required. - */ - zint_finish_mod(a, len, m, (uint32_t)((uint64_t)cca >> 63)); - zint_finish_mod(b, len, m, (uint32_t)((uint64_t)ccb >> 63)); -} - -/* - * Compute a GCD between two positive big integers x and y. The two - * integers must be odd. Returned value is 1 if the GCD is 1, 0 - * otherwise. When 1 is returned, arrays u and v are filled with values - * such that: - * 0 <= u <= y - * 0 <= v <= x - * x*u - y*v = 1 - * x[] and y[] are unmodified. Both input values must have the same - * encoded length. Temporary array must be large enough to accommodate 4 - * extra values of that length. Arrays u, v and tmp may not overlap with - * each other, or with either x or y. - */ -static int -zint_bezout(uint32_t *restrict u, uint32_t *restrict v, - const uint32_t *restrict x, const uint32_t *restrict y, - size_t len, uint32_t *restrict tmp) -{ - /* - * Algorithm is an extended binary GCD. 
We maintain 6 values - * a, b, u0, u1, v0 and v1 with the following invariants: - * - * a = x*u0 - y*v0 - * b = x*u1 - y*v1 - * 0 <= a <= x - * 0 <= b <= y - * 0 <= u0 < y - * 0 <= v0 < x - * 0 <= u1 <= y - * 0 <= v1 < x - * - * Initial values are: - * - * a = x u0 = 1 v0 = 0 - * b = y u1 = y v1 = x-1 - * - * Each iteration reduces either a or b, and maintains the - * invariants. Algorithm stops when a = b, at which point their - * common value is GCD(a,b) and (u0,v0) (or (u1,v1)) contains - * the values (u,v) we want to return. - * - * The formal definition of the algorithm is a sequence of steps: - * - * - If a is even, then: - * a <- a/2 - * u0 <- u0/2 mod y - * v0 <- v0/2 mod x - * - * - Otherwise, if b is even, then: - * b <- b/2 - * u1 <- u1/2 mod y - * v1 <- v1/2 mod x - * - * - Otherwise, if a > b, then: - * a <- (a-b)/2 - * u0 <- (u0-u1)/2 mod y - * v0 <- (v0-v1)/2 mod x - * - * - Otherwise: - * b <- (b-a)/2 - * u1 <- (u1-u0)/2 mod y - * v1 <- (v1-v0)/2 mod y - * - * We can show that the operations above preserve the invariants: - * - * - If a is even, then u0 and v0 are either both even or both - * odd (since a = x*u0 - y*v0, and x and y are both odd). - * If u0 and v0 are both even, then (u0,v0) <- (u0/2,v0/2). - * Otherwise, (u0,v0) <- ((u0+y)/2,(v0+x)/2). Either way, - * the a = x*u0 - y*v0 invariant is preserved. - * - * - The same holds for the case where b is even. - * - * - If a and b are odd, and a > b, then: - * - * a-b = x*(u0-u1) - y*(v0-v1) - * - * In that situation, if u0 < u1, then x*(u0-u1) < 0, but - * a-b > 0; therefore, it must be that v0 < v1, and the - * first part of the update is: (u0,v0) <- (u0-u1+y,v0-v1+x), - * which preserves the invariants. Otherwise, if u0 > u1, - * then u0-u1 >= 1, thus x*(u0-u1) >= x. But a <= x and - * b >= 0, hence a-b <= x. It follows that, in that case, - * v0-v1 >= 0. The first part of the update is then: - * (u0,v0) <- (u0-u1,v0-v1), which again preserves the - * invariants. 
- * - * Either way, once the subtraction is done, the new value of - * a, which is the difference of two odd values, is even, - * and the remaining of this step is a subcase of the - * first algorithm case (i.e. when a is even). - * - * - If a and b are odd, and b > a, then the a similar - * argument holds. - * - * The values a and b start at x and y, respectively. Since x - * and y are odd, their GCD is odd, and it is easily seen that - * all steps conserve the GCD (GCD(a-b,b) = GCD(a, b); - * GCD(a/2,b) = GCD(a,b) if GCD(a,b) is odd). Moreover, either a - * or b is reduced by at least one bit at each iteration, so - * the algorithm necessarily converges on the case a = b, at - * which point the common value is the GCD. - * - * In the algorithm expressed above, when a = b, the fourth case - * applies, and sets b = 0. Since a contains the GCD of x and y, - * which are both odd, a must be odd, and subsequent iterations - * (if any) will simply divide b by 2 repeatedly, which has no - * consequence. Thus, the algorithm can run for more iterations - * than necessary; the final GCD will be in a, and the (u,v) - * coefficients will be (u0,v0). - * - * - * The presentation above is bit-by-bit. It can be sped up by - * noticing that all decisions are taken based on the low bits - * and high bits of a and b. We can extract the two top words - * and low word of each of a and b, and compute reduction - * parameters pa, pb, qa and qb such that the new values for - * a and b are: - * a' = (a*pa + b*pb) / (2^31) - * b' = (a*qa + b*qb) / (2^31) - * the two divisions being exact. The coefficients are obtained - * just from the extracted words, and may be slightly off, requiring - * an optional correction: if a' < 0, then we replace pa with -pa - * and pb with -pb. Each such step will reduce the total length - * (sum of lengths of a and b) by at least 30 bits at each - * iteration. 
- */ - uint32_t *u0, *u1, *v0, *v1, *a, *b; - uint32_t x0i, y0i; - uint32_t num, rc; - size_t j; - - if (len == 0) { - return 0; - } - - /* - * u0 and v0 are the u and v result buffers; the four other - * values (u1, v1, a and b) are taken from tmp[]. - */ - u0 = u; - v0 = v; - u1 = tmp; - v1 = u1 + len; - a = v1 + len; - b = a + len; - - /* - * We'll need the Montgomery reduction coefficients. - */ - x0i = modp_ninv31(x[0]); - y0i = modp_ninv31(y[0]); - - /* - * Initialize a, b, u0, u1, v0 and v1. - * a = x u0 = 1 v0 = 0 - * b = y u1 = y v1 = x-1 - * Note that x is odd, so computing x-1 is easy. - */ - memcpy(a, x, len * sizeof *x); - memcpy(b, y, len * sizeof *y); - u0[0] = 1; - memset(u0 + 1, 0, (len - 1) * sizeof *u0); - memset(v0, 0, len * sizeof *v0); - memcpy(u1, y, len * sizeof *u1); - memcpy(v1, x, len * sizeof *v1); - v1[0] --; - - /* - * Each input operand may be as large as 31*len bits, and we - * reduce the total length by at least 30 bits at each iteration. - */ - for (num = 62 * (uint32_t)len + 30; num >= 30; num -= 30) { - uint32_t c0, c1; - uint32_t a0, a1, b0, b1; - uint64_t a_hi, b_hi; - uint32_t a_lo, b_lo; - int64_t pa, pb, qa, qb; - int i; - uint32_t r; - - /* - * Extract the top words of a and b. If j is the highest - * index >= 1 such that a[j] != 0 or b[j] != 0, then we - * want (a[j] << 31) + a[j-1] and (b[j] << 31) + b[j-1]. - * If a and b are down to one word each, then we use - * a[0] and b[0]. - */ - c0 = (uint32_t)-1; - c1 = (uint32_t)-1; - a0 = 0; - a1 = 0; - b0 = 0; - b1 = 0; - j = len; - while (j -- > 0) { - uint32_t aw, bw; - - aw = a[j]; - bw = b[j]; - a0 ^= (a0 ^ aw) & c0; - a1 ^= (a1 ^ aw) & c1; - b0 ^= (b0 ^ bw) & c0; - b1 ^= (b1 ^ bw) & c1; - c1 = c0; - c0 &= (((aw | bw) + 0x7FFFFFFF) >> 31) - (uint32_t)1; - } - - /* - * If c1 = 0, then we grabbed two words for a and b. - * If c1 != 0 but c0 = 0, then we grabbed one word. It - * is not possible that c1 != 0 and c0 != 0, because that - * would mean that both integers are zero. 
- */ - a1 |= a0 & c1; - a0 &= ~c1; - b1 |= b0 & c1; - b0 &= ~c1; - a_hi = ((uint64_t)a0 << 31) + a1; - b_hi = ((uint64_t)b0 << 31) + b1; - a_lo = a[0]; - b_lo = b[0]; - - /* - * Compute reduction factors: - * - * a' = a*pa + b*pb - * b' = a*qa + b*qb - * - * such that a' and b' are both multiple of 2^31, but are - * only marginally larger than a and b. - */ - pa = 1; - pb = 0; - qa = 0; - qb = 1; - for (i = 0; i < 31; i ++) { - /* - * At each iteration: - * - * a <- (a-b)/2 if: a is odd, b is odd, a_hi > b_hi - * b <- (b-a)/2 if: a is odd, b is odd, a_hi <= b_hi - * a <- a/2 if: a is even - * b <- b/2 if: a is odd, b is even - * - * We multiply a_lo and b_lo by 2 at each - * iteration, thus a division by 2 really is a - * non-multiplication by 2. - */ - uint32_t rt, oa, ob, cAB, cBA, cA; - uint64_t rz; - - /* - * rt = 1 if a_hi > b_hi, 0 otherwise. - */ - rz = b_hi - a_hi; - rt = (uint32_t)((rz ^ ((a_hi ^ b_hi) - & (a_hi ^ rz))) >> 63); - - /* - * cAB = 1 if b must be subtracted from a - * cBA = 1 if a must be subtracted from b - * cA = 1 if a must be divided by 2 - * - * Rules: - * - * cAB and cBA cannot both be 1. - * If a is not divided by 2, b is. - */ - oa = (a_lo >> i) & 1; - ob = (b_lo >> i) & 1; - cAB = oa & ob & rt; - cBA = oa & ob & ~rt; - cA = cAB | (oa ^ 1); - - /* - * Conditional subtractions. - */ - a_lo -= b_lo & -cAB; - a_hi -= b_hi & -(uint64_t)cAB; - pa -= qa & -(int64_t)cAB; - pb -= qb & -(int64_t)cAB; - b_lo -= a_lo & -cBA; - b_hi -= a_hi & -(uint64_t)cBA; - qa -= pa & -(int64_t)cBA; - qb -= pb & -(int64_t)cBA; - - /* - * Shifting. - */ - a_lo += a_lo & (cA - 1); - pa += pa & ((int64_t)cA - 1); - pb += pb & ((int64_t)cA - 1); - a_hi ^= (a_hi ^ (a_hi >> 1)) & -(uint64_t)cA; - b_lo += b_lo & -cA; - qa += qa & -(int64_t)cA; - qb += qb & -(int64_t)cA; - b_hi ^= (b_hi ^ (b_hi >> 1)) & ((uint64_t)cA - 1); - } - - /* - * Apply the computed parameters to our values. 
We - * may have to correct pa and pb depending on the - * returned value of zint_co_reduce() (when a and/or b - * had to be negated). - */ - r = zint_co_reduce(a, b, len, pa, pb, qa, qb); - pa -= (pa + pa) & -(int64_t)(r & 1); - pb -= (pb + pb) & -(int64_t)(r & 1); - qa -= (qa + qa) & -(int64_t)(r >> 1); - qb -= (qb + qb) & -(int64_t)(r >> 1); - zint_co_reduce_mod(u0, u1, y, len, y0i, pa, pb, qa, qb); - zint_co_reduce_mod(v0, v1, x, len, x0i, pa, pb, qa, qb); - } - - /* - * At that point, array a[] should contain the GCD, and the - * results (u,v) should already be set. We check that the GCD - * is indeed 1. We also check that the two operands x and y - * are odd. - */ - rc = a[0] ^ 1; - for (j = 1; j < len; j ++) { - rc |= a[j]; - } - return (int)((1 - ((rc | -rc) >> 31)) & x[0] & y[0]); -} - -/* - * Add k*y*2^sc to x. The result is assumed to fit in the array of - * size xlen (truncation is applied if necessary). - * Scale factor 'sc' is provided as sch and scl, such that: - * sch = sc / 31 - * scl = sc % 31 - * xlen MUST NOT be lower than ylen. - * - * x[] and y[] are both signed integers, using two's complement for - * negative values. - */ -static void -zint_add_scaled_mul_small(uint32_t *restrict x, size_t xlen, - const uint32_t *restrict y, size_t ylen, int32_t k, - uint32_t sch, uint32_t scl) -{ - size_t u; - uint32_t ysign, tw; - int32_t cc; - - if (ylen == 0) { - return; - } - - ysign = -(y[ylen - 1] >> 30) >> 1; - tw = 0; - cc = 0; - for (u = sch; u < xlen; u ++) { - size_t v; - uint32_t wy, wys, ccu; - uint64_t z; - - /* - * Get the next word of y (scaled). - */ - v = u - sch; - wy = v < ylen ? y[v] : ysign; - wys = ((wy << scl) & 0x7FFFFFFF) | tw; - tw = wy >> (31 - scl); - - /* - * The expression below does not overflow. 
- */ - z = (uint64_t)((int64_t)wys * (int64_t)k + (int64_t)x[u] + cc); - x[u] = (uint32_t)z & 0x7FFFFFFF; - - /* - * Right-shifting the signed value z would yield - * implementation-defined results (arithmetic shift is - * not guaranteed). However, we can cast to unsigned, - * and get the next carry as an unsigned word. We can - * then convert it back to signed by using the guaranteed - * fact that 'int32_t' uses two's complement with no - * trap representation or padding bit, and with a layout - * compatible with that of 'uint32_t'. - */ - ccu = (uint32_t)(z >> 31); - cc = *(int32_t *)&ccu; - } -} - -/* - * Subtract y*2^sc from x. The result is assumed to fit in the array of - * size xlen (truncation is applied if necessary). - * Scale factor 'sc' is provided as sch and scl, such that: - * sch = sc / 31 - * scl = sc % 31 - * xlen MUST NOT be lower than ylen. - * - * x[] and y[] are both signed integers, using two's complement for - * negative values. - */ -static void -zint_sub_scaled(uint32_t *restrict x, size_t xlen, - const uint32_t *restrict y, size_t ylen, uint32_t sch, uint32_t scl) -{ - size_t u; - uint32_t ysign, tw; - uint32_t cc; - - if (ylen == 0) { - return; - } - - ysign = -(y[ylen - 1] >> 30) >> 1; - tw = 0; - cc = 0; - for (u = sch; u < xlen; u ++) { - size_t v; - uint32_t w, wy, wys; - - /* - * Get the next word of y (scaled). - */ - v = u - sch; - wy = v < ylen ? y[v] : ysign; - wys = ((wy << scl) & 0x7FFFFFFF) | tw; - tw = wy >> (31 - scl); - - w = x[u] - wys - cc; - x[u] = w & 0x7FFFFFFF; - cc = w >> 31; - } -} - -/* - * Convert a one-word signed big integer into a signed value. - */ -static inline int32_t -zint_one_to_plain(const uint32_t *x) -{ - uint32_t w; - - w = x[0]; - w |= (w & 0x40000000) << 1; - return *(int32_t *)&w; -} - -/* ==================================================================== */ - -/* - * Convert a polynomial to floating-point values. 
- * - * Each coefficient has length flen words, and starts fstride words after - * the previous. - * - * IEEE-754 binary64 values can represent values in a finite range, - * roughly 2^(-1023) to 2^(+1023); thus, if coefficients are too large, - * they should be "trimmed" by pointing not to the lowest word of each, - * but upper. - */ -static void -poly_big_to_fp(fpr *d, const uint32_t *f, size_t flen, size_t fstride, - unsigned logn) -{ - size_t n, u; - - n = MKN(logn); - if (flen == 0) { - for (u = 0; u < n; u ++) { - d[u] = fpr_zero; - } - return; - } - for (u = 0; u < n; u ++, f += fstride) { - size_t v; - uint32_t neg, cc, xm; - fpr x, fsc; - - /* - * Get sign of the integer; if it is negative, then we - * will load its absolute value instead, and negate the - * result. - */ - neg = -(f[flen - 1] >> 30); - xm = neg >> 1; - cc = neg & 1; - x = fpr_zero; - fsc = fpr_one; - for (v = 0; v < flen; v ++, fsc = fpr_mul(fsc, fpr_ptwo31)) { - uint32_t w; - - w = (f[v] ^ xm) + cc; - cc = w >> 31; - w &= 0x7FFFFFFF; - w -= (w << 1) & neg; - x = fpr_add(x, fpr_mul(fpr_of(*(int32_t *)&w), fsc)); - } - d[u] = x; - } -} - -/* - * Convert a polynomial to small integers. Source values are supposed - * to be one-word integers, signed over 31 bits. Returned value is 0 - * if any of the coefficients exceeds the provided limit (in absolute - * value), or 1 on success. - * - * This is not constant-time; this is not a problem here, because on - * any failure, the NTRU-solving process will be deemed to have failed - * and the (f,g) polynomials will be discarded. - */ -static int -poly_big_to_small(int8_t *d, const uint32_t *s, int lim, unsigned logn) -{ - size_t n, u; - - n = MKN(logn); - for (u = 0; u < n; u ++) { - int32_t z; - - z = zint_one_to_plain(s + u); - if (z < -lim || z > lim) { - return 0; - } - d[u] = (int8_t)z; - } - return 1; -} - -/* - * Subtract k*f from F, where F, f and k are polynomials modulo X^N+1. 
- * Coefficients of polynomial k are small integers (signed values in the - * -2^31..2^31 range) scaled by 2^sc. Value sc is provided as sch = sc / 31 - * and scl = sc % 31. - * - * This function implements the basic quadratic multiplication algorithm, - * which is efficient in space (no extra buffer needed) but slow at - * high degree. - */ -static void -poly_sub_scaled(uint32_t *restrict F, size_t Flen, size_t Fstride, - const uint32_t *restrict f, size_t flen, size_t fstride, - const int32_t *restrict k, uint32_t sch, uint32_t scl, unsigned logn) -{ - size_t n, u; - - n = MKN(logn); - for (u = 0; u < n; u ++) { - int32_t kf; - size_t v; - uint32_t *x; - const uint32_t *y; - - kf = -k[u]; - x = F + u * Fstride; - y = f; - for (v = 0; v < n; v ++) { - zint_add_scaled_mul_small( - x, Flen, y, flen, kf, sch, scl); - if (u + v == n - 1) { - x = F; - kf = -kf; - } else { - x += Fstride; - } - y += fstride; - } - } -} - -/* - * Subtract k*f from F. Coefficients of polynomial k are small integers - * (signed values in the -2^31..2^31 range) scaled by 2^sc. This function - * assumes that the degree is large, and integers relatively small. - * The value sc is provided as sch = sc / 31 and scl = sc % 31. - */ -static void -poly_sub_scaled_ntt(uint32_t *restrict F, size_t Flen, size_t Fstride, - const uint32_t *restrict f, size_t flen, size_t fstride, - const int32_t *restrict k, uint32_t sch, uint32_t scl, unsigned logn, - uint32_t *restrict tmp) -{ - uint32_t *gm, *igm, *fk, *t1, *x; - const uint32_t *y; - size_t n, u, tlen; - const small_prime *primes; - - n = MKN(logn); - tlen = flen + 1; - gm = tmp; - igm = gm + MKN(logn); - fk = igm + MKN(logn); - t1 = fk + n * tlen; - - primes = PRIMES; - - /* - * Compute k*f in fk[], in RNS notation. 
- */ - for (u = 0; u < tlen; u ++) { - uint32_t p, p0i, R2, Rx; - size_t v; - - p = primes[u].p; - p0i = modp_ninv31(p); - R2 = modp_R2(p, p0i); - Rx = modp_Rx((unsigned)flen, p, p0i, R2); - modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i); - - for (v = 0; v < n; v ++) { - t1[v] = modp_set(k[v], p); - } - modp_NTT2(t1, gm, logn, p, p0i); - for (v = 0, y = f, x = fk + u; - v < n; v ++, y += fstride, x += tlen) - { - *x = zint_mod_small_signed(y, flen, p, p0i, R2, Rx); - } - modp_NTT2_ext(fk + u, tlen, gm, logn, p, p0i); - for (v = 0, x = fk + u; v < n; v ++, x += tlen) { - *x = modp_montymul( - modp_montymul(t1[v], *x, p, p0i), R2, p, p0i); - } - modp_iNTT2_ext(fk + u, tlen, igm, logn, p, p0i); - } - - /* - * Rebuild k*f. - */ - zint_rebuild_CRT(fk, tlen, tlen, n, primes, 1, t1); - - /* - * Subtract k*f, scaled, from F. - */ - for (u = 0, x = F, y = fk; u < n; u ++, x += Fstride, y += tlen) { - zint_sub_scaled(x, Flen, y, tlen, sch, scl); - } -} - -/* ==================================================================== */ - -#if FALCON_KG_CHACHA20 // yyyKG_CHACHA20+1 - -#define RNG_CONTEXT prng -#define get_rng_u64 prng_get_u64 - -#else // yyyKG_CHACHA20+0 - -#define RNG_CONTEXT inner_shake256_context - -/* - * Get a random 8-byte integer from a SHAKE-based RNG. This function - * ensures consistent interpretation of the SHAKE output so that - * the same values will be obtained over different platforms, in case - * a known seed is used. - */ -static inline uint64_t -get_rng_u64(inner_shake256_context *rng) -{ - /* - * We enforce little-endian representation. - */ - -#if FALCON_LE // yyyLE+1 - /* - * On little-endian systems we just interpret the bytes "as is" - * (this is correct because the exact-width types such as - * 'uint64_t' are guaranteed to have no padding and no trap - * representation). 
- */ - uint64_t r; - - inner_shake256_extract(rng, (uint8_t *)&r, sizeof r); - return r; -#else // yyyLE+0 - uint8_t tmp[8]; - - inner_shake256_extract(rng, tmp, sizeof tmp); - return (uint64_t)tmp[0] - | ((uint64_t)tmp[1] << 8) - | ((uint64_t)tmp[2] << 16) - | ((uint64_t)tmp[3] << 24) - | ((uint64_t)tmp[4] << 32) - | ((uint64_t)tmp[5] << 40) - | ((uint64_t)tmp[6] << 48) - | ((uint64_t)tmp[7] << 56); -#endif // yyyLE- -} - -#endif // yyyKG_CHACHA20- - -/* - * Table below incarnates a discrete Gaussian distribution: - * D(x) = exp(-(x^2)/(2*sigma^2)) - * where sigma = 1.17*sqrt(q/(2*N)), q = 12289, and N = 1024. - * Element 0 of the table is P(x = 0). - * For k > 0, element k is P(x >= k+1 | x > 0). - * Probabilities are scaled up by 2^63. - */ -static const uint64_t gauss_1024_12289[] = { - 1283868770400643928u, 6416574995475331444u, 4078260278032692663u, - 2353523259288686585u, 1227179971273316331u, 575931623374121527u, - 242543240509105209u, 91437049221049666u, 30799446349977173u, - 9255276791179340u, 2478152334826140u, 590642893610164u, - 125206034929641u, 23590435911403u, 3948334035941u, - 586753615614u, 77391054539u, 9056793210u, - 940121950u, 86539696u, 7062824u, - 510971u, 32764u, 1862u, - 94u, 4u, 0u -}; - -/* - * Generate a random value with a Gaussian distribution centered on 0. - * The RNG must be ready for extraction (already flipped). - * - * Distribution has standard deviation 1.17*sqrt(q/(2*N)). The - * precomputed table is for N = 1024. Since the sum of two independent - * values of standard deviation sigma has standard deviation - * sigma*sqrt(2), then we can just generate more values and add them - * together for lower dimensions. - */ -static int -mkgauss(RNG_CONTEXT *rng, unsigned logn) -{ - unsigned u, g; - int val; - - g = 1U << (10 - logn); - val = 0; - for (u = 0; u < g; u ++) { - /* - * Each iteration generates one value with the - * Gaussian distribution for N = 1024. - * - * We use two random 64-bit values. 
First value - * decides on whether the generated value is 0, and, - * if not, the sign of the value. Second random 64-bit - * word is used to generate the non-zero value. - * - * For constant-time code we have to read the complete - * table. This has negligible cost, compared with the - * remainder of the keygen process (solving the NTRU - * equation). - */ - uint64_t r; - uint32_t f, v, k, neg; - - /* - * First value: - * - flag 'neg' is randomly selected to be 0 or 1. - * - flag 'f' is set to 1 if the generated value is zero, - * or set to 0 otherwise. - */ - r = get_rng_u64(rng); - neg = (uint32_t)(r >> 63); - r &= ~((uint64_t)1 << 63); - f = (uint32_t)((r - gauss_1024_12289[0]) >> 63); - - /* - * We produce a new random 63-bit integer r, and go over - * the array, starting at index 1. We store in v the - * index of the first array element which is not greater - * than r, unless the flag f was already 1. - */ - v = 0; - r = get_rng_u64(rng); - r &= ~((uint64_t)1 << 63); - for (k = 1; k < (sizeof gauss_1024_12289) - / (sizeof gauss_1024_12289[0]); k ++) - { - uint32_t t; - - t = (uint32_t)((r - gauss_1024_12289[k]) >> 63) ^ 1; - v |= k & -(t & (f ^ 1)); - f |= t; - } - - /* - * We apply the sign ('neg' flag). If the value is zero, - * the sign has no effect. - */ - v = (v ^ -neg) + neg; - - /* - * Generated value is added to val. - */ - val += *(int32_t *)&v; - } - return val; -} - -/* - * The MAX_BL_SMALL[] and MAX_BL_LARGE[] contain the lengths, in 31-bit - * words, of intermediate values in the computation: - * - * MAX_BL_SMALL[depth]: length for the input f and g at that depth - * MAX_BL_LARGE[depth]: length for the unreduced F and G at that depth - * - * Rules: - * - * - Within an array, values grow. - * - * - The 'SMALL' array must have an entry for maximum depth, corresponding - * to the size of values used in the binary GCD. There is no such value - * for the 'LARGE' array (the binary GCD yields already reduced - * coefficients). 
- * - * - MAX_BL_LARGE[depth] >= MAX_BL_SMALL[depth + 1]. - * - * - Values must be large enough to handle the common cases, with some - * margins. - * - * - Values must not be "too large" either because we will convert some - * integers into floating-point values by considering the top 10 words, - * i.e. 310 bits; hence, for values of length more than 10 words, we - * should take care to have the length centered on the expected size. - * - * The following average lengths, in bits, have been measured on thousands - * of random keys (fg = max length of the absolute value of coefficients - * of f and g at that depth; FG = idem for the unreduced F and G; for the - * maximum depth, F and G are the output of binary GCD, multiplied by q; - * for each value, the average and standard deviation are provided). - * - * Binary case: - * depth: 10 fg: 6307.52 (24.48) FG: 6319.66 (24.51) - * depth: 9 fg: 3138.35 (12.25) FG: 9403.29 (27.55) - * depth: 8 fg: 1576.87 ( 7.49) FG: 4703.30 (14.77) - * depth: 7 fg: 794.17 ( 4.98) FG: 2361.84 ( 9.31) - * depth: 6 fg: 400.67 ( 3.10) FG: 1188.68 ( 6.04) - * depth: 5 fg: 202.22 ( 1.87) FG: 599.81 ( 3.87) - * depth: 4 fg: 101.62 ( 1.02) FG: 303.49 ( 2.38) - * depth: 3 fg: 50.37 ( 0.53) FG: 153.65 ( 1.39) - * depth: 2 fg: 24.07 ( 0.25) FG: 78.20 ( 0.73) - * depth: 1 fg: 10.99 ( 0.08) FG: 39.82 ( 0.41) - * depth: 0 fg: 4.00 ( 0.00) FG: 19.61 ( 0.49) - * - * Integers are actually represented either in binary notation over - * 31-bit words (signed, using two's complement), or in RNS, modulo - * many small primes. These small primes are close to, but slightly - * lower than, 2^31. Use of RNS loses less than two bits, even for - * the largest values. - * - * IMPORTANT: if these values are modified, then the temporary buffer - * sizes (FALCON_KEYGEN_TEMP_*, in inner.h) must be recomputed - * accordingly. 
- */ - -static const size_t MAX_BL_SMALL[] = { - 1, 1, 2, 2, 4, 7, 14, 27, 53, 106, 209 -}; - -static const size_t MAX_BL_LARGE[] = { - 2, 2, 5, 7, 12, 21, 40, 78, 157, 308 -}; - -/* - * Average and standard deviation for the maximum size (in bits) of - * coefficients of (f,g), depending on depth. These values are used - * to compute bounds for Babai's reduction. - */ -static const struct { - int avg; - int std; -} BITLENGTH[] = { - { 4, 0 }, - { 11, 1 }, - { 24, 1 }, - { 50, 1 }, - { 102, 1 }, - { 202, 2 }, - { 401, 4 }, - { 794, 5 }, - { 1577, 8 }, - { 3138, 13 }, - { 6308, 25 } -}; - -/* - * Minimal recursion depth at which we rebuild intermediate values - * when reconstructing f and g. - */ -#define DEPTH_INT_FG 4 - -/* - * Compute squared norm of a short vector. Returned value is saturated to - * 2^32-1 if it is not lower than 2^31. - */ -static uint32_t -poly_small_sqnorm(const int8_t *f, unsigned logn) -{ - size_t n, u; - uint32_t s, ng; - - n = MKN(logn); - s = 0; - ng = 0; - for (u = 0; u < n; u ++) { - int32_t z; - - z = f[u]; - s += (uint32_t)(z * z); - ng |= s; - } - return s | -(ng >> 31); -} - -/* - * Align (upwards) the provided 'data' pointer with regards to 'base' - * so that the offset is a multiple of the size of 'fpr'. - */ -static fpr * -align_fpr(void *base, void *data) -{ - uint8_t *cb, *cd; - size_t k, km; - - cb = base; - cd = data; - k = (size_t)(cd - cb); - km = k % sizeof(fpr); - if (km) { - k += (sizeof(fpr)) - km; - } - return (fpr *)(cb + k); -} - -/* - * Align (upwards) the provided 'data' pointer with regards to 'base' - * so that the offset is a multiple of the size of 'uint32_t'. - */ -static uint32_t * -align_u32(void *base, void *data) -{ - uint8_t *cb, *cd; - size_t k, km; - - cb = base; - cd = data; - k = (size_t)(cd - cb); - km = k % sizeof(uint32_t); - if (km) { - k += (sizeof(uint32_t)) - km; - } - return (uint32_t *)(cb + k); -} - -/* - * Convert a small vector to floating point. 
- */ -static void -poly_small_to_fp(fpr *x, const int8_t *f, unsigned logn) -{ - size_t n, u; - - n = MKN(logn); - for (u = 0; u < n; u ++) { - x[u] = fpr_of(f[u]); - } -} - -/* - * Input: f,g of degree N = 2^logn; 'depth' is used only to get their - * individual length. - * - * Output: f',g' of degree N/2, with the length for 'depth+1'. - * - * Values are in RNS; input and/or output may also be in NTT. - */ -static void -make_fg_step(uint32_t *data, unsigned logn, unsigned depth, - int in_ntt, int out_ntt) -{ - size_t n, hn, u; - size_t slen, tlen; - uint32_t *fd, *gd, *fs, *gs, *gm, *igm, *t1; - const small_prime *primes; - - n = (size_t)1 << logn; - hn = n >> 1; - slen = MAX_BL_SMALL[depth]; - tlen = MAX_BL_SMALL[depth + 1]; - primes = PRIMES; - - /* - * Prepare room for the result. - */ - fd = data; - gd = fd + hn * tlen; - fs = gd + hn * tlen; - gs = fs + n * slen; - gm = gs + n * slen; - igm = gm + n; - t1 = igm + n; - memmove(fs, data, 2 * n * slen * sizeof *data); - - /* - * First slen words: we use the input values directly, and apply - * inverse NTT as we go. 
- */ - for (u = 0; u < slen; u ++) { - uint32_t p, p0i, R2; - size_t v; - uint32_t *x; - - p = primes[u].p; - p0i = modp_ninv31(p); - R2 = modp_R2(p, p0i); - modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i); - - for (v = 0, x = fs + u; v < n; v ++, x += slen) { - t1[v] = *x; - } - if (!in_ntt) { - modp_NTT2(t1, gm, logn, p, p0i); - } - for (v = 0, x = fd + u; v < hn; v ++, x += tlen) { - uint32_t w0, w1; - - w0 = t1[(v << 1) + 0]; - w1 = t1[(v << 1) + 1]; - *x = modp_montymul( - modp_montymul(w0, w1, p, p0i), R2, p, p0i); - } - if (in_ntt) { - modp_iNTT2_ext(fs + u, slen, igm, logn, p, p0i); - } - - for (v = 0, x = gs + u; v < n; v ++, x += slen) { - t1[v] = *x; - } - if (!in_ntt) { - modp_NTT2(t1, gm, logn, p, p0i); - } - for (v = 0, x = gd + u; v < hn; v ++, x += tlen) { - uint32_t w0, w1; - - w0 = t1[(v << 1) + 0]; - w1 = t1[(v << 1) + 1]; - *x = modp_montymul( - modp_montymul(w0, w1, p, p0i), R2, p, p0i); - } - if (in_ntt) { - modp_iNTT2_ext(gs + u, slen, igm, logn, p, p0i); - } - - if (!out_ntt) { - modp_iNTT2_ext(fd + u, tlen, igm, logn - 1, p, p0i); - modp_iNTT2_ext(gd + u, tlen, igm, logn - 1, p, p0i); - } - } - - /* - * Since the fs and gs words have been de-NTTized, we can use the - * CRT to rebuild the values. - */ - zint_rebuild_CRT(fs, slen, slen, n, primes, 1, gm); - zint_rebuild_CRT(gs, slen, slen, n, primes, 1, gm); - - /* - * Remaining words: use modular reductions to extract the values. 
- */ - for (u = slen; u < tlen; u ++) { - uint32_t p, p0i, R2, Rx; - size_t v; - uint32_t *x; - - p = primes[u].p; - p0i = modp_ninv31(p); - R2 = modp_R2(p, p0i); - Rx = modp_Rx((unsigned)slen, p, p0i, R2); - modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i); - for (v = 0, x = fs; v < n; v ++, x += slen) { - t1[v] = zint_mod_small_signed(x, slen, p, p0i, R2, Rx); - } - modp_NTT2(t1, gm, logn, p, p0i); - for (v = 0, x = fd + u; v < hn; v ++, x += tlen) { - uint32_t w0, w1; - - w0 = t1[(v << 1) + 0]; - w1 = t1[(v << 1) + 1]; - *x = modp_montymul( - modp_montymul(w0, w1, p, p0i), R2, p, p0i); - } - for (v = 0, x = gs; v < n; v ++, x += slen) { - t1[v] = zint_mod_small_signed(x, slen, p, p0i, R2, Rx); - } - modp_NTT2(t1, gm, logn, p, p0i); - for (v = 0, x = gd + u; v < hn; v ++, x += tlen) { - uint32_t w0, w1; - - w0 = t1[(v << 1) + 0]; - w1 = t1[(v << 1) + 1]; - *x = modp_montymul( - modp_montymul(w0, w1, p, p0i), R2, p, p0i); - } - - if (!out_ntt) { - modp_iNTT2_ext(fd + u, tlen, igm, logn - 1, p, p0i); - modp_iNTT2_ext(gd + u, tlen, igm, logn - 1, p, p0i); - } - } -} - -/* - * Compute f and g at a specific depth, in RNS notation. - * - * Returned values are stored in the data[] array, at slen words per integer. - * - * Conditions: - * 0 <= depth <= logn - * - * Space use in data[]: enough room for any two successive values (f', g', - * f and g). 
- */ -static void -make_fg(uint32_t *data, const int8_t *f, const int8_t *g, - unsigned logn, unsigned depth, int out_ntt) -{ - size_t n, u; - uint32_t *ft, *gt, p0; - unsigned d; - const small_prime *primes; - - n = MKN(logn); - ft = data; - gt = ft + n; - primes = PRIMES; - p0 = primes[0].p; - for (u = 0; u < n; u ++) { - ft[u] = modp_set(f[u], p0); - gt[u] = modp_set(g[u], p0); - } - - if (depth == 0 && out_ntt) { - uint32_t *gm, *igm; - uint32_t p, p0i; - - p = primes[0].p; - p0i = modp_ninv31(p); - gm = gt + n; - igm = gm + MKN(logn); - modp_mkgm2(gm, igm, logn, primes[0].g, p, p0i); - modp_NTT2(ft, gm, logn, p, p0i); - modp_NTT2(gt, gm, logn, p, p0i); - return; - } - - for (d = 0; d < depth; d ++) { - make_fg_step(data, logn - d, d, - d != 0, (d + 1) < depth || out_ntt); - } -} - -/* - * Solving the NTRU equation, deepest level: compute the resultants of - * f and g with X^N+1, and use binary GCD. The F and G values are - * returned in tmp[]. - * - * Returned value: 1 on success, 0 on error. - */ -static int -solve_NTRU_deepest(unsigned logn_top, - const int8_t *f, const int8_t *g, uint32_t *tmp) -{ - size_t len; - uint32_t *Fp, *Gp, *fp, *gp, *t1, q; - const small_prime *primes; - - len = MAX_BL_SMALL[logn_top]; - primes = PRIMES; - - Fp = tmp; - Gp = Fp + len; - fp = Gp + len; - gp = fp + len; - t1 = gp + len; - - make_fg(fp, f, g, logn_top, logn_top, 0); - - /* - * We use the CRT to rebuild the resultants as big integers. - * There are two such big integers. The resultants are always - * nonnegative. - */ - zint_rebuild_CRT(fp, len, len, 2, primes, 0, t1); - - /* - * Apply the binary GCD. The zint_bezout() function works only - * if both inputs are odd. - * - * We can test on the result and return 0 because that would - * imply failure of the NTRU solving equation, and the (f,g) - * values will be abandoned in that case. - */ - if (!zint_bezout(Gp, Fp, fp, gp, len, t1)) { - return 0; - } - - /* - * Multiply the two values by the target value q. 
Values must - * fit in the destination arrays. - * We can again test on the returned words: a non-zero output - * of zint_mul_small() means that we exceeded our array - * capacity, and that implies failure and rejection of (f,g). - */ - q = 12289; - if (zint_mul_small(Fp, len, q) != 0 - || zint_mul_small(Gp, len, q) != 0) - { - return 0; - } - - return 1; -} - -/* - * Solving the NTRU equation, intermediate level. Upon entry, the F and G - * from the previous level should be in the tmp[] array. - * This function MAY be invoked for the top-level (in which case depth = 0). - * - * Returned value: 1 on success, 0 on error. - */ -static int -solve_NTRU_intermediate(unsigned logn_top, - const int8_t *f, const int8_t *g, unsigned depth, uint32_t *tmp) -{ - /* - * In this function, 'logn' is the log2 of the degree for - * this step. If N = 2^logn, then: - * - the F and G values already in fk->tmp (from the deeper - * levels) have degree N/2; - * - this function should return F and G of degree N. - */ - unsigned logn; - size_t n, hn, slen, dlen, llen, rlen, FGlen, u; - uint32_t *Fd, *Gd, *Ft, *Gt, *ft, *gt, *t1; - fpr *rt1, *rt2, *rt3, *rt4, *rt5; - int scale_fg, minbl_fg, maxbl_fg, maxbl_FG, scale_k; - uint32_t *x, *y; - int32_t *k; - const small_prime *primes; - - logn = logn_top - depth; - n = (size_t)1 << logn; - hn = n >> 1; - - /* - * slen = size for our input f and g; also size of the reduced - * F and G we return (degree N) - * - * dlen = size of the F and G obtained from the deeper level - * (degree N/2 or N/3) - * - * llen = size for intermediary F and G before reduction (degree N) - * - * We build our non-reduced F and G as two independent halves each, - * of degree N/2 (F = F0 + X*F1, G = G0 + X*G1). - */ - slen = MAX_BL_SMALL[depth]; - dlen = MAX_BL_SMALL[depth + 1]; - llen = MAX_BL_LARGE[depth]; - primes = PRIMES; - - /* - * Fd and Gd are the F and G from the deeper level. 
- */ - Fd = tmp; - Gd = Fd + dlen * hn; - - /* - * Compute the input f and g for this level. Note that we get f - * and g in RNS + NTT representation. - */ - ft = Gd + dlen * hn; - make_fg(ft, f, g, logn_top, depth, 1); - - /* - * Move the newly computed f and g to make room for our candidate - * F and G (unreduced). - */ - Ft = tmp; - Gt = Ft + n * llen; - t1 = Gt + n * llen; - memmove(t1, ft, 2 * n * slen * sizeof *ft); - ft = t1; - gt = ft + slen * n; - t1 = gt + slen * n; - - /* - * Move Fd and Gd _after_ f and g. - */ - memmove(t1, Fd, 2 * hn * dlen * sizeof *Fd); - Fd = t1; - Gd = Fd + hn * dlen; - - /* - * We reduce Fd and Gd modulo all the small primes we will need, - * and store the values in Ft and Gt (only n/2 values in each). - */ - for (u = 0; u < llen; u ++) { - uint32_t p, p0i, R2, Rx; - size_t v; - uint32_t *xs, *ys, *xd, *yd; - - p = primes[u].p; - p0i = modp_ninv31(p); - R2 = modp_R2(p, p0i); - Rx = modp_Rx((unsigned)dlen, p, p0i, R2); - for (v = 0, xs = Fd, ys = Gd, xd = Ft + u, yd = Gt + u; - v < hn; - v ++, xs += dlen, ys += dlen, xd += llen, yd += llen) - { - *xd = zint_mod_small_signed(xs, dlen, p, p0i, R2, Rx); - *yd = zint_mod_small_signed(ys, dlen, p, p0i, R2, Rx); - } - } - - /* - * We do not need Fd and Gd after that point. - */ - - /* - * Compute our F and G modulo sufficiently many small primes. - */ - for (u = 0; u < llen; u ++) { - uint32_t p, p0i, R2; - uint32_t *gm, *igm, *fx, *gx, *Fp, *Gp; - size_t v; - - /* - * All computations are done modulo p. - */ - p = primes[u].p; - p0i = modp_ninv31(p); - R2 = modp_R2(p, p0i); - - /* - * If we processed slen words, then f and g have been - * de-NTTized, and are in RNS; we can rebuild them. 
- */ - if (u == slen) { - zint_rebuild_CRT(ft, slen, slen, n, primes, 1, t1); - zint_rebuild_CRT(gt, slen, slen, n, primes, 1, t1); - } - - gm = t1; - igm = gm + n; - fx = igm + n; - gx = fx + n; - - modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i); - - if (u < slen) { - for (v = 0, x = ft + u, y = gt + u; - v < n; v ++, x += slen, y += slen) - { - fx[v] = *x; - gx[v] = *y; - } - modp_iNTT2_ext(ft + u, slen, igm, logn, p, p0i); - modp_iNTT2_ext(gt + u, slen, igm, logn, p, p0i); - } else { - uint32_t Rx; - - Rx = modp_Rx((unsigned)slen, p, p0i, R2); - for (v = 0, x = ft, y = gt; - v < n; v ++, x += slen, y += slen) - { - fx[v] = zint_mod_small_signed(x, slen, - p, p0i, R2, Rx); - gx[v] = zint_mod_small_signed(y, slen, - p, p0i, R2, Rx); - } - modp_NTT2(fx, gm, logn, p, p0i); - modp_NTT2(gx, gm, logn, p, p0i); - } - - /* - * Get F' and G' modulo p and in NTT representation - * (they have degree n/2). These values were computed in - * a previous step, and stored in Ft and Gt. - */ - Fp = gx + n; - Gp = Fp + hn; - for (v = 0, x = Ft + u, y = Gt + u; - v < hn; v ++, x += llen, y += llen) - { - Fp[v] = *x; - Gp[v] = *y; - } - modp_NTT2(Fp, gm, logn - 1, p, p0i); - modp_NTT2(Gp, gm, logn - 1, p, p0i); - - /* - * Compute our F and G modulo p. - * - * General case: - * - * we divide degree by d = 2 or 3 - * f'(x^d) = N(f)(x^d) = f * adj(f) - * g'(x^d) = N(g)(x^d) = g * adj(g) - * f'*G' - g'*F' = q - * F = F'(x^d) * adj(g) - * G = G'(x^d) * adj(f) - * - * We compute things in the NTT. We group roots of phi - * such that all roots x in a group share the same x^d. - * If the roots in a group are x_1, x_2... x_d, then: - * - * N(f)(x_1^d) = f(x_1)*f(x_2)*...*f(x_d) - * - * Thus, we have: - * - * G(x_1) = f(x_2)*f(x_3)*...*f(x_d)*G'(x_1^d) - * G(x_2) = f(x_1)*f(x_3)*...*f(x_d)*G'(x_1^d) - * ... - * G(x_d) = f(x_1)*f(x_2)*...*f(x_{d-1})*G'(x_1^d) - * - * In all cases, we can thus compute F and G in NTT - * representation by a few simple multiplications. 
- * Moreover, in our chosen NTT representation, roots - * from the same group are consecutive in RAM. - */ - for (v = 0, x = Ft + u, y = Gt + u; v < hn; - v ++, x += (llen << 1), y += (llen << 1)) - { - uint32_t ftA, ftB, gtA, gtB; - uint32_t mFp, mGp; - - ftA = fx[(v << 1) + 0]; - ftB = fx[(v << 1) + 1]; - gtA = gx[(v << 1) + 0]; - gtB = gx[(v << 1) + 1]; - mFp = modp_montymul(Fp[v], R2, p, p0i); - mGp = modp_montymul(Gp[v], R2, p, p0i); - x[0] = modp_montymul(gtB, mFp, p, p0i); - x[llen] = modp_montymul(gtA, mFp, p, p0i); - y[0] = modp_montymul(ftB, mGp, p, p0i); - y[llen] = modp_montymul(ftA, mGp, p, p0i); - } - modp_iNTT2_ext(Ft + u, llen, igm, logn, p, p0i); - modp_iNTT2_ext(Gt + u, llen, igm, logn, p, p0i); - } - - /* - * Rebuild F and G with the CRT. - */ - zint_rebuild_CRT(Ft, llen, llen, n, primes, 1, t1); - zint_rebuild_CRT(Gt, llen, llen, n, primes, 1, t1); - - /* - * At that point, Ft, Gt, ft and gt are consecutive in RAM (in that - * order). - */ - - /* - * Apply Babai reduction to bring back F and G to size slen. - * - * We use the FFT to compute successive approximations of the - * reduction coefficient. We first isolate the top bits of - * the coefficients of f and g, and convert them to floating - * point; with the FFT, we compute adj(f), adj(g), and - * 1/(f*adj(f)+g*adj(g)). - * - * Then, we repeatedly apply the following: - * - * - Get the top bits of the coefficients of F and G into - * floating point, and use the FFT to compute: - * (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g)) - * - * - Convert back that value into normal representation, and - * round it to the nearest integers, yielding a polynomial k. - * Proper scaling is applied to f, g, F and G so that the - * coefficients fit on 32 bits (signed). - * - * - Subtract k*f from F and k*g from G. - * - * Under normal conditions, this process reduces the size of F - * and G by some bits at each iteration. 
For constant-time - * operation, we do not want to measure the actual length of - * F and G; instead, we do the following: - * - * - f and g are converted to floating-point, with some scaling - * if necessary to keep values in the representable range. - * - * - For each iteration, we _assume_ a maximum size for F and G, - * and use the values at that size. If we overreach, then - * we get zeros, which is harmless: the resulting coefficients - * of k will be 0 and the value won't be reduced. - * - * - We conservatively assume that F and G will be reduced by - * at least 25 bits at each iteration. - * - * Even when reaching the bottom of the reduction, reduction - * coefficient will remain low. If it goes out-of-range, then - * something wrong occurred and the whole NTRU solving fails. - */ - - /* - * Memory layout: - * - We need to compute and keep adj(f), adj(g), and - * 1/(f*adj(f)+g*adj(g)) (sizes N, N and N/2 fp numbers, - * respectively). - * - At each iteration we need two extra fp buffer (N fp values), - * and produce a k (N 32-bit words). k will be shared with one - * of the fp buffers. - * - To compute k*f and k*g efficiently (with the NTT), we need - * some extra room; we reuse the space of the temporary buffers. - * - * Arrays of 'fpr' are obtained from the temporary array itself. - * We ensure that the base is at a properly aligned offset (the - * source array tmp[] is supposed to be already aligned). - */ - - rt3 = align_fpr(tmp, t1); - rt4 = rt3 + n; - rt5 = rt4 + n; - rt1 = rt5 + (n >> 1); - k = (int32_t *)align_u32(tmp, rt1); - rt2 = align_fpr(tmp, k + n); - if (rt2 < (rt1 + n)) { - rt2 = rt1 + n; - } - t1 = (uint32_t *)k + n; - - /* - * Get f and g into rt3 and rt4 as floating-point approximations. - * - * We need to "scale down" the floating-point representation of - * coefficients when they are too big. We want to keep the value - * below 2^310 or so. Thus, when values are larger than 10 words, - * we consider only the top 10 words. 
Array lengths have been - * computed so that average maximum length will fall in the - * middle or the upper half of these top 10 words. - */ - rlen = (slen > 10) ? 10 : slen; - poly_big_to_fp(rt3, ft + slen - rlen, rlen, slen, logn); - poly_big_to_fp(rt4, gt + slen - rlen, rlen, slen, logn); - - /* - * Values in rt3 and rt4 are downscaled by 2^(scale_fg). - */ - scale_fg = 31 * (int)(slen - rlen); - - /* - * Estimated boundaries for the maximum size (in bits) of the - * coefficients of (f,g). We use the measured average, and - * allow for a deviation of at most six times the standard - * deviation. - */ - minbl_fg = BITLENGTH[depth].avg - 6 * BITLENGTH[depth].std; - maxbl_fg = BITLENGTH[depth].avg + 6 * BITLENGTH[depth].std; - - /* - * Compute 1/(f*adj(f)+g*adj(g)) in rt5. We also keep adj(f) - * and adj(g) in rt3 and rt4, respectively. - */ - Zf(FFT)(rt3, logn); - Zf(FFT)(rt4, logn); - Zf(poly_invnorm2_fft)(rt5, rt3, rt4, logn); - Zf(poly_adj_fft)(rt3, logn); - Zf(poly_adj_fft)(rt4, logn); - - /* - * Reduce F and G repeatedly. - * - * The expected maximum bit length of coefficients of F and G - * is kept in maxbl_FG, with the corresponding word length in - * FGlen. - */ - FGlen = llen; - maxbl_FG = 31 * (int)llen; - - /* - * Each reduction operation computes the reduction polynomial - * "k". We need that polynomial to have coefficients that fit - * on 32-bit signed integers, with some scaling; thus, we use - * a descending sequence of scaling values, down to zero. - * - * The size of the coefficients of k is (roughly) the difference - * between the size of the coefficients of (F,G) and the size - * of the coefficients of (f,g). Thus, the maximum size of the - * coefficients of k is, at the start, maxbl_FG - minbl_fg; - * this is our starting scale value for k. - * - * We need to estimate the size of (F,G) during the execution of - * the algorithm; we are allowed some overestimation but not too - * much (poly_big_to_fp() uses a 310-bit window). 
Generally - * speaking, after applying a reduction with k scaled to - * scale_k, the size of (F,G) will be size(f,g) + scale_k + dd, - * where 'dd' is a few bits to account for the fact that the - * reduction is never perfect (intuitively, dd is on the order - * of sqrt(N), so at most 5 bits; we here allow for 10 extra - * bits). - * - * The size of (f,g) is not known exactly, but maxbl_fg is an - * upper bound. - */ - scale_k = maxbl_FG - minbl_fg; - - for (;;) { - int scale_FG, dc, new_maxbl_FG; - uint32_t scl, sch; - fpr pdc, pt; - - /* - * Convert current F and G into floating-point. We apply - * scaling if the current length is more than 10 words. - */ - rlen = (FGlen > 10) ? 10 : FGlen; - scale_FG = 31 * (int)(FGlen - rlen); - poly_big_to_fp(rt1, Ft + FGlen - rlen, rlen, llen, logn); - poly_big_to_fp(rt2, Gt + FGlen - rlen, rlen, llen, logn); - - /* - * Compute (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g)) in rt2. - */ - Zf(FFT)(rt1, logn); - Zf(FFT)(rt2, logn); - Zf(poly_mul_fft)(rt1, rt3, logn); - Zf(poly_mul_fft)(rt2, rt4, logn); - Zf(poly_add)(rt2, rt1, logn); - Zf(poly_mul_autoadj_fft)(rt2, rt5, logn); - Zf(iFFT)(rt2, logn); - - /* - * (f,g) are scaled by 'scale_fg', meaning that the - * numbers in rt3/rt4 should be multiplied by 2^(scale_fg) - * to have their true mathematical value. - * - * (F,G) are similarly scaled by 'scale_FG'. Therefore, - * the value we computed in rt2 is scaled by - * 'scale_FG-scale_fg'. - * - * We want that value to be scaled by 'scale_k', hence we - * apply a corrective scaling. After scaling, the values - * should fit in -2^31-1..+2^31-1. - */ - dc = scale_k - scale_FG + scale_fg; - - /* - * We will need to multiply values by 2^(-dc). The value - * 'dc' is not secret, so we can compute 2^(-dc) with a - * non-constant-time process. - * (We could use ldexp(), but we prefer to avoid any - * dependency on libm. When using FP emulation, we could - * use our fpr_ldexp(), which is constant-time.) 
- */ - if (dc < 0) { - dc = -dc; - pt = fpr_two; - } else { - pt = fpr_onehalf; - } - pdc = fpr_one; - while (dc != 0) { - if ((dc & 1) != 0) { - pdc = fpr_mul(pdc, pt); - } - dc >>= 1; - pt = fpr_sqr(pt); - } - - for (u = 0; u < n; u ++) { - fpr xv; - - xv = fpr_mul(rt2[u], pdc); - - /* - * Sometimes the values can be out-of-bounds if - * the algorithm fails; we must not call - * fpr_rint() (and cast to int32_t) if the value - * is not in-bounds. Note that the test does not - * break constant-time discipline, since any - * failure here implies that we discard the current - * secret key (f,g). - */ - if (!fpr_lt(fpr_mtwo31m1, xv) - || !fpr_lt(xv, fpr_ptwo31m1)) - { - return 0; - } - k[u] = (int32_t)fpr_rint(xv); - } - - /* - * Values in k[] are integers. They really are scaled - * down by maxbl_FG - minbl_fg bits. - * - * If we are at low depth, then we use the NTT to - * compute k*f and k*g. - */ - sch = (uint32_t)(scale_k / 31); - scl = (uint32_t)(scale_k % 31); - if (depth <= DEPTH_INT_FG) { - poly_sub_scaled_ntt(Ft, FGlen, llen, ft, slen, slen, - k, sch, scl, logn, t1); - poly_sub_scaled_ntt(Gt, FGlen, llen, gt, slen, slen, - k, sch, scl, logn, t1); - } else { - poly_sub_scaled(Ft, FGlen, llen, ft, slen, slen, - k, sch, scl, logn); - poly_sub_scaled(Gt, FGlen, llen, gt, slen, slen, - k, sch, scl, logn); - } - - /* - * We compute the new maximum size of (F,G), assuming that - * (f,g) has _maximal_ length (i.e. that reduction is - * "late" instead of "early". We also adjust FGlen - * accordingly. - */ - new_maxbl_FG = scale_k + maxbl_fg + 10; - if (new_maxbl_FG < maxbl_FG) { - maxbl_FG = new_maxbl_FG; - if ((int)FGlen * 31 >= maxbl_FG + 31) { - FGlen --; - } - } - - /* - * We suppose that scaling down achieves a reduction by - * at least 25 bits per iteration. We stop when we have - * done the loop with an unscaled k. 
- */ - if (scale_k <= 0) { - break; - } - scale_k -= 25; - if (scale_k < 0) { - scale_k = 0; - } - } - - /* - * If (F,G) length was lowered below 'slen', then we must take - * care to re-extend the sign. - */ - if (FGlen < slen) { - for (u = 0; u < n; u ++, Ft += llen, Gt += llen) { - size_t v; - uint32_t sw; - - sw = -(Ft[FGlen - 1] >> 30) >> 1; - for (v = FGlen; v < slen; v ++) { - Ft[v] = sw; - } - sw = -(Gt[FGlen - 1] >> 30) >> 1; - for (v = FGlen; v < slen; v ++) { - Gt[v] = sw; - } - } - } - - /* - * Compress encoding of all values to 'slen' words (this is the - * expected output format). - */ - for (u = 0, x = tmp, y = tmp; - u < (n << 1); u ++, x += slen, y += llen) - { - memmove(x, y, slen * sizeof *y); - } - return 1; -} - -/* - * Solving the NTRU equation, binary case, depth = 1. Upon entry, the - * F and G from the previous level should be in the tmp[] array. - * - * Returned value: 1 on success, 0 on error. - */ -static int -solve_NTRU_binary_depth1(unsigned logn_top, - const int8_t *f, const int8_t *g, uint32_t *tmp) -{ - /* - * The first half of this function is a copy of the corresponding - * part in solve_NTRU_intermediate(), for the reconstruction of - * the unreduced F and G. The second half (Babai reduction) is - * done differently, because the unreduced F and G fit in 53 bits - * of precision, allowing a much simpler process with lower RAM - * usage. 
- */ - unsigned depth, logn; - size_t n_top, n, hn, slen, dlen, llen, u; - uint32_t *Fd, *Gd, *Ft, *Gt, *ft, *gt, *t1; - fpr *rt1, *rt2, *rt3, *rt4, *rt5, *rt6; - uint32_t *x, *y; - - depth = 1; - n_top = (size_t)1 << logn_top; - logn = logn_top - depth; - n = (size_t)1 << logn; - hn = n >> 1; - - /* - * Equations are: - * - * f' = f0^2 - X^2*f1^2 - * g' = g0^2 - X^2*g1^2 - * F' and G' are a solution to f'G' - g'F' = q (from deeper levels) - * F = F'*(g0 - X*g1) - * G = G'*(f0 - X*f1) - * - * f0, f1, g0, g1, f', g', F' and G' are all "compressed" to - * degree N/2 (their odd-indexed coefficients are all zero). - */ - - /* - * slen = size for our input f and g; also size of the reduced - * F and G we return (degree N) - * - * dlen = size of the F and G obtained from the deeper level - * (degree N/2) - * - * llen = size for intermediary F and G before reduction (degree N) - * - * We build our non-reduced F and G as two independent halves each, - * of degree N/2 (F = F0 + X*F1, G = G0 + X*G1). - */ - slen = MAX_BL_SMALL[depth]; - dlen = MAX_BL_SMALL[depth + 1]; - llen = MAX_BL_LARGE[depth]; - - /* - * Fd and Gd are the F and G from the deeper level. Ft and Gt - * are the destination arrays for the unreduced F and G. - */ - Fd = tmp; - Gd = Fd + dlen * hn; - Ft = Gd + dlen * hn; - Gt = Ft + llen * n; - - /* - * We reduce Fd and Gd modulo all the small primes we will need, - * and store the values in Ft and Gt. - */ - for (u = 0; u < llen; u ++) { - uint32_t p, p0i, R2, Rx; - size_t v; - uint32_t *xs, *ys, *xd, *yd; - - p = PRIMES[u].p; - p0i = modp_ninv31(p); - R2 = modp_R2(p, p0i); - Rx = modp_Rx((unsigned)dlen, p, p0i, R2); - for (v = 0, xs = Fd, ys = Gd, xd = Ft + u, yd = Gt + u; - v < hn; - v ++, xs += dlen, ys += dlen, xd += llen, yd += llen) - { - *xd = zint_mod_small_signed(xs, dlen, p, p0i, R2, Rx); - *yd = zint_mod_small_signed(ys, dlen, p, p0i, R2, Rx); - } - } - - /* - * Now Fd and Gd are not needed anymore; we can squeeze them out. 
- */ - memmove(tmp, Ft, llen * n * sizeof(uint32_t)); - Ft = tmp; - memmove(Ft + llen * n, Gt, llen * n * sizeof(uint32_t)); - Gt = Ft + llen * n; - ft = Gt + llen * n; - gt = ft + slen * n; - - t1 = gt + slen * n; - - /* - * Compute our F and G modulo sufficiently many small primes. - */ - for (u = 0; u < llen; u ++) { - uint32_t p, p0i, R2; - uint32_t *gm, *igm, *fx, *gx, *Fp, *Gp; - unsigned e; - size_t v; - - /* - * All computations are done modulo p. - */ - p = PRIMES[u].p; - p0i = modp_ninv31(p); - R2 = modp_R2(p, p0i); - - /* - * We recompute things from the source f and g, of full - * degree. However, we will need only the n first elements - * of the inverse NTT table (igm); the call to modp_mkgm() - * below will fill n_top elements in igm[] (thus overflowing - * into fx[]) but later code will overwrite these extra - * elements. - */ - gm = t1; - igm = gm + n_top; - fx = igm + n; - gx = fx + n_top; - modp_mkgm2(gm, igm, logn_top, PRIMES[u].g, p, p0i); - - /* - * Set ft and gt to f and g modulo p, respectively. - */ - for (v = 0; v < n_top; v ++) { - fx[v] = modp_set(f[v], p); - gx[v] = modp_set(g[v], p); - } - - /* - * Convert to NTT and compute our f and g. - */ - modp_NTT2(fx, gm, logn_top, p, p0i); - modp_NTT2(gx, gm, logn_top, p, p0i); - for (e = logn_top; e > logn; e --) { - modp_poly_rec_res(fx, e, p, p0i, R2); - modp_poly_rec_res(gx, e, p, p0i, R2); - } - - /* - * From that point onward, we only need tables for - * degree n, so we can save some space. - */ - if (depth > 0) { /* always true */ - memmove(gm + n, igm, n * sizeof *igm); - igm = gm + n; - memmove(igm + n, fx, n * sizeof *ft); - fx = igm + n; - memmove(fx + n, gx, n * sizeof *gt); - gx = fx + n; - } - - /* - * Get F' and G' modulo p and in NTT representation - * (they have degree n/2). These values were computed - * in a previous step, and stored in Ft and Gt. 
- */ - Fp = gx + n; - Gp = Fp + hn; - for (v = 0, x = Ft + u, y = Gt + u; - v < hn; v ++, x += llen, y += llen) - { - Fp[v] = *x; - Gp[v] = *y; - } - modp_NTT2(Fp, gm, logn - 1, p, p0i); - modp_NTT2(Gp, gm, logn - 1, p, p0i); - - /* - * Compute our F and G modulo p. - * - * Equations are: - * - * f'(x^2) = N(f)(x^2) = f * adj(f) - * g'(x^2) = N(g)(x^2) = g * adj(g) - * - * f'*G' - g'*F' = q - * - * F = F'(x^2) * adj(g) - * G = G'(x^2) * adj(f) - * - * The NTT representation of f is f(w) for all w which - * are roots of phi. In the binary case, as well as in - * the ternary case for all depth except the deepest, - * these roots can be grouped in pairs (w,-w), and we - * then have: - * - * f(w) = adj(f)(-w) - * f(-w) = adj(f)(w) - * - * and w^2 is then a root for phi at the half-degree. - * - * At the deepest level in the ternary case, this still - * holds, in the following sense: the roots of x^2-x+1 - * are (w,-w^2) (for w^3 = -1, and w != -1), and we - * have: - * - * f(w) = adj(f)(-w^2) - * f(-w^2) = adj(f)(w) - * - * In all case, we can thus compute F and G in NTT - * representation by a few simple multiplications. - * Moreover, the two roots for each pair are consecutive - * in our bit-reversal encoding. - */ - for (v = 0, x = Ft + u, y = Gt + u; - v < hn; v ++, x += (llen << 1), y += (llen << 1)) - { - uint32_t ftA, ftB, gtA, gtB; - uint32_t mFp, mGp; - - ftA = fx[(v << 1) + 0]; - ftB = fx[(v << 1) + 1]; - gtA = gx[(v << 1) + 0]; - gtB = gx[(v << 1) + 1]; - mFp = modp_montymul(Fp[v], R2, p, p0i); - mGp = modp_montymul(Gp[v], R2, p, p0i); - x[0] = modp_montymul(gtB, mFp, p, p0i); - x[llen] = modp_montymul(gtA, mFp, p, p0i); - y[0] = modp_montymul(ftB, mGp, p, p0i); - y[llen] = modp_montymul(ftA, mGp, p, p0i); - } - modp_iNTT2_ext(Ft + u, llen, igm, logn, p, p0i); - modp_iNTT2_ext(Gt + u, llen, igm, logn, p, p0i); - - /* - * Also save ft and gt (only up to size slen). 
- */ - if (u < slen) { - modp_iNTT2(fx, igm, logn, p, p0i); - modp_iNTT2(gx, igm, logn, p, p0i); - for (v = 0, x = ft + u, y = gt + u; - v < n; v ++, x += slen, y += slen) - { - *x = fx[v]; - *y = gx[v]; - } - } - } - - /* - * Rebuild f, g, F and G with the CRT. Note that the elements of F - * and G are consecutive, and thus can be rebuilt in a single - * loop; similarly, the elements of f and g are consecutive. - */ - zint_rebuild_CRT(Ft, llen, llen, n << 1, PRIMES, 1, t1); - zint_rebuild_CRT(ft, slen, slen, n << 1, PRIMES, 1, t1); - - /* - * Here starts the Babai reduction, specialized for depth = 1. - * - * Candidates F and G (from Ft and Gt), and base f and g (ft and gt), - * are converted to floating point. There is no scaling, and a - * single pass is sufficient. - */ - - /* - * Convert F and G into floating point (rt1 and rt2). - */ - rt1 = align_fpr(tmp, gt + slen * n); - rt2 = rt1 + n; - poly_big_to_fp(rt1, Ft, llen, llen, logn); - poly_big_to_fp(rt2, Gt, llen, llen, logn); - - /* - * Integer representation of F and G is no longer needed, we - * can remove it. - */ - memmove(tmp, ft, 2 * slen * n * sizeof *ft); - ft = tmp; - gt = ft + slen * n; - rt3 = align_fpr(tmp, gt + slen * n); - memmove(rt3, rt1, 2 * n * sizeof *rt1); - rt1 = rt3; - rt2 = rt1 + n; - rt3 = rt2 + n; - rt4 = rt3 + n; - - /* - * Convert f and g into floating point (rt3 and rt4). - */ - poly_big_to_fp(rt3, ft, slen, slen, logn); - poly_big_to_fp(rt4, gt, slen, slen, logn); - - /* - * Remove unneeded ft and gt. - */ - memmove(tmp, rt1, 4 * n * sizeof *rt1); - rt1 = (fpr *)tmp; - rt2 = rt1 + n; - rt3 = rt2 + n; - rt4 = rt3 + n; - - /* - * We now have: - * rt1 = F - * rt2 = G - * rt3 = f - * rt4 = g - * in that order in RAM. We convert all of them to FFT. - */ - Zf(FFT)(rt1, logn); - Zf(FFT)(rt2, logn); - Zf(FFT)(rt3, logn); - Zf(FFT)(rt4, logn); - - /* - * Compute: - * rt5 = F*adj(f) + G*adj(g) - * rt6 = 1 / (f*adj(f) + g*adj(g)) - * (Note that rt6 is half-length.) 
- */ - rt5 = rt4 + n; - rt6 = rt5 + n; - Zf(poly_add_muladj_fft)(rt5, rt1, rt2, rt3, rt4, logn); - Zf(poly_invnorm2_fft)(rt6, rt3, rt4, logn); - - /* - * Compute: - * rt5 = (F*adj(f)+G*adj(g)) / (f*adj(f)+g*adj(g)) - */ - Zf(poly_mul_autoadj_fft)(rt5, rt6, logn); - - /* - * Compute k as the rounded version of rt5. Check that none of - * the values is larger than 2^63-1 (in absolute value) - * because that would make the fpr_rint() do something undefined; - * note that any out-of-bounds value here implies a failure and - * (f,g) will be discarded, so we can make a simple test. - */ - Zf(iFFT)(rt5, logn); - for (u = 0; u < n; u ++) { - fpr z; - - z = rt5[u]; - if (!fpr_lt(z, fpr_ptwo63m1) || !fpr_lt(fpr_mtwo63m1, z)) { - return 0; - } - rt5[u] = fpr_of(fpr_rint(z)); - } - Zf(FFT)(rt5, logn); - - /* - * Subtract k*f from F, and k*g from G. - */ - Zf(poly_mul_fft)(rt3, rt5, logn); - Zf(poly_mul_fft)(rt4, rt5, logn); - Zf(poly_sub)(rt1, rt3, logn); - Zf(poly_sub)(rt2, rt4, logn); - Zf(iFFT)(rt1, logn); - Zf(iFFT)(rt2, logn); - - /* - * Convert back F and G to integers, and return. - */ - Ft = tmp; - Gt = Ft + n; - rt3 = align_fpr(tmp, Gt + n); - memmove(rt3, rt1, 2 * n * sizeof *rt1); - rt1 = rt3; - rt2 = rt1 + n; - for (u = 0; u < n; u ++) { - Ft[u] = (uint32_t)fpr_rint(rt1[u]); - Gt[u] = (uint32_t)fpr_rint(rt2[u]); - } - - return 1; -} - -/* - * Solving the NTRU equation, top level. Upon entry, the F and G - * from the previous level should be in the tmp[] array. - * - * Returned value: 1 on success, 0 on error. 
- */ -static int -solve_NTRU_binary_depth0(unsigned logn, - const int8_t *f, const int8_t *g, uint32_t *tmp) -{ - size_t n, hn, u; - uint32_t p, p0i, R2; - uint32_t *Fp, *Gp, *t1, *t2, *t3, *t4, *t5; - uint32_t *gm, *igm, *ft, *gt; - fpr *rt2, *rt3; - - n = (size_t)1 << logn; - hn = n >> 1; - - /* - * Equations are: - * - * f' = f0^2 - X^2*f1^2 - * g' = g0^2 - X^2*g1^2 - * F' and G' are a solution to f'G' - g'F' = q (from deeper levels) - * F = F'*(g0 - X*g1) - * G = G'*(f0 - X*f1) - * - * f0, f1, g0, g1, f', g', F' and G' are all "compressed" to - * degree N/2 (their odd-indexed coefficients are all zero). - * - * Everything should fit in 31-bit integers, hence we can just use - * the first small prime p = 2147473409. - */ - p = PRIMES[0].p; - p0i = modp_ninv31(p); - R2 = modp_R2(p, p0i); - - Fp = tmp; - Gp = Fp + hn; - ft = Gp + hn; - gt = ft + n; - gm = gt + n; - igm = gm + n; - - modp_mkgm2(gm, igm, logn, PRIMES[0].g, p, p0i); - - /* - * Convert F' anf G' in NTT representation. - */ - for (u = 0; u < hn; u ++) { - Fp[u] = modp_set(zint_one_to_plain(Fp + u), p); - Gp[u] = modp_set(zint_one_to_plain(Gp + u), p); - } - modp_NTT2(Fp, gm, logn - 1, p, p0i); - modp_NTT2(Gp, gm, logn - 1, p, p0i); - - /* - * Load f and g and convert them to NTT representation. - */ - for (u = 0; u < n; u ++) { - ft[u] = modp_set(f[u], p); - gt[u] = modp_set(g[u], p); - } - modp_NTT2(ft, gm, logn, p, p0i); - modp_NTT2(gt, gm, logn, p, p0i); - - /* - * Build the unreduced F,G in ft and gt. 
- */ - for (u = 0; u < n; u += 2) { - uint32_t ftA, ftB, gtA, gtB; - uint32_t mFp, mGp; - - ftA = ft[u + 0]; - ftB = ft[u + 1]; - gtA = gt[u + 0]; - gtB = gt[u + 1]; - mFp = modp_montymul(Fp[u >> 1], R2, p, p0i); - mGp = modp_montymul(Gp[u >> 1], R2, p, p0i); - ft[u + 0] = modp_montymul(gtB, mFp, p, p0i); - ft[u + 1] = modp_montymul(gtA, mFp, p, p0i); - gt[u + 0] = modp_montymul(ftB, mGp, p, p0i); - gt[u + 1] = modp_montymul(ftA, mGp, p, p0i); - } - modp_iNTT2(ft, igm, logn, p, p0i); - modp_iNTT2(gt, igm, logn, p, p0i); - - Gp = Fp + n; - t1 = Gp + n; - memmove(Fp, ft, 2 * n * sizeof *ft); - - /* - * We now need to apply the Babai reduction. At that point, - * we have F and G in two n-word arrays. - * - * We can compute F*adj(f)+G*adj(g) and f*adj(f)+g*adj(g) - * modulo p, using the NTT. We still move memory around in - * order to save RAM. - */ - t2 = t1 + n; - t3 = t2 + n; - t4 = t3 + n; - t5 = t4 + n; - - /* - * Compute the NTT tables in t1 and t2. We do not keep t2 - * (we'll recompute it later on). - */ - modp_mkgm2(t1, t2, logn, PRIMES[0].g, p, p0i); - - /* - * Convert F and G to NTT. - */ - modp_NTT2(Fp, t1, logn, p, p0i); - modp_NTT2(Gp, t1, logn, p, p0i); - - /* - * Load f and adj(f) in t4 and t5, and convert them to NTT - * representation. - */ - t4[0] = t5[0] = modp_set(f[0], p); - for (u = 1; u < n; u ++) { - t4[u] = modp_set(f[u], p); - t5[n - u] = modp_set(-f[u], p); - } - modp_NTT2(t4, t1, logn, p, p0i); - modp_NTT2(t5, t1, logn, p, p0i); - - /* - * Compute F*adj(f) in t2, and f*adj(f) in t3. - */ - for (u = 0; u < n; u ++) { - uint32_t w; - - w = modp_montymul(t5[u], R2, p, p0i); - t2[u] = modp_montymul(w, Fp[u], p, p0i); - t3[u] = modp_montymul(w, t4[u], p, p0i); - } - - /* - * Load g and adj(g) in t4 and t5, and convert them to NTT - * representation. 
- */ - t4[0] = t5[0] = modp_set(g[0], p); - for (u = 1; u < n; u ++) { - t4[u] = modp_set(g[u], p); - t5[n - u] = modp_set(-g[u], p); - } - modp_NTT2(t4, t1, logn, p, p0i); - modp_NTT2(t5, t1, logn, p, p0i); - - /* - * Add G*adj(g) to t2, and g*adj(g) to t3. - */ - for (u = 0; u < n; u ++) { - uint32_t w; - - w = modp_montymul(t5[u], R2, p, p0i); - t2[u] = modp_add(t2[u], - modp_montymul(w, Gp[u], p, p0i), p); - t3[u] = modp_add(t3[u], - modp_montymul(w, t4[u], p, p0i), p); - } - - /* - * Convert back t2 and t3 to normal representation (normalized - * around 0), and then - * move them to t1 and t2. We first need to recompute the - * inverse table for NTT. - */ - modp_mkgm2(t1, t4, logn, PRIMES[0].g, p, p0i); - modp_iNTT2(t2, t4, logn, p, p0i); - modp_iNTT2(t3, t4, logn, p, p0i); - for (u = 0; u < n; u ++) { - t1[u] = (uint32_t)modp_norm(t2[u], p); - t2[u] = (uint32_t)modp_norm(t3[u], p); - } - - /* - * At that point, array contents are: - * - * F (NTT representation) (Fp) - * G (NTT representation) (Gp) - * F*adj(f)+G*adj(g) (t1) - * f*adj(f)+g*adj(g) (t2) - * - * We want to divide t1 by t2. The result is not integral; it - * must be rounded. We thus need to use the FFT. - */ - - /* - * Get f*adj(f)+g*adj(g) in FFT representation. Since this - * polynomial is auto-adjoint, all its coordinates in FFT - * representation are actually real, so we can truncate off - * the imaginary parts. - */ - rt3 = align_fpr(tmp, t3); - for (u = 0; u < n; u ++) { - rt3[u] = fpr_of(((int32_t *)t2)[u]); - } - Zf(FFT)(rt3, logn); - rt2 = align_fpr(tmp, t2); - memmove(rt2, rt3, hn * sizeof *rt3); - - /* - * Convert F*adj(f)+G*adj(g) in FFT representation. - */ - rt3 = rt2 + hn; - for (u = 0; u < n; u ++) { - rt3[u] = fpr_of(((int32_t *)t1)[u]); - } - Zf(FFT)(rt3, logn); - - /* - * Compute (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g)) and get - * its rounded normal representation in t1. 
- */ - Zf(poly_div_autoadj_fft)(rt3, rt2, logn); - Zf(iFFT)(rt3, logn); - for (u = 0; u < n; u ++) { - t1[u] = modp_set((int32_t)fpr_rint(rt3[u]), p); - } - - /* - * RAM contents are now: - * - * F (NTT representation) (Fp) - * G (NTT representation) (Gp) - * k (t1) - * - * We want to compute F-k*f, and G-k*g. - */ - t2 = t1 + n; - t3 = t2 + n; - t4 = t3 + n; - t5 = t4 + n; - modp_mkgm2(t2, t3, logn, PRIMES[0].g, p, p0i); - for (u = 0; u < n; u ++) { - t4[u] = modp_set(f[u], p); - t5[u] = modp_set(g[u], p); - } - modp_NTT2(t1, t2, logn, p, p0i); - modp_NTT2(t4, t2, logn, p, p0i); - modp_NTT2(t5, t2, logn, p, p0i); - for (u = 0; u < n; u ++) { - uint32_t kw; - - kw = modp_montymul(t1[u], R2, p, p0i); - Fp[u] = modp_sub(Fp[u], - modp_montymul(kw, t4[u], p, p0i), p); - Gp[u] = modp_sub(Gp[u], - modp_montymul(kw, t5[u], p, p0i), p); - } - modp_iNTT2(Fp, t3, logn, p, p0i); - modp_iNTT2(Gp, t3, logn, p, p0i); - for (u = 0; u < n; u ++) { - Fp[u] = (uint32_t)modp_norm(Fp[u], p); - Gp[u] = (uint32_t)modp_norm(Gp[u], p); - } - - return 1; -} - -/* - * Solve the NTRU equation. Returned value is 1 on success, 0 on error. - * G can be NULL, in which case that value is computed but not returned. - * If any of the coefficients of F and G exceeds lim (in absolute value), - * then 0 is returned. - */ -static int -solve_NTRU(unsigned logn, int8_t *F, int8_t *G, - const int8_t *f, const int8_t *g, int lim, uint32_t *tmp) -{ - size_t n, u; - uint32_t *ft, *gt, *Ft, *Gt, *gm; - uint32_t p, p0i, r; - const small_prime *primes; - - n = MKN(logn); - - if (!solve_NTRU_deepest(logn, f, g, tmp)) { - return 0; - } - - /* - * For logn <= 2, we need to use solve_NTRU_intermediate() - * directly, because coefficients are a bit too large and - * do not fit the hypotheses in solve_NTRU_binary_depth0(). 
- */ - if (logn <= 2) { - unsigned depth; - - depth = logn; - while (depth -- > 0) { - if (!solve_NTRU_intermediate(logn, f, g, depth, tmp)) { - return 0; - } - } - } else { - unsigned depth; - - depth = logn; - while (depth -- > 2) { - if (!solve_NTRU_intermediate(logn, f, g, depth, tmp)) { - return 0; - } - } - if (!solve_NTRU_binary_depth1(logn, f, g, tmp)) { - return 0; - } - if (!solve_NTRU_binary_depth0(logn, f, g, tmp)) { - return 0; - } - } - - /* - * If no buffer has been provided for G, use a temporary one. - */ - if (G == NULL) { - G = (int8_t *)(tmp + 2 * n); - } - - /* - * Final F and G are in fk->tmp, one word per coefficient - * (signed value over 31 bits). - */ - if (!poly_big_to_small(F, tmp, lim, logn) - || !poly_big_to_small(G, tmp + n, lim, logn)) - { - return 0; - } - - /* - * Verify that the NTRU equation is fulfilled. Since all elements - * have short lengths, verifying modulo a small prime p works, and - * allows using the NTT. - * - * We put Gt[] first in tmp[], and process it first, so that it does - * not overlap with G[] in case we allocated it ourselves. - */ - Gt = tmp; - ft = Gt + n; - gt = ft + n; - Ft = gt + n; - gm = Ft + n; - - primes = PRIMES; - p = primes[0].p; - p0i = modp_ninv31(p); - modp_mkgm2(gm, tmp, logn, primes[0].g, p, p0i); - for (u = 0; u < n; u ++) { - Gt[u] = modp_set(G[u], p); - } - for (u = 0; u < n; u ++) { - ft[u] = modp_set(f[u], p); - gt[u] = modp_set(g[u], p); - Ft[u] = modp_set(F[u], p); - } - modp_NTT2(ft, gm, logn, p, p0i); - modp_NTT2(gt, gm, logn, p, p0i); - modp_NTT2(Ft, gm, logn, p, p0i); - modp_NTT2(Gt, gm, logn, p, p0i); - r = modp_montymul(12289, 1, p, p0i); - for (u = 0; u < n; u ++) { - uint32_t z; - - z = modp_sub(modp_montymul(ft[u], Gt[u], p, p0i), - modp_montymul(gt[u], Ft[u], p, p0i), p); - if (z != r) { - return 0; - } - } - - return 1; -} - -/* - * Generate a random polynomial with a Gaussian distribution. 
This function - * also makes sure that the resultant of the polynomial with phi is odd. - */ -static void -poly_small_mkgauss(RNG_CONTEXT *rng, int8_t *f, unsigned logn) -{ - size_t n, u; - unsigned mod2; - - n = MKN(logn); - mod2 = 0; - for (u = 0; u < n; u ++) { - int s; - - restart: - s = mkgauss(rng, logn); - - /* - * We need the coefficient to fit within -127..+127; - * realistically, this is always the case except for - * the very low degrees (N = 2 or 4), for which there - * is no real security anyway. - */ - if (s < -127 || s > 127) { - goto restart; - } - - /* - * We need the sum of all coefficients to be 1; otherwise, - * the resultant of the polynomial with X^N+1 will be even, - * and the binary GCD will fail. - */ - if (u == n - 1) { - if ((mod2 ^ (unsigned)(s & 1)) == 0) { - goto restart; - } - } else { - mod2 ^= (unsigned)(s & 1); - } - f[u] = (int8_t)s; - } -} - -/* see falcon.h */ -void -Zf(keygen)(inner_shake256_context *rng, - int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h, - unsigned logn, uint8_t *tmp) -{ - /* - * Algorithm is the following: - * - * - Generate f and g with the Gaussian distribution. - * - * - If either Res(f,phi) or Res(g,phi) is even, try again. - * - * - If ||(f,g)|| is too large, try again. - * - * - If ||B~_{f,g}|| is too large, try again. - * - * - If f is not invertible mod phi mod q, try again. - * - * - Compute h = g/f mod phi mod q. - * - * - Solve the NTRU equation fG - gF = q; if the solving fails, - * try again. Usual failure condition is when Res(f,phi) - * and Res(g,phi) are not prime to each other. 
- */ - size_t n, u; - uint16_t *h2, *tmp2; - RNG_CONTEXT *rc; -#if FALCON_KG_CHACHA20 // yyyKG_CHACHA20+1 - prng p; -#endif // yyyKG_CHACHA20- - - n = MKN(logn); -#if FALCON_KG_CHACHA20 // yyyKG_CHACHA20+1 - Zf(prng_init)(&p, rng); - rc = &p; -#else // yyyKG_CHACHA20+0 - rc = rng; -#endif // yyyKG_CHACHA20- - - /* - * We need to generate f and g randomly, until we find values - * such that the norm of (g,-f), and of the orthogonalized - * vector, are satisfying. The orthogonalized vector is: - * (q*adj(f)/(f*adj(f)+g*adj(g)), q*adj(g)/(f*adj(f)+g*adj(g))) - * (it is actually the (N+1)-th row of the Gram-Schmidt basis). - * - * In the binary case, coefficients of f and g are generated - * independently of each other, with a discrete Gaussian - * distribution of standard deviation 1.17*sqrt(q/(2*N)). Then, - * the two vectors have expected norm 1.17*sqrt(q), which is - * also our acceptance bound: we require both vectors to be no - * larger than that (this will be satisfied about 1/4th of the - * time, thus we expect sampling new (f,g) about 4 times for that - * step). - * - * We require that Res(f,phi) and Res(g,phi) are both odd (the - * NTRU equation solver requires it). - */ - for (;;) { - fpr *rt1, *rt2, *rt3; - fpr bnorm; - uint32_t normf, normg, norm; - int lim; - - /* - * The poly_small_mkgauss() function makes sure - * that the sum of coefficients is 1 modulo 2 - * (i.e. the resultant of the polynomial with phi - * will be odd). - */ - poly_small_mkgauss(rc, f, logn); - poly_small_mkgauss(rc, g, logn); - - /* - * Verify that all coefficients are within the bounds - * defined in max_fg_bits. This is the case with - * overwhelming probability; this guarantees that the - * key will be encodable with FALCON_COMP_TRIM. - */ - lim = 1 << (Zf(max_fg_bits)[logn] - 1); - for (u = 0; u < n; u ++) { - /* - * We can use non-CT tests since on any failure - * we will discard f and g. 
- */ - if (f[u] >= lim || f[u] <= -lim - || g[u] >= lim || g[u] <= -lim) - { - lim = -1; - break; - } - } - if (lim < 0) { - continue; - } - - /* - * Bound is 1.17*sqrt(q). We compute the squared - * norms. With q = 12289, the squared bound is: - * (1.17^2)* 12289 = 16822.4121 - * Since f and g are integral, the squared norm - * of (g,-f) is an integer. - */ - normf = poly_small_sqnorm(f, logn); - normg = poly_small_sqnorm(g, logn); - norm = (normf + normg) | -((normf | normg) >> 31); - if (norm >= 16823) { - continue; - } - - /* - * We compute the orthogonalized vector norm. - */ - rt1 = (fpr *)tmp; - rt2 = rt1 + n; - rt3 = rt2 + n; - poly_small_to_fp(rt1, f, logn); - poly_small_to_fp(rt2, g, logn); - Zf(FFT)(rt1, logn); - Zf(FFT)(rt2, logn); - Zf(poly_invnorm2_fft)(rt3, rt1, rt2, logn); - Zf(poly_adj_fft)(rt1, logn); - Zf(poly_adj_fft)(rt2, logn); - Zf(poly_mulconst)(rt1, fpr_q, logn); - Zf(poly_mulconst)(rt2, fpr_q, logn); - Zf(poly_mul_autoadj_fft)(rt1, rt3, logn); - Zf(poly_mul_autoadj_fft)(rt2, rt3, logn); - Zf(iFFT)(rt1, logn); - Zf(iFFT)(rt2, logn); - bnorm = fpr_zero; - for (u = 0; u < n; u ++) { - bnorm = fpr_add(bnorm, fpr_sqr(rt1[u])); - bnorm = fpr_add(bnorm, fpr_sqr(rt2[u])); - } - if (!fpr_lt(bnorm, fpr_bnorm_max)) { - continue; - } - - /* - * Compute public key h = g/f mod X^N+1 mod q. If this - * fails, we must restart. - */ - if (h == NULL) { - h2 = (uint16_t *)tmp; - tmp2 = h2 + n; - } else { - h2 = h; - tmp2 = (uint16_t *)tmp; - } - if (!Zf(compute_public)(h2, f, g, logn, (uint8_t *)tmp2)) { - continue; - } - - /* - * Solve the NTRU equation to get F and G. - */ - lim = (1 << (Zf(max_FG_bits)[logn] - 1)) - 1; - if (!solve_NTRU(logn, F, G, f, g, lim, (uint32_t *)tmp)) { - continue; - } - - /* - * Key pair is generated. 
- */ - break; - } -} diff --git a/crypto_sign/falcon-512/m4-ct/pqm4.c b/crypto_sign/falcon-512/m4-ct/pqm4.c deleted file mode 100644 index d5b8a64e..00000000 --- a/crypto_sign/falcon-512/m4-ct/pqm4.c +++ /dev/null @@ -1,348 +0,0 @@ -#include -#include - -#include "api.h" -#include "inner.h" -#include "randombytes.h" - -/* ==================================================================== */ - -/* - * Falcon degree is N = 2^LOGN, where LOGN=9 (for Falcon-512) or 10 - * (for Falcon-1024). We use the advertised public key size to know - * which degree is used. - */ -#if CRYPTO_PUBLICKEYBYTES == 897 -#define LOGN 9 -#elif CRYPTO_PUBLICKEYBYTES == 1793 -#define LOGN 10 -#else -#error Unknown Falcon degree (unexpected public key size) -#endif - -#define N ((size_t)1 << LOGN) -#define NONCELEN 40 -#define SEEDLEN 48 - -/* - * If the private key length is larger than 10000, then this is the - * variant with precomputed expanded keys. - */ -#if CRYPTO_SECRETKEYBYTES > 10000 -#define KG_EXPAND 1 -#else -#define KG_EXPAND 0 -#endif - -/* - * Common buffer, to avoid bulky stack allocation. The buffer sizes are - * all expressed in bytes, but the buffer must be suitably aligned for - * 64-bit integers and floating-point values. 
- * - * Required size (in bytes): - * - * With expanded key: - * keygen: 48*N + 6*N = 54*N - * sign: 48*N + 2*N = 50*N - * vrfy: 8*N - * - * Without expanded key: - * keygen: 28*N + 5*N = 33*N - * sign: 72*N + 6*N = 78*N - * vrfy: 8*N - */ -static union { -#if KG_EXPAND - uint8_t b[54 * N]; -#else - uint8_t b[78 * N]; -#endif - uint64_t dummy_u64; - fpr dummy_fp; -} tmp; - - -int -crypto_sign_keypair(unsigned char *pk, unsigned char *sk) -{ - int8_t *f, *g, *F, *G; - uint16_t *h; - inner_shake256_context rng; - unsigned char seed[SEEDLEN]; -#if KG_EXPAND - size_t v; -#else - size_t u, v; -#endif - unsigned sav_cw; - -#if KG_EXPAND - f = (int8_t *)&tmp.b[48 * N]; - g = f + N; - F = g + N; - G = F + N; - h = (uint16_t *)(G + N); -#else - f = (int8_t *)&tmp.b[28 * N]; - g = f + N; - F = g + N; - G = NULL; - h = (uint16_t *)(F + N); -#endif - - randombytes(seed, SEEDLEN); - inner_shake256_init(&rng); - inner_shake256_inject(&rng, seed, SEEDLEN); - inner_shake256_flip(&rng); - sav_cw = set_fpu_cw(2); - Zf(keygen)(&rng, f, g, F, G, h, LOGN, tmp.b); - -#if KG_EXPAND - /* - * Expand private key. - */ - Zf(expand_privkey)((fpr *)sk, f, g, F, G, LOGN, tmp.b); - set_fpu_cw(sav_cw); -#else - set_fpu_cw(sav_cw); - - /* - * Encode private key. - */ - sk[0] = 0x50 + LOGN; - u = 1; - v = Zf(trim_i8_encode)(sk + u, CRYPTO_SECRETKEYBYTES - u, - f, LOGN, Zf(max_fg_bits)[LOGN]); - if (v == 0) { - return -1; - } - u += v; - v = Zf(trim_i8_encode)(sk + u, CRYPTO_SECRETKEYBYTES - u, - g, LOGN, Zf(max_fg_bits)[LOGN]); - if (v == 0) { - return -1; - } - u += v; - v = Zf(trim_i8_encode)(sk + u, CRYPTO_SECRETKEYBYTES - u, - F, LOGN, Zf(max_FG_bits)[LOGN]); - if (v == 0) { - return -1; - } - u += v; - if (u != CRYPTO_SECRETKEYBYTES) { - return -1; - } -#endif - - /* - * Encode public key. 
- */ - pk[0] = 0x00 + LOGN; - v = Zf(modq_encode)(pk + 1, CRYPTO_PUBLICKEYBYTES - 1, h, LOGN); - if (v != CRYPTO_PUBLICKEYBYTES - 1) { - return -1; - } - - return 0; -} - -int -crypto_sign(unsigned char *sm, size_t *smlen, - const unsigned char *m, size_t mlen, - const unsigned char *sk) -{ -#if KG_EXPAND - const fpr *expanded_key; -#else - int8_t *f, *g, *F, *G; - size_t u, v; -#endif - int16_t *sig; - uint16_t *hm; - unsigned char seed[SEEDLEN], nonce[NONCELEN]; - unsigned char *esig; - inner_shake256_context sc; - size_t sig_len; - unsigned sav_cw; - -#if KG_EXPAND - sig = (int16_t *)&tmp.b[48 * N]; -#else - f = (int8_t *)&tmp.b[72 * N]; - g = f + N; - F = g + N; - G = F + N; - sig = (int16_t *)(G + N); -#endif - hm = (uint16_t *)sig; /* hm[] is shared with sig[] */ - esig = (unsigned char *)tmp.b; - -#if KG_EXPAND - /* - * Expanded key is provided "as is". - */ - expanded_key = (const fpr *)sk; -#else - /* - * Decode the private key. - */ - if (sk[0] != 0x50 + LOGN) { - return -1; - } - u = 1; - v = Zf(trim_i8_decode)(f, LOGN, Zf(max_fg_bits)[LOGN], - sk + u, CRYPTO_SECRETKEYBYTES - u); - if (v == 0) { - return -1; - } - u += v; - v = Zf(trim_i8_decode)(g, LOGN, Zf(max_fg_bits)[LOGN], - sk + u, CRYPTO_SECRETKEYBYTES - u); - if (v == 0) { - return -1; - } - u += v; - v = Zf(trim_i8_decode)(F, LOGN, Zf(max_FG_bits)[LOGN], - sk + u, CRYPTO_SECRETKEYBYTES - u); - if (v == 0) { - return -1; - } - u += v; - if (u != CRYPTO_SECRETKEYBYTES) { - return -1; - } - if (!Zf(complete_private)(G, f, g, F, LOGN, tmp.b)) { - return -1; - } -#endif - - /* - * Create a random nonce (40 bytes). - */ - randombytes(nonce, NONCELEN); - - /* - * Hash message nonce + message into a vector. - */ - inner_shake256_init(&sc); - inner_shake256_inject(&sc, nonce, NONCELEN); - inner_shake256_inject(&sc, m, mlen); - inner_shake256_flip(&sc); - Zf(hash_to_point_vartime)(&sc, hm, LOGN); - - /* - * Initialize a RNG. 
- */ - randombytes(seed, SEEDLEN); - inner_shake256_init(&sc); - inner_shake256_inject(&sc, seed, SEEDLEN); - inner_shake256_flip(&sc); - - /* - * Compute the signature. - */ - sav_cw = set_fpu_cw(2); -#if KG_EXPAND - Zf(sign_tree)(sig, &sc, expanded_key, hm, LOGN, tmp.b); -#else - Zf(sign_dyn)(sig, &sc, f, g, F, G, hm, LOGN, tmp.b); -#endif - set_fpu_cw(sav_cw); - - /* - * Encode the signature and bundle it with the message. Format is: - * signature length 2 bytes, big-endian - * nonce 40 bytes - * message mlen bytes - * signature slen bytes - */ - esig[0] = 0x20 + LOGN; - sig_len = Zf(comp_encode)(esig + 1, CRYPTO_BYTES - 1, sig, LOGN); - if (sig_len == 0) { - return -1; - } - sig_len ++; - memmove(sm + 2 + NONCELEN, m, mlen); - sm[0] = (unsigned char)(sig_len >> 8); - sm[1] = (unsigned char)sig_len; - memcpy(sm + 2, nonce, NONCELEN); - memcpy(sm + 2 + NONCELEN + mlen, esig, sig_len); - *smlen = 2 + NONCELEN + mlen + sig_len; - return 0; -} - -int -crypto_sign_open(unsigned char *m, size_t *mlen, - const unsigned char *sm, size_t smlen, - const unsigned char *pk) -{ - uint16_t *h, *hm; - int16_t *sig; - const unsigned char *esig; - inner_shake256_context sc; - size_t sig_len, msg_len; - - h = (uint16_t *)&tmp.b[2 * N]; - hm = h + N; - sig = (int16_t *)(hm + N); - - /* - * Decode public key. - */ - if (pk[0] != 0x00 + LOGN) { - return -1; - } - if (Zf(modq_decode)(h, LOGN, pk + 1, CRYPTO_PUBLICKEYBYTES - 1) - != CRYPTO_PUBLICKEYBYTES - 1) - { - return -1; - } - Zf(to_ntt_monty)(h, LOGN); - - /* - * Find nonce, signature, message length. - */ - if (smlen < 2 + NONCELEN) { - return -1; - } - sig_len = ((size_t)sm[0] << 8) | (size_t)sm[1]; - if (sig_len > (smlen - 2 - NONCELEN)) { - return -1; - } - msg_len = smlen - 2 - NONCELEN - sig_len; - - /* - * Decode signature. 
- */ - esig = sm + 2 + NONCELEN + msg_len; - if (sig_len < 1 || esig[0] != 0x20 + LOGN) { - return -1; - } - if (Zf(comp_decode)(sig, LOGN, - esig + 1, sig_len - 1) != sig_len - 1) - { - return -1; - } - - /* - * Hash nonce + message into a vector. - */ - inner_shake256_init(&sc); - inner_shake256_inject(&sc, sm + 2, NONCELEN + msg_len); - inner_shake256_flip(&sc); - Zf(hash_to_point_vartime)(&sc, hm, LOGN); - - /* - * Verify signature. - */ - if (!Zf(verify_raw)(hm, sig, h, LOGN, tmp.b)) { - return -1; - } - - /* - * Return plaintext. - */ - memmove(m, sm + 2 + NONCELEN, msg_len); - *mlen = msg_len; - return 0; -} diff --git a/crypto_sign/falcon-512/m4-ct/rng.c b/crypto_sign/falcon-512/m4-ct/rng.c deleted file mode 100644 index d2ecb7af..00000000 --- a/crypto_sign/falcon-512/m4-ct/rng.c +++ /dev/null @@ -1,379 +0,0 @@ -/* - * PRNG and interface to the system RNG. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include - -#include "inner.h" - -// yyyNIST+0 yyyPQCLEAN+0 -/* - * Include relevant system header files. For Win32, this will also need - * linking with advapi32.dll, which we trigger with an appropriate #pragma. - */ -#if FALCON_RAND_GETENTROPY -#include -#endif -#if FALCON_RAND_URANDOM -#include -#if !FALCON_RAND_GETENTROPY -#include -#endif -#include -#include -#endif -#if FALCON_RAND_WIN32 -#include -#include -#pragma comment(lib, "advapi32") -#endif - -/* see inner.h */ -int -Zf(get_seed)(void *seed, size_t len) -{ - (void)seed; - if (len == 0) { - return 1; - } -#if FALCON_RAND_GETENTROPY - if (getentropy(seed, len) == 0) { - return 1; - } -#endif -#if FALCON_RAND_URANDOM - { - int f; - - f = open("/dev/urandom", O_RDONLY); - if (f >= 0) { - while (len > 0) { - ssize_t rlen; - - rlen = read(f, seed, len); - if (rlen < 0) { - if (errno == EINTR) { - continue; - } - break; - } - seed = (uint8_t *)seed + rlen; - len -= (size_t)rlen; - } - close(f); - if (len == 0) { - return 1; - } - } - } -#endif -#if FALCON_RAND_WIN32 - { - HCRYPTPROV hp; - - if (CryptAcquireContext(&hp, 0, 0, PROV_RSA_FULL, - CRYPT_VERIFYCONTEXT | CRYPT_SILENT)) - { - BOOL r; - - r = CryptGenRandom(hp, (DWORD)len, seed); - CryptReleaseContext(hp, 0); - if (r) { - return 1; - } - } - } -#endif - return 0; -} -// yyyNIST- yyyPQCLEAN- - -/* see inner.h */ -void -Zf(prng_init)(prng *p, inner_shake256_context *src) -{ -#if FALCON_LE // yyyLE+1 - inner_shake256_extract(src, p->state.d, 56); -#else // yyyLE+0 - /* - * To ensure reproducibility for a given seed, we - * must enforce little-endian interpretation of - 
* the state words. - */ - uint8_t tmp[56]; - uint64_t th, tl; - int i; - - inner_shake256_extract(src, tmp, 56); - for (i = 0; i < 14; i ++) { - uint32_t w; - - w = (uint32_t)tmp[(i << 2) + 0] - | ((uint32_t)tmp[(i << 2) + 1] << 8) - | ((uint32_t)tmp[(i << 2) + 2] << 16) - | ((uint32_t)tmp[(i << 2) + 3] << 24); - *(uint32_t *)(p->state.d + (i << 2)) = w; - } - tl = *(uint32_t *)(p->state.d + 48); - th = *(uint32_t *)(p->state.d + 52); - *(uint64_t *)(p->state.d + 48) = tl + (th << 32); -#endif // yyyLE- - Zf(prng_refill)(p); -} - -/* - * PRNG based on ChaCha20. - * - * State consists in key (32 bytes) then IV (16 bytes) and block counter - * (8 bytes). Normally, we should not care about local endianness (this - * is for a PRNG), but for the NIST competition we need reproducible KAT - * vectors that work across architectures, so we enforce little-endian - * interpretation where applicable. Moreover, output words are "spread - * out" over the output buffer with the interleaving pattern that is - * naturally obtained from the AVX2 implementation that runs eight - * ChaCha20 instances in parallel. - * - * The block counter is XORed into the first 8 bytes of the IV. - */ -TARGET_AVX2 -void -Zf(prng_refill)(prng *p) -{ -#if FALCON_AVX2 // yyyAVX2+1 - - static const uint32_t CW[] = { - 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 - }; - - uint64_t cc; - size_t u; - int i; - uint32_t *sw; - union { - uint32_t w[16]; - __m256i y[2]; /* for alignment */ - } t; - __m256i state[16], init[16]; - - sw = (uint32_t *)p->state.d; - - /* - * XOR next counter values into state. - */ - cc = *(uint64_t *)(p->state.d + 48); - for (u = 0; u < 8; u ++) { - t.w[u] = (uint32_t)(cc + u); - t.w[u + 8] = (uint32_t)((cc + u) >> 32); - } - *(uint64_t *)(p->state.d + 48) = cc + 8; - - /* - * Load state. 
- */ - for (u = 0; u < 4; u ++) { - state[u] = init[u] = - _mm256_broadcastd_epi32(_mm_cvtsi32_si128(CW[u])); - } - for (u = 0; u < 10; u ++) { - state[u + 4] = init[u + 4] = - _mm256_broadcastd_epi32(_mm_cvtsi32_si128(sw[u])); - } - state[14] = init[14] = _mm256_xor_si256( - _mm256_broadcastd_epi32(_mm_cvtsi32_si128(sw[10])), - _mm256_loadu_si256((__m256i *)&t.w[0])); - state[15] = init[15] = _mm256_xor_si256( - _mm256_broadcastd_epi32(_mm_cvtsi32_si128(sw[11])), - _mm256_loadu_si256((__m256i *)&t.w[8])); - - /* - * Do all rounds. - */ - for (i = 0; i < 10; i ++) { - -#define QROUND(a, b, c, d) do { \ - state[a] = _mm256_add_epi32(state[a], state[b]); \ - state[d] = _mm256_xor_si256(state[d], state[a]); \ - state[d] = _mm256_or_si256( \ - _mm256_slli_epi32(state[d], 16), \ - _mm256_srli_epi32(state[d], 16)); \ - state[c] = _mm256_add_epi32(state[c], state[d]); \ - state[b] = _mm256_xor_si256(state[b], state[c]); \ - state[b] = _mm256_or_si256( \ - _mm256_slli_epi32(state[b], 12), \ - _mm256_srli_epi32(state[b], 20)); \ - state[a] = _mm256_add_epi32(state[a], state[b]); \ - state[d] = _mm256_xor_si256(state[d], state[a]); \ - state[d] = _mm256_or_si256( \ - _mm256_slli_epi32(state[d], 8), \ - _mm256_srli_epi32(state[d], 24)); \ - state[c] = _mm256_add_epi32(state[c], state[d]); \ - state[b] = _mm256_xor_si256(state[b], state[c]); \ - state[b] = _mm256_or_si256( \ - _mm256_slli_epi32(state[b], 7), \ - _mm256_srli_epi32(state[b], 25)); \ - } while (0) - - QROUND( 0, 4, 8, 12); - QROUND( 1, 5, 9, 13); - QROUND( 2, 6, 10, 14); - QROUND( 3, 7, 11, 15); - QROUND( 0, 5, 10, 15); - QROUND( 1, 6, 11, 12); - QROUND( 2, 7, 8, 13); - QROUND( 3, 4, 9, 14); - -#undef QROUND - - } - - /* - * Add initial state back and encode the result in the destination - * buffer. We can dump the AVX2 values "as is" because the non-AVX2 - * code uses a compatible order of values. 
- */ - for (u = 0; u < 16; u ++) { - _mm256_storeu_si256((__m256i *)&p->buf.d[u << 5], - _mm256_add_epi32(state[u], init[u])); - } - -#else // yyyAVX2+0 - - static const uint32_t CW[] = { - 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 - }; - - uint64_t cc; - size_t u; - - /* - * State uses local endianness. Only the output bytes must be - * converted to little endian (if used on a big-endian machine). - */ - cc = *(uint64_t *)(p->state.d + 48); - for (u = 0; u < 8; u ++) { - uint32_t state[16]; - size_t v; - int i; - - memcpy(&state[0], CW, sizeof CW); - memcpy(&state[4], p->state.d, 48); - state[14] ^= (uint32_t)cc; - state[15] ^= (uint32_t)(cc >> 32); - for (i = 0; i < 10; i ++) { - -#define QROUND(a, b, c, d) do { \ - state[a] += state[b]; \ - state[d] ^= state[a]; \ - state[d] = (state[d] << 16) | (state[d] >> 16); \ - state[c] += state[d]; \ - state[b] ^= state[c]; \ - state[b] = (state[b] << 12) | (state[b] >> 20); \ - state[a] += state[b]; \ - state[d] ^= state[a]; \ - state[d] = (state[d] << 8) | (state[d] >> 24); \ - state[c] += state[d]; \ - state[b] ^= state[c]; \ - state[b] = (state[b] << 7) | (state[b] >> 25); \ - } while (0) - - QROUND( 0, 4, 8, 12); - QROUND( 1, 5, 9, 13); - QROUND( 2, 6, 10, 14); - QROUND( 3, 7, 11, 15); - QROUND( 0, 5, 10, 15); - QROUND( 1, 6, 11, 12); - QROUND( 2, 7, 8, 13); - QROUND( 3, 4, 9, 14); - -#undef QROUND - - } - - for (v = 0; v < 4; v ++) { - state[v] += CW[v]; - } - for (v = 4; v < 14; v ++) { - state[v] += ((uint32_t *)p->state.d)[v - 4]; - } - state[14] += ((uint32_t *)p->state.d)[10] - ^ (uint32_t)cc; - state[15] += ((uint32_t *)p->state.d)[11] - ^ (uint32_t)(cc >> 32); - cc ++; - - /* - * We mimic the interleaving that is used in the AVX2 - * implementation. 
- */ - for (v = 0; v < 16; v ++) { -#if FALCON_LE // yyyLE+1 - ((uint32_t *)p->buf.d)[u + (v << 3)] = state[v]; -#else // yyyLE+0 - p->buf.d[(u << 2) + (v << 5) + 0] = - (uint8_t)state[v]; - p->buf.d[(u << 2) + (v << 5) + 1] = - (uint8_t)(state[v] >> 8); - p->buf.d[(u << 2) + (v << 5) + 2] = - (uint8_t)(state[v] >> 16); - p->buf.d[(u << 2) + (v << 5) + 3] = - (uint8_t)(state[v] >> 24); -#endif // yyyLE- - } - } - *(uint64_t *)(p->state.d + 48) = cc; - -#endif // yyyAVX2- - - p->ptr = 0; -} - -/* see inner.h */ -void -Zf(prng_get_bytes)(prng *p, void *dst, size_t len) -{ - uint8_t *buf; - - buf = dst; - while (len > 0) { - size_t clen; - - clen = (sizeof p->buf.d) - p->ptr; - if (clen > len) { - clen = len; - } - memcpy(buf, p->buf.d, clen); - buf += clen; - len -= clen; - p->ptr += clen; - if (p->ptr == sizeof p->buf.d) { - Zf(prng_refill)(p); - } - } -} diff --git a/crypto_sign/falcon-512/m4-ct/sign.c b/crypto_sign/falcon-512/m4-ct/sign.c deleted file mode 100644 index 752fb8ba..00000000 --- a/crypto_sign/falcon-512/m4-ct/sign.c +++ /dev/null @@ -1,1532 +0,0 @@ -/* - * Falcon signature generation. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include "inner.h" - -/* =================================================================== */ - -/* - * Compute degree N from logarithm 'logn'. - */ -#define MKN(logn) ((size_t)1 << (logn)) - -/* =================================================================== */ -/* - * Binary case: - * N = 2^logn - * phi = X^N+1 - */ - -/* - * Get the size of the LDL tree for an input with polynomials of size - * 2^logn. The size is expressed in the number of elements. - */ -static inline unsigned -ffLDL_treesize(unsigned logn) -{ - /* - * For logn = 0 (polynomials are constant), the "tree" is a - * single element. Otherwise, the tree node has size 2^logn, and - * has two child trees for size logn-1 each. Thus, treesize s() - * must fulfill these two relations: - * - * s(0) = 1 - * s(logn) = (2^logn) + 2*s(logn-1) - */ - return (logn + 1) << logn; -} - -/* - * Inner function for ffLDL_fft(). It expects the matrix to be both - * auto-adjoint and quasicyclic; also, it uses the source operands - * as modifiable temporaries. - * - * tmp[] must have room for at least one polynomial. 
- */ -static void -ffLDL_fft_inner(fpr *restrict tree, - fpr *restrict g0, fpr *restrict g1, unsigned logn, fpr *restrict tmp) -{ - size_t n, hn; - - n = MKN(logn); - if (n == 1) { - tree[0] = g0[0]; - return; - } - hn = n >> 1; - - /* - * The LDL decomposition yields L (which is written in the tree) - * and the diagonal of D. Since d00 = g0, we just write d11 - * into tmp. - */ - Zf(poly_LDLmv_fft)(tmp, tree, g0, g1, g0, logn); - - /* - * Split d00 (currently in g0) and d11 (currently in tmp). We - * reuse g0 and g1 as temporary storage spaces: - * d00 splits into g1, g1+hn - * d11 splits into g0, g0+hn - */ - Zf(poly_split_fft)(g1, g1 + hn, g0, logn); - Zf(poly_split_fft)(g0, g0 + hn, tmp, logn); - - /* - * Each split result is the first row of a new auto-adjoint - * quasicyclic matrix for the next recursive step. - */ - ffLDL_fft_inner(tree + n, - g1, g1 + hn, logn - 1, tmp); - ffLDL_fft_inner(tree + n + ffLDL_treesize(logn - 1), - g0, g0 + hn, logn - 1, tmp); -} - -/* - * Compute the ffLDL tree of an auto-adjoint matrix G. The matrix - * is provided as three polynomials (FFT representation). - * - * The "tree" array is filled with the computed tree, of size - * (logn+1)*(2^logn) elements (see ffLDL_treesize()). - * - * Input arrays MUST NOT overlap, except possibly the three unmodified - * arrays g00, g01 and g11. tmp[] should have room for at least three - * polynomials of 2^logn elements each. 
- */ -static void -ffLDL_fft(fpr *restrict tree, const fpr *restrict g00, - const fpr *restrict g01, const fpr *restrict g11, - unsigned logn, fpr *restrict tmp) -{ - size_t n, hn; - fpr *d00, *d11; - - n = MKN(logn); - if (n == 1) { - tree[0] = g00[0]; - return; - } - hn = n >> 1; - d00 = tmp; - d11 = tmp + n; - tmp += n << 1; - - memcpy(d00, g00, n * sizeof *g00); - Zf(poly_LDLmv_fft)(d11, tree, g00, g01, g11, logn); - - Zf(poly_split_fft)(tmp, tmp + hn, d00, logn); - Zf(poly_split_fft)(d00, d00 + hn, d11, logn); - memcpy(d11, tmp, n * sizeof *tmp); - ffLDL_fft_inner(tree + n, - d11, d11 + hn, logn - 1, tmp); - ffLDL_fft_inner(tree + n + ffLDL_treesize(logn - 1), - d00, d00 + hn, logn - 1, tmp); -} - -/* - * Normalize an ffLDL tree: each leaf of value x is replaced with - * sigma / sqrt(x). - */ -static void -ffLDL_binary_normalize(fpr *tree, unsigned logn) -{ - /* - * TODO: make an iterative version. - */ - size_t n; - - n = MKN(logn); - if (n == 1) { - /* - * We actually store in the tree leaf the inverse of - * the value mandated by the specification: this - * saves a division both here and in the sampler. - */ - tree[0] = fpr_mul(fpr_sqrt(tree[0]), fpr_inv_sigma); - } else { - ffLDL_binary_normalize(tree + n, logn - 1); - ffLDL_binary_normalize(tree + n + ffLDL_treesize(logn - 1), - logn - 1); - } -} - -/* =================================================================== */ - -/* - * Convert an integer polynomial (with small values) into the - * representation with complex numbers. 
- */ -static void -smallints_to_fpr(fpr *r, const int8_t *t, unsigned logn) -{ - size_t n, u; - - n = MKN(logn); - for (u = 0; u < n; u ++) { - r[u] = fpr_of(t[u]); - } -} - -/* - * The expanded private key contains: - * - The B0 matrix (four elements) - * - The ffLDL tree - */ - -static inline size_t -skoff_b00(unsigned logn) -{ - (void)logn; - return 0; -} - -static inline size_t -skoff_b01(unsigned logn) -{ - return MKN(logn); -} - -static inline size_t -skoff_b10(unsigned logn) -{ - return 2 * MKN(logn); -} - -static inline size_t -skoff_b11(unsigned logn) -{ - return 3 * MKN(logn); -} - -static inline size_t -skoff_tree(unsigned logn) -{ - return 4 * MKN(logn); -} - -/* see inner.h */ -void -Zf(expand_privkey)(fpr *restrict expanded_key, - const int8_t *f, const int8_t *g, - const int8_t *F, const int8_t *G, - unsigned logn, uint8_t *restrict tmp) -{ - size_t n; - fpr *rf, *rg, *rF, *rG; - fpr *b00, *b01, *b10, *b11; - fpr *g00, *g01, *g11, *gxx; - fpr *tree; - - n = MKN(logn); - b00 = expanded_key + skoff_b00(logn); - b01 = expanded_key + skoff_b01(logn); - b10 = expanded_key + skoff_b10(logn); - b11 = expanded_key + skoff_b11(logn); - tree = expanded_key + skoff_tree(logn); - - /* - * We load the private key elements directly into the B0 matrix, - * since B0 = [[g, -f], [G, -F]]. - */ - rf = b01; - rg = b00; - rF = b11; - rG = b10; - - smallints_to_fpr(rf, f, logn); - smallints_to_fpr(rg, g, logn); - smallints_to_fpr(rF, F, logn); - smallints_to_fpr(rG, G, logn); - - /* - * Compute the FFT for the key elements, and negate f and F. - */ - Zf(FFT)(rf, logn); - Zf(FFT)(rg, logn); - Zf(FFT)(rF, logn); - Zf(FFT)(rG, logn); - Zf(poly_neg)(rf, logn); - Zf(poly_neg)(rF, logn); - - /* - * The Gram matrix is G = B·B*. 
Formulas are: - * g00 = b00*adj(b00) + b01*adj(b01) - * g01 = b00*adj(b10) + b01*adj(b11) - * g10 = b10*adj(b00) + b11*adj(b01) - * g11 = b10*adj(b10) + b11*adj(b11) - * - * For historical reasons, this implementation uses - * g00, g01 and g11 (upper triangle). - */ - g00 = (fpr *)tmp; - g01 = g00 + n; - g11 = g01 + n; - gxx = g11 + n; - - memcpy(g00, b00, n * sizeof *b00); - Zf(poly_mulselfadj_fft)(g00, logn); - memcpy(gxx, b01, n * sizeof *b01); - Zf(poly_mulselfadj_fft)(gxx, logn); - Zf(poly_add)(g00, gxx, logn); - - memcpy(g01, b00, n * sizeof *b00); - Zf(poly_muladj_fft)(g01, b10, logn); - memcpy(gxx, b01, n * sizeof *b01); - Zf(poly_muladj_fft)(gxx, b11, logn); - Zf(poly_add)(g01, gxx, logn); - - memcpy(g11, b10, n * sizeof *b10); - Zf(poly_mulselfadj_fft)(g11, logn); - memcpy(gxx, b11, n * sizeof *b11); - Zf(poly_mulselfadj_fft)(gxx, logn); - Zf(poly_add)(g11, gxx, logn); - - /* - * Compute the Falcon tree. - */ - ffLDL_fft(tree, g00, g01, g11, logn, gxx); - - /* - * Normalize tree. - */ - ffLDL_binary_normalize(tree, logn); -} - -typedef int (*samplerZ)(void *ctx, fpr mu, fpr sigma); - -/* - * Perform Fast Fourier Sampling for target vector t. The Gram matrix - * is provided (G = [[g00, g01], [adj(g01), g11]]). The sampled vector - * is written over (t0,t1). The Gram matrix is modified as well. The - * tmp[] buffer must have room for four polynomials. - */ -TARGET_AVX2 -static void -ffSampling_fft_dyntree(samplerZ samp, void *samp_ctx, - fpr *restrict t0, fpr *restrict t1, - fpr *restrict g00, fpr *restrict g01, fpr *restrict g11, - unsigned logn, fpr *restrict tmp) -{ - size_t n, hn; - fpr *z0, *z1; - - /* - * Deepest level: the LDL tree leaf value is just g00 (the - * array has length only 1 at this point); we normalize it - * with regards to sigma, then use it for sampling. 
- */ - if (logn == 0) { - fpr leaf; - - leaf = g00[0]; - leaf = fpr_mul(fpr_sqrt(leaf), fpr_inv_sigma); - t0[0] = fpr_of(samp(samp_ctx, t0[0], leaf)); - t1[0] = fpr_of(samp(samp_ctx, t1[0], leaf)); - return; - } - - n = (size_t)1 << logn; - hn = n >> 1; - - /* - * Decompose G into LDL. We only need d00 (identical to g00), - * d11, and l10; we do that in place. - */ - Zf(poly_LDL_fft)(g00, g01, g11, logn); - - /* - * Split d00 and d11 and expand them into half-size quasi-cyclic - * Gram matrices. We also save l10 in tmp[]. - */ - Zf(poly_split_fft)(tmp, tmp + hn, g00, logn); - memcpy(g00, tmp, n * sizeof *tmp); - Zf(poly_split_fft)(tmp, tmp + hn, g11, logn); - memcpy(g11, tmp, n * sizeof *tmp); - memcpy(tmp, g01, n * sizeof *g01); - memcpy(g01, g00, hn * sizeof *g00); - memcpy(g01 + hn, g11, hn * sizeof *g00); - - /* - * The half-size Gram matrices for the recursive LDL tree - * building are now: - * - left sub-tree: g00, g00+hn, g01 - * - right sub-tree: g11, g11+hn, g01+hn - * l10 is in tmp[]. - */ - - /* - * We split t1 and use the first recursive call on the two - * halves, using the right sub-tree. The result is merged - * back into tmp + 2*n. - */ - z1 = tmp + n; - Zf(poly_split_fft)(z1, z1 + hn, t1, logn); - ffSampling_fft_dyntree(samp, samp_ctx, z1, z1 + hn, - g11, g11 + hn, g01 + hn, logn - 1, z1 + n); - Zf(poly_merge_fft)(tmp + (n << 1), z1, z1 + hn, logn); - - /* - * Compute tb0 = t0 + (t1 - z1) * l10. - * At that point, l10 is in tmp, t1 is unmodified, and z1 is - * in tmp + (n << 1). The buffer in z1 is free. - * - * In the end, z1 is written over t1, and tb0 is in t0. - */ - memcpy(z1, t1, n * sizeof *t1); - Zf(poly_sub)(z1, tmp + (n << 1), logn); - memcpy(t1, tmp + (n << 1), n * sizeof *tmp); - Zf(poly_mul_fft)(tmp, z1, logn); - Zf(poly_add)(t0, tmp, logn); - - /* - * Second recursive invocation, on the split tb0 (currently in t0) - * and the left sub-tree. 
- */ - z0 = tmp; - Zf(poly_split_fft)(z0, z0 + hn, t0, logn); - ffSampling_fft_dyntree(samp, samp_ctx, z0, z0 + hn, - g00, g00 + hn, g01, logn - 1, z0 + n); - Zf(poly_merge_fft)(t0, z0, z0 + hn, logn); -} - -/* - * Perform Fast Fourier Sampling for target vector t and LDL tree T. - * tmp[] must have size for at least two polynomials of size 2^logn. - */ -TARGET_AVX2 -static void -ffSampling_fft(samplerZ samp, void *samp_ctx, - fpr *restrict z0, fpr *restrict z1, - const fpr *restrict tree, - const fpr *restrict t0, const fpr *restrict t1, unsigned logn, - fpr *restrict tmp) -{ - size_t n, hn; - const fpr *tree0, *tree1; - - /* - * When logn == 2, we inline the last two recursion levels. - */ - if (logn == 2) { -#if FALCON_AVX2 // yyyAVX2+1 - fpr w0, w1, w2, w3, sigma; - __m128d ww0, ww1, wa, wb, wc, wd; - __m128d wy0, wy1, wz0, wz1; - __m128d half, invsqrt8, invsqrt2, neghi, neglo; - int si0, si1, si2, si3; - - tree0 = tree + 4; - tree1 = tree + 8; - - half = _mm_set1_pd(0.5); - invsqrt8 = _mm_set1_pd(0.353553390593273762200422181052); - invsqrt2 = _mm_set1_pd(0.707106781186547524400844362105); - neghi = _mm_set_pd(-0.0, 0.0); - neglo = _mm_set_pd(0.0, -0.0); - - /* - * We split t1 into w*, then do the recursive invocation, - * with output in w*. We finally merge back into z1. 
- */ - ww0 = _mm_loadu_pd(&t1[0].v); - ww1 = _mm_loadu_pd(&t1[2].v); - wa = _mm_unpacklo_pd(ww0, ww1); - wb = _mm_unpackhi_pd(ww0, ww1); - wc = _mm_add_pd(wa, wb); - ww0 = _mm_mul_pd(wc, half); - wc = _mm_sub_pd(wa, wb); - wd = _mm_xor_pd(_mm_permute_pd(wc, 1), neghi); - ww1 = _mm_mul_pd(_mm_add_pd(wc, wd), invsqrt8); - - w2.v = _mm_cvtsd_f64(ww1); - w3.v = _mm_cvtsd_f64(_mm_permute_pd(ww1, 1)); - wa = ww1; - sigma = tree1[3]; - si2 = samp(samp_ctx, w2, sigma); - si3 = samp(samp_ctx, w3, sigma); - ww1 = _mm_set_pd((double)si3, (double)si2); - wa = _mm_sub_pd(wa, ww1); - wb = _mm_loadu_pd(&tree1[0].v); - wc = _mm_mul_pd(wa, wb); - wd = _mm_mul_pd(wa, _mm_permute_pd(wb, 1)); - wa = _mm_unpacklo_pd(wc, wd); - wb = _mm_unpackhi_pd(wc, wd); - ww0 = _mm_add_pd(ww0, _mm_add_pd(wa, _mm_xor_pd(wb, neglo))); - w0.v = _mm_cvtsd_f64(ww0); - w1.v = _mm_cvtsd_f64(_mm_permute_pd(ww0, 1)); - sigma = tree1[2]; - si0 = samp(samp_ctx, w0, sigma); - si1 = samp(samp_ctx, w1, sigma); - ww0 = _mm_set_pd((double)si1, (double)si0); - - wc = _mm_mul_pd( - _mm_set_pd((double)(si2 + si3), (double)(si2 - si3)), - invsqrt2); - wa = _mm_add_pd(ww0, wc); - wb = _mm_sub_pd(ww0, wc); - ww0 = _mm_unpacklo_pd(wa, wb); - ww1 = _mm_unpackhi_pd(wa, wb); - _mm_storeu_pd(&z1[0].v, ww0); - _mm_storeu_pd(&z1[2].v, ww1); - - /* - * Compute tb0 = t0 + (t1 - z1) * L. Value tb0 ends up in w*. - */ - wy0 = _mm_sub_pd(_mm_loadu_pd(&t1[0].v), ww0); - wy1 = _mm_sub_pd(_mm_loadu_pd(&t1[2].v), ww1); - wz0 = _mm_loadu_pd(&tree[0].v); - wz1 = _mm_loadu_pd(&tree[2].v); - ww0 = _mm_sub_pd(_mm_mul_pd(wy0, wz0), _mm_mul_pd(wy1, wz1)); - ww1 = _mm_add_pd(_mm_mul_pd(wy0, wz1), _mm_mul_pd(wy1, wz0)); - ww0 = _mm_add_pd(ww0, _mm_loadu_pd(&t0[0].v)); - ww1 = _mm_add_pd(ww1, _mm_loadu_pd(&t0[2].v)); - - /* - * Second recursive invocation. 
- */ - wa = _mm_unpacklo_pd(ww0, ww1); - wb = _mm_unpackhi_pd(ww0, ww1); - wc = _mm_add_pd(wa, wb); - ww0 = _mm_mul_pd(wc, half); - wc = _mm_sub_pd(wa, wb); - wd = _mm_xor_pd(_mm_permute_pd(wc, 1), neghi); - ww1 = _mm_mul_pd(_mm_add_pd(wc, wd), invsqrt8); - - w2.v = _mm_cvtsd_f64(ww1); - w3.v = _mm_cvtsd_f64(_mm_permute_pd(ww1, 1)); - wa = ww1; - sigma = tree0[3]; - si2 = samp(samp_ctx, w2, sigma); - si3 = samp(samp_ctx, w3, sigma); - ww1 = _mm_set_pd((double)si3, (double)si2); - wa = _mm_sub_pd(wa, ww1); - wb = _mm_loadu_pd(&tree0[0].v); - wc = _mm_mul_pd(wa, wb); - wd = _mm_mul_pd(wa, _mm_permute_pd(wb, 1)); - wa = _mm_unpacklo_pd(wc, wd); - wb = _mm_unpackhi_pd(wc, wd); - ww0 = _mm_add_pd(ww0, _mm_add_pd(wa, _mm_xor_pd(wb, neglo))); - w0.v = _mm_cvtsd_f64(ww0); - w1.v = _mm_cvtsd_f64(_mm_permute_pd(ww0, 1)); - sigma = tree0[2]; - si0 = samp(samp_ctx, w0, sigma); - si1 = samp(samp_ctx, w1, sigma); - ww0 = _mm_set_pd((double)si1, (double)si0); - - wc = _mm_mul_pd( - _mm_set_pd((double)(si2 + si3), (double)(si2 - si3)), - invsqrt2); - wa = _mm_add_pd(ww0, wc); - wb = _mm_sub_pd(ww0, wc); - ww0 = _mm_unpacklo_pd(wa, wb); - ww1 = _mm_unpackhi_pd(wa, wb); - _mm_storeu_pd(&z0[0].v, ww0); - _mm_storeu_pd(&z0[2].v, ww1); - - return; -#else // yyyAVX2+0 - fpr x0, x1, y0, y1, w0, w1, w2, w3, sigma; - fpr a_re, a_im, b_re, b_im, c_re, c_im; - - tree0 = tree + 4; - tree1 = tree + 8; - - /* - * We split t1 into w*, then do the recursive invocation, - * with output in w*. We finally merge back into z1. 
- */ - a_re = t1[0]; - a_im = t1[2]; - b_re = t1[1]; - b_im = t1[3]; - c_re = fpr_add(a_re, b_re); - c_im = fpr_add(a_im, b_im); - w0 = fpr_half(c_re); - w1 = fpr_half(c_im); - c_re = fpr_sub(a_re, b_re); - c_im = fpr_sub(a_im, b_im); - w2 = fpr_mul(fpr_add(c_re, c_im), fpr_invsqrt8); - w3 = fpr_mul(fpr_sub(c_im, c_re), fpr_invsqrt8); - - x0 = w2; - x1 = w3; - sigma = tree1[3]; - w2 = fpr_of(samp(samp_ctx, x0, sigma)); - w3 = fpr_of(samp(samp_ctx, x1, sigma)); - a_re = fpr_sub(x0, w2); - a_im = fpr_sub(x1, w3); - b_re = tree1[0]; - b_im = tree1[1]; - c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im)); - c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re)); - x0 = fpr_add(c_re, w0); - x1 = fpr_add(c_im, w1); - sigma = tree1[2]; - w0 = fpr_of(samp(samp_ctx, x0, sigma)); - w1 = fpr_of(samp(samp_ctx, x1, sigma)); - - a_re = w0; - a_im = w1; - b_re = w2; - b_im = w3; - c_re = fpr_mul(fpr_sub(b_re, b_im), fpr_invsqrt2); - c_im = fpr_mul(fpr_add(b_re, b_im), fpr_invsqrt2); - z1[0] = w0 = fpr_add(a_re, c_re); - z1[2] = w2 = fpr_add(a_im, c_im); - z1[1] = w1 = fpr_sub(a_re, c_re); - z1[3] = w3 = fpr_sub(a_im, c_im); - - /* - * Compute tb0 = t0 + (t1 - z1) * L. Value tb0 ends up in w*. - */ - w0 = fpr_sub(t1[0], w0); - w1 = fpr_sub(t1[1], w1); - w2 = fpr_sub(t1[2], w2); - w3 = fpr_sub(t1[3], w3); - - a_re = w0; - a_im = w2; - b_re = tree[0]; - b_im = tree[2]; - w0 = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im)); - w2 = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re)); - a_re = w1; - a_im = w3; - b_re = tree[1]; - b_im = tree[3]; - w1 = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im)); - w3 = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re)); - - w0 = fpr_add(w0, t0[0]); - w1 = fpr_add(w1, t0[1]); - w2 = fpr_add(w2, t0[2]); - w3 = fpr_add(w3, t0[3]); - - /* - * Second recursive invocation. 
- */ - a_re = w0; - a_im = w2; - b_re = w1; - b_im = w3; - c_re = fpr_add(a_re, b_re); - c_im = fpr_add(a_im, b_im); - w0 = fpr_half(c_re); - w1 = fpr_half(c_im); - c_re = fpr_sub(a_re, b_re); - c_im = fpr_sub(a_im, b_im); - w2 = fpr_mul(fpr_add(c_re, c_im), fpr_invsqrt8); - w3 = fpr_mul(fpr_sub(c_im, c_re), fpr_invsqrt8); - - x0 = w2; - x1 = w3; - sigma = tree0[3]; - w2 = y0 = fpr_of(samp(samp_ctx, x0, sigma)); - w3 = y1 = fpr_of(samp(samp_ctx, x1, sigma)); - a_re = fpr_sub(x0, y0); - a_im = fpr_sub(x1, y1); - b_re = tree0[0]; - b_im = tree0[1]; - c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im)); - c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re)); - x0 = fpr_add(c_re, w0); - x1 = fpr_add(c_im, w1); - sigma = tree0[2]; - w0 = fpr_of(samp(samp_ctx, x0, sigma)); - w1 = fpr_of(samp(samp_ctx, x1, sigma)); - - a_re = w0; - a_im = w1; - b_re = w2; - b_im = w3; - c_re = fpr_mul(fpr_sub(b_re, b_im), fpr_invsqrt2); - c_im = fpr_mul(fpr_add(b_re, b_im), fpr_invsqrt2); - z0[0] = fpr_add(a_re, c_re); - z0[2] = fpr_add(a_im, c_im); - z0[1] = fpr_sub(a_re, c_re); - z0[3] = fpr_sub(a_im, c_im); - - return; -#endif // yyyAVX2- - } - - /* - * Case logn == 1 is reachable only when using Falcon-2 (the - * smallest size for which Falcon is mathematically defined, but - * of course way too insecure to be of any use). 
- */ - if (logn == 1) { - fpr x0, x1, y0, y1, sigma; - fpr a_re, a_im, b_re, b_im, c_re, c_im; - - x0 = t1[0]; - x1 = t1[1]; - sigma = tree[3]; - z1[0] = y0 = fpr_of(samp(samp_ctx, x0, sigma)); - z1[1] = y1 = fpr_of(samp(samp_ctx, x1, sigma)); - a_re = fpr_sub(x0, y0); - a_im = fpr_sub(x1, y1); - b_re = tree[0]; - b_im = tree[1]; - c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im)); - c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re)); - x0 = fpr_add(c_re, t0[0]); - x1 = fpr_add(c_im, t0[1]); - sigma = tree[2]; - z0[0] = fpr_of(samp(samp_ctx, x0, sigma)); - z0[1] = fpr_of(samp(samp_ctx, x1, sigma)); - - return; - } - - /* - * Normal end of recursion is for logn == 0. Since the last - * steps of the recursions were inlined in the blocks above - * (when logn == 1 or 2), this case is not reachable, and is - * retained here only for documentation purposes. - - if (logn == 0) { - fpr x0, x1, sigma; - - x0 = t0[0]; - x1 = t1[0]; - sigma = tree[0]; - z0[0] = fpr_of(samp(samp_ctx, x0, sigma)); - z1[0] = fpr_of(samp(samp_ctx, x1, sigma)); - return; - } - - */ - - /* - * General recursive case (logn >= 3). - */ - - n = (size_t)1 << logn; - hn = n >> 1; - tree0 = tree + n; - tree1 = tree + n + ffLDL_treesize(logn - 1); - - /* - * We split t1 into z1 (reused as temporary storage), then do - * the recursive invocation, with output in tmp. We finally - * merge back into z1. - */ - Zf(poly_split_fft)(z1, z1 + hn, t1, logn); - ffSampling_fft(samp, samp_ctx, tmp, tmp + hn, - tree1, z1, z1 + hn, logn - 1, tmp + n); - Zf(poly_merge_fft)(z1, tmp, tmp + hn, logn); - - /* - * Compute tb0 = t0 + (t1 - z1) * L. Value tb0 ends up in tmp[]. - */ - memcpy(tmp, t1, n * sizeof *t1); - Zf(poly_sub)(tmp, z1, logn); - Zf(poly_mul_fft)(tmp, tree, logn); - Zf(poly_add)(tmp, t0, logn); - - /* - * Second recursive invocation. 
- */ - Zf(poly_split_fft)(z0, z0 + hn, tmp, logn); - ffSampling_fft(samp, samp_ctx, tmp, tmp + hn, - tree0, z0, z0 + hn, logn - 1, tmp + n); - Zf(poly_merge_fft)(z0, tmp, tmp + hn, logn); -} - -/* - * Compute a signature: the signature contains two vectors, s1 and s2. - * The s1 vector is not returned. The squared norm of (s1,s2) is - * computed, and if it is short enough, then s2 is returned into the - * s2[] buffer, and 1 is returned; otherwise, s2[] is untouched and 0 is - * returned; the caller should then try again. This function uses an - * expanded key. - * - * tmp[] must have room for at least six polynomials. - */ -static int -do_sign_tree(samplerZ samp, void *samp_ctx, int16_t *s2, - const fpr *restrict expanded_key, - const uint16_t *hm, - unsigned logn, fpr *restrict tmp) -{ - size_t n, u; - fpr *t0, *t1, *tx, *ty; - const fpr *b00, *b01, *b10, *b11, *tree; - fpr ni; - uint32_t sqn, ng; - int16_t *s1tmp, *s2tmp; - - n = MKN(logn); - t0 = tmp; - t1 = t0 + n; - b00 = expanded_key + skoff_b00(logn); - b01 = expanded_key + skoff_b01(logn); - b10 = expanded_key + skoff_b10(logn); - b11 = expanded_key + skoff_b11(logn); - tree = expanded_key + skoff_tree(logn); - - /* - * Set the target vector to [hm, 0] (hm is the hashed message). - */ - for (u = 0; u < n; u ++) { - t0[u] = fpr_of(hm[u]); - /* This is implicit. - t1[u] = fpr_zero; - */ - } - - /* - * Apply the lattice basis to obtain the real target - * vector (after normalization with regards to modulus). - */ - Zf(FFT)(t0, logn); - ni = fpr_inverse_of_q; - memcpy(t1, t0, n * sizeof *t0); - Zf(poly_mul_fft)(t1, b01, logn); - Zf(poly_mulconst)(t1, fpr_neg(ni), logn); - Zf(poly_mul_fft)(t0, b11, logn); - Zf(poly_mulconst)(t0, ni, logn); - - tx = t1 + n; - ty = tx + n; - - /* - * Apply sampling. Output is written back in [tx, ty]. - */ - ffSampling_fft(samp, samp_ctx, tx, ty, tree, t0, t1, logn, ty + n); - - /* - * Get the lattice point corresponding to that tiny vector. 
- */ - memcpy(t0, tx, n * sizeof *tx); - memcpy(t1, ty, n * sizeof *ty); - Zf(poly_mul_fft)(tx, b00, logn); - Zf(poly_mul_fft)(ty, b10, logn); - Zf(poly_add)(tx, ty, logn); - memcpy(ty, t0, n * sizeof *t0); - Zf(poly_mul_fft)(ty, b01, logn); - - memcpy(t0, tx, n * sizeof *tx); - Zf(poly_mul_fft)(t1, b11, logn); - Zf(poly_add)(t1, ty, logn); - - Zf(iFFT)(t0, logn); - Zf(iFFT)(t1, logn); - - /* - * Compute the signature. - */ - s1tmp = (int16_t *)tx; - sqn = 0; - ng = 0; - for (u = 0; u < n; u ++) { - int32_t z; - - z = (int32_t)hm[u] - (int32_t)fpr_rint(t0[u]); - sqn += (uint32_t)(z * z); - ng |= sqn; - s1tmp[u] = (int16_t)z; - } - sqn |= -(ng >> 31); - - /* - * With "normal" degrees (e.g. 512 or 1024), it is very - * improbable that the computed vector is not short enough; - * however, it may happen in practice for the very reduced - * versions (e.g. degree 16 or below). In that case, the caller - * will loop, and we must not write anything into s2[] because - * s2[] may overlap with the hashed message hm[] and we need - * hm[] for the next iteration. - */ - s2tmp = (int16_t *)tmp; - for (u = 0; u < n; u ++) { - s2tmp[u] = (int16_t)-fpr_rint(t1[u]); - } - if (Zf(is_short_half)(sqn, s2tmp, logn)) { - memcpy(s2, s2tmp, n * sizeof *s2); - memcpy(tmp, s1tmp, n * sizeof *s1tmp); - return 1; - } - return 0; -} - -/* - * Compute a signature: the signature contains two vectors, s1 and s2. - * The s1 vector is not returned. The squared norm of (s1,s2) is - * computed, and if it is short enough, then s2 is returned into the - * s2[] buffer, and 1 is returned; otherwise, s2[] is untouched and 0 is - * returned; the caller should then try again. - * - * tmp[] must have room for at least nine polynomials. 
- */ -static int -do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2, - const int8_t *restrict f, const int8_t *restrict g, - const int8_t *restrict F, const int8_t *restrict G, - const uint16_t *hm, unsigned logn, fpr *restrict tmp) -{ - size_t n, u; - fpr *t0, *t1, *tx, *ty; - fpr *b00, *b01, *b10, *b11, *g00, *g01, *g11; - fpr ni; - uint32_t sqn, ng; - int16_t *s1tmp, *s2tmp; - - n = MKN(logn); - - /* - * Lattice basis is B = [[g, -f], [G, -F]]. We convert it to FFT. - */ - b00 = tmp; - b01 = b00 + n; - b10 = b01 + n; - b11 = b10 + n; - smallints_to_fpr(b01, f, logn); - smallints_to_fpr(b00, g, logn); - smallints_to_fpr(b11, F, logn); - smallints_to_fpr(b10, G, logn); - Zf(FFT)(b01, logn); - Zf(FFT)(b00, logn); - Zf(FFT)(b11, logn); - Zf(FFT)(b10, logn); - Zf(poly_neg)(b01, logn); - Zf(poly_neg)(b11, logn); - - /* - * Compute the Gram matrix G = B·B*. Formulas are: - * g00 = b00*adj(b00) + b01*adj(b01) - * g01 = b00*adj(b10) + b01*adj(b11) - * g10 = b10*adj(b00) + b11*adj(b01) - * g11 = b10*adj(b10) + b11*adj(b11) - * - * For historical reasons, this implementation uses - * g00, g01 and g11 (upper triangle). g10 is not kept - * since it is equal to adj(g01). - * - * We _replace_ the matrix B with the Gram matrix, but we - * must keep b01 and b11 for computing the target vector. 
- */ - t0 = b11 + n; - t1 = t0 + n; - - memcpy(t0, b01, n * sizeof *b01); - Zf(poly_mulselfadj_fft)(t0, logn); // t0 <- b01*adj(b01) - - memcpy(t1, b00, n * sizeof *b00); - Zf(poly_muladj_fft)(t1, b10, logn); // t1 <- b00*adj(b10) - Zf(poly_mulselfadj_fft)(b00, logn); // b00 <- b00*adj(b00) - Zf(poly_add)(b00, t0, logn); // b00 <- g00 - memcpy(t0, b01, n * sizeof *b01); - Zf(poly_muladj_fft)(b01, b11, logn); // b01 <- b01*adj(b11) - Zf(poly_add)(b01, t1, logn); // b01 <- g01 - - Zf(poly_mulselfadj_fft)(b10, logn); // b10 <- b10*adj(b10) - memcpy(t1, b11, n * sizeof *b11); - Zf(poly_mulselfadj_fft)(t1, logn); // t1 <- b11*adj(b11) - Zf(poly_add)(b10, t1, logn); // b10 <- g11 - - /* - * We rename variables to make things clearer. The three elements - * of the Gram matrix uses the first 3*n slots of tmp[], followed - * by b11 and b01 (in that order). - */ - g00 = b00; - g01 = b01; - g11 = b10; - b01 = t0; - t0 = b01 + n; - t1 = t0 + n; - - /* - * Memory layout at that point: - * g00 g01 g11 b11 b01 t0 t1 - */ - - /* - * Set the target vector to [hm, 0] (hm is the hashed message). - */ - for (u = 0; u < n; u ++) { - t0[u] = fpr_of(hm[u]); - /* This is implicit. - t1[u] = fpr_zero; - */ - } - - /* - * Apply the lattice basis to obtain the real target - * vector (after normalization with regards to modulus). - */ - Zf(FFT)(t0, logn); - ni = fpr_inverse_of_q; - memcpy(t1, t0, n * sizeof *t0); - Zf(poly_mul_fft)(t1, b01, logn); - Zf(poly_mulconst)(t1, fpr_neg(ni), logn); - Zf(poly_mul_fft)(t0, b11, logn); - Zf(poly_mulconst)(t0, ni, logn); - - /* - * b01 and b11 can be discarded, so we move back (t0,t1). - * Memory layout is now: - * g00 g01 g11 t0 t1 - */ - memcpy(b11, t0, n * 2 * sizeof *t0); - t0 = g11 + n; - t1 = t0 + n; - - /* - * Apply sampling; result is written over (t0,t1). 
- */ - ffSampling_fft_dyntree(samp, samp_ctx, - t0, t1, g00, g01, g11, logn, t1 + n); - - /* - * We arrange the layout back to: - * b00 b01 b10 b11 t0 t1 - * - * We did not conserve the matrix basis, so we must recompute - * it now. - */ - b00 = tmp; - b01 = b00 + n; - b10 = b01 + n; - b11 = b10 + n; - memmove(b11 + n, t0, n * 2 * sizeof *t0); - t0 = b11 + n; - t1 = t0 + n; - smallints_to_fpr(b01, f, logn); - smallints_to_fpr(b00, g, logn); - smallints_to_fpr(b11, F, logn); - smallints_to_fpr(b10, G, logn); - Zf(FFT)(b01, logn); - Zf(FFT)(b00, logn); - Zf(FFT)(b11, logn); - Zf(FFT)(b10, logn); - Zf(poly_neg)(b01, logn); - Zf(poly_neg)(b11, logn); - tx = t1 + n; - ty = tx + n; - - /* - * Get the lattice point corresponding to that tiny vector. - */ - memcpy(tx, t0, n * sizeof *t0); - memcpy(ty, t1, n * sizeof *t1); - Zf(poly_mul_fft)(tx, b00, logn); - Zf(poly_mul_fft)(ty, b10, logn); - Zf(poly_add)(tx, ty, logn); - memcpy(ty, t0, n * sizeof *t0); - Zf(poly_mul_fft)(ty, b01, logn); - - memcpy(t0, tx, n * sizeof *tx); - Zf(poly_mul_fft)(t1, b11, logn); - Zf(poly_add)(t1, ty, logn); - Zf(iFFT)(t0, logn); - Zf(iFFT)(t1, logn); - - s1tmp = (int16_t *)tx; - sqn = 0; - ng = 0; - for (u = 0; u < n; u ++) { - int32_t z; - - z = (int32_t)hm[u] - (int32_t)fpr_rint(t0[u]); - sqn += (uint32_t)(z * z); - ng |= sqn; - s1tmp[u] = (int16_t)z; - } - sqn |= -(ng >> 31); - - /* - * With "normal" degrees (e.g. 512 or 1024), it is very - * improbable that the computed vector is not short enough; - * however, it may happen in practice for the very reduced - * versions (e.g. degree 16 or below). In that case, the caller - * will loop, and we must not write anything into s2[] because - * s2[] may overlap with the hashed message hm[] and we need - * hm[] for the next iteration. 
- */ - s2tmp = (int16_t *)tmp; - for (u = 0; u < n; u ++) { - s2tmp[u] = (int16_t)-fpr_rint(t1[u]); - } - if (Zf(is_short_half)(sqn, s2tmp, logn)) { - memcpy(s2, s2tmp, n * sizeof *s2); - memcpy(tmp, s1tmp, n * sizeof *s1tmp); - return 1; - } - return 0; -} - -/* - * Sample an integer value along a half-gaussian distribution centered - * on zero and standard deviation 1.8205, with a precision of 72 bits. - */ -TARGET_AVX2 -int -Zf(gaussian0_sampler)(prng *p) -{ -#if FALCON_AVX2 // yyyAVX2+1 - - /* - * High words. - */ - static const union { - uint16_t u16[16]; - __m256i ymm[1]; - } rhi15 = { - { - 0x51FB, 0x2A69, 0x113E, 0x0568, - 0x014A, 0x003B, 0x0008, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000 - } - }; - - static const union { - uint64_t u64[20]; - __m256i ymm[5]; - } rlo57 = { - { - 0x1F42ED3AC391802, 0x12B181F3F7DDB82, - 0x1CDD0934829C1FF, 0x1754377C7994AE4, - 0x1846CAEF33F1F6F, 0x14AC754ED74BD5F, - 0x024DD542B776AE4, 0x1A1FFDC65AD63DA, - 0x01F80D88A7B6428, 0x001C3FDB2040C69, - 0x00012CF24D031FB, 0x00000949F8B091F, - 0x0000003665DA998, 0x00000000EBF6EBB, - 0x0000000002F5D7E, 0x000000000007098, - 0x0000000000000C6, 0x000000000000001, - 0x000000000000000, 0x000000000000000 - } - }; - - uint64_t lo; - unsigned hi; - __m256i xhi, rhi, gthi, eqhi, eqm; - __m256i xlo, gtlo0, gtlo1, gtlo2, gtlo3, gtlo4; - __m128i t, zt; - int r; - - /* - * Get a 72-bit random value and split it into a low part - * (57 bits) and a high part (15 bits) - */ - lo = prng_get_u64(p); - hi = prng_get_u8(p); - hi = (hi << 7) | (unsigned)(lo >> 57); - lo &= 0x1FFFFFFFFFFFFFF; - - /* - * Broadcast the high part and compare it with the relevant - * values. We need both a "greater than" and an "equal" - * comparisons. 
- */ - xhi = _mm256_broadcastw_epi16(_mm_cvtsi32_si128(hi)); - rhi = _mm256_loadu_si256(&rhi15.ymm[0]); - gthi = _mm256_cmpgt_epi16(rhi, xhi); - eqhi = _mm256_cmpeq_epi16(rhi, xhi); - - /* - * The result is the number of 72-bit values (among the list of 19) - * which are greater than the 72-bit random value. We first count - * all non-zero 16-bit elements in the first eight of gthi. Such - * elements have value -1 or 0, so we first negate them. - */ - t = _mm_srli_epi16(_mm256_castsi256_si128(gthi), 15); - zt = _mm_setzero_si128(); - t = _mm_hadd_epi16(t, zt); - t = _mm_hadd_epi16(t, zt); - t = _mm_hadd_epi16(t, zt); - r = _mm_cvtsi128_si32(t); - - /* - * We must look at the low bits for all values for which the - * high bits are an "equal" match; values 8-18 all have the - * same high bits (0). - * On 32-bit systems, 'lo' really is two registers, requiring - * some extra code. - */ -#if defined(__x86_64__) || defined(_M_X64) - xlo = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(*(int64_t *)&lo)); -#else - { - uint32_t e0, e1; - int32_t f0, f1; - - e0 = (uint32_t)lo; - e1 = (uint32_t)(lo >> 32); - f0 = *(int32_t *)&e0; - f1 = *(int32_t *)&e1; - xlo = _mm256_set_epi32(f1, f0, f1, f0, f1, f0, f1, f0); - } -#endif - gtlo0 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[0]), xlo); - gtlo1 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[1]), xlo); - gtlo2 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[2]), xlo); - gtlo3 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[3]), xlo); - gtlo4 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[4]), xlo); - - /* - * Keep only comparison results that correspond to the non-zero - * elements in eqhi. 
- */ - gtlo0 = _mm256_and_si256(gtlo0, _mm256_cvtepi16_epi64( - _mm256_castsi256_si128(eqhi))); - gtlo1 = _mm256_and_si256(gtlo1, _mm256_cvtepi16_epi64( - _mm256_castsi256_si128(_mm256_bsrli_epi128(eqhi, 8)))); - eqm = _mm256_permute4x64_epi64(eqhi, 0xFF); - gtlo2 = _mm256_and_si256(gtlo2, eqm); - gtlo3 = _mm256_and_si256(gtlo3, eqm); - gtlo4 = _mm256_and_si256(gtlo4, eqm); - - /* - * Add all values to count the total number of "-1" elements. - * Since the first eight "high" words are all different, only - * one element (at most) in gtlo0:gtlo1 can be non-zero; however, - * if the high word of the random value is zero, then many - * elements of gtlo2:gtlo3:gtlo4 can be non-zero. - */ - gtlo0 = _mm256_or_si256(gtlo0, gtlo1); - gtlo0 = _mm256_add_epi64( - _mm256_add_epi64(gtlo0, gtlo2), - _mm256_add_epi64(gtlo3, gtlo4)); - t = _mm_add_epi64( - _mm256_castsi256_si128(gtlo0), - _mm256_extracti128_si256(gtlo0, 1)); - t = _mm_add_epi64(t, _mm_srli_si128(t, 8)); - r -= _mm_cvtsi128_si32(t); - - return r; - -#else // yyyAVX2+0 - - static const uint32_t dist[] = { - 10745844u, 3068844u, 3741698u, - 5559083u, 1580863u, 8248194u, - 2260429u, 13669192u, 2736639u, - 708981u, 4421575u, 10046180u, - 169348u, 7122675u, 4136815u, - 30538u, 13063405u, 7650655u, - 4132u, 14505003u, 7826148u, - 417u, 16768101u, 11363290u, - 31u, 8444042u, 8086568u, - 1u, 12844466u, 265321u, - 0u, 1232676u, 13644283u, - 0u, 38047u, 9111839u, - 0u, 870u, 6138264u, - 0u, 14u, 12545723u, - 0u, 0u, 3104126u, - 0u, 0u, 28824u, - 0u, 0u, 198u, - 0u, 0u, 1u - }; - - uint32_t v0, v1, v2, hi; - uint64_t lo; - size_t u; - int z; - - /* - * Get a random 72-bit value, into three 24-bit limbs v0..v2. - */ - lo = prng_get_u64(p); - hi = prng_get_u8(p); - v0 = (uint32_t)lo & 0xFFFFFF; - v1 = (uint32_t)(lo >> 24) & 0xFFFFFF; - v2 = (uint32_t)(lo >> 48) | (hi << 16); - - /* - * Sampled value is z, such that v0..v2 is lower than the first - * z elements of the table. 
- */ - z = 0; - for (u = 0; u < (sizeof dist) / sizeof(dist[0]); u += 3) { - uint32_t w0, w1, w2, cc; - - w0 = dist[u + 2]; - w1 = dist[u + 1]; - w2 = dist[u + 0]; - cc = (v0 - w0) >> 31; - cc = (v1 - w1 - cc) >> 31; - cc = (v2 - w2 - cc) >> 31; - z += (int)cc; - } - return z; - -#endif // yyyAVX2- -} - -/* - * Sample a bit with probability exp(-x) for some x >= 0. - */ -TARGET_AVX2 -static int -BerExp(prng *p, fpr x, fpr ccs) -{ - int s, i; - fpr r; - uint32_t sw, w; - uint64_t z; - - /* - * Reduce x modulo log(2): x = s*log(2) + r, with s an integer, - * and 0 <= r < log(2). Since x >= 0, we can use fpr_trunc(). - */ - s = (int)fpr_trunc(fpr_mul(x, fpr_inv_log2)); - r = fpr_sub(x, fpr_mul(fpr_of(s), fpr_log2)); - - /* - * It may happen (quite rarely) that s >= 64; if sigma = 1.2 - * (the minimum value for sigma), r = 0 and b = 1, then we get - * s >= 64 if the half-Gaussian produced a z >= 13, which happens - * with probability about 0.000000000230383991, which is - * approximatively equal to 2^(-32). In any case, if s >= 64, - * then BerExp will be non-zero with probability less than - * 2^(-64), so we can simply saturate s at 63. - */ - sw = (uint32_t)s; - sw ^= (sw ^ 63) & -((63 - sw) >> 31); - s = (int)sw; - - /* - * Compute exp(-r); we know that 0 <= r < log(2) at this point, so - * we can use fpr_expm_p63(), which yields a result scaled to 2^63. - * We scale it up to 2^64, then right-shift it by s bits because - * we really want exp(-x) = 2^(-s)*exp(-r). - * - * The "-1" operation makes sure that the value fits on 64 bits - * (i.e. if r = 0, we may get 2^64, and we prefer 2^64-1 in that - * case). The bias is negligible since fpr_expm_p63() only computes - * with 51 bits of precision or so. - */ - z = ((fpr_expm_p63(r, ccs) << 1) - 1) >> s; - - /* - * Sample a bit with probability exp(-x). 
Since x = s*log(2) + r, - * exp(-x) = 2^-s * exp(-r), we compare lazily exp(-x) with the - * PRNG output to limit its consumption, the sign of the difference - * yields the expected result. - */ - i = 64; - do { - i -= 8; - w = prng_get_u8(p) - ((uint32_t)(z >> i) & 0xFF); - } while (!w && i > 0); - return (int)(w >> 31); -} - -/* - * The sampler produces a random integer that follows a discrete Gaussian - * distribution, centered on mu, and with standard deviation sigma. The - * provided parameter isigma is equal to 1/sigma. - * - * The value of sigma MUST lie between 1 and 2 (i.e. isigma lies between - * 0.5 and 1); in Falcon, sigma should always be between 1.2 and 1.9. - */ -TARGET_AVX2 -int -Zf(sampler)(void *ctx, fpr mu, fpr isigma) -{ - sampler_context *spc; - int s; - fpr r, dss, ccs; - - spc = ctx; - - /* - * Center is mu. We compute mu = s + r where s is an integer - * and 0 <= r < 1. - */ - s = (int)fpr_floor(mu); - r = fpr_sub(mu, fpr_of(s)); - - /* - * dss = 1/(2*sigma^2) = 0.5*(isigma^2). - */ - dss = fpr_half(fpr_sqr(isigma)); - - /* - * ccs = sigma_min / sigma = sigma_min * isigma. - */ - ccs = fpr_mul(isigma, spc->sigma_min); - - /* - * We now need to sample on center r. - */ - for (;;) { - int z0, z, b; - fpr x; - - /* - * Sample z for a Gaussian distribution. Then get a - * random bit b to turn the sampling into a bimodal - * distribution: if b = 1, we use z+1, otherwise we - * use -z. We thus have two situations: - * - * - b = 1: z >= 1 and sampled against a Gaussian - * centered on 1. - * - b = 0: z <= 0 and sampled against a Gaussian - * centered on 0. - */ - z0 = Zf(gaussian0_sampler)(&spc->p); - b = prng_get_u8(&spc->p) & 1; - z = b + ((b << 1) - 1) * z0; - - /* - * Rejection sampling. We want a Gaussian centered on r; - * but we sampled against a Gaussian centered on b (0 or - * 1). But we know that z is always in the range where - * our sampling distribution is greater than the Gaussian - * distribution, so rejection works. 
- * - * We got z with distribution: - * G(z) = exp(-((z-b)^2)/(2*sigma0^2)) - * We target distribution: - * S(z) = exp(-((z-r)^2)/(2*sigma^2)) - * Rejection sampling works by keeping the value z with - * probability S(z)/G(z), and starting again otherwise. - * This requires S(z) <= G(z), which is the case here. - * Thus, we simply need to keep our z with probability: - * P = exp(-x) - * where: - * x = ((z-r)^2)/(2*sigma^2) - ((z-b)^2)/(2*sigma0^2) - * - * Here, we scale up the Bernouilli distribution, which - * makes rejection more probable, but makes rejection - * rate sufficiently decorrelated from the Gaussian - * center and standard deviation that the whole sampler - * can be said to be constant-time. - */ - x = fpr_mul(fpr_sqr(fpr_sub(fpr_of(z), r)), dss); - x = fpr_sub(x, fpr_mul(fpr_of(z0 * z0), fpr_inv_2sqrsigma0)); - if (BerExp(&spc->p, x, ccs)) { - /* - * Rejection sampling was centered on r, but the - * actual center is mu = s + r. - */ - return s + z; - } - } -} - -/* see inner.h */ -void -Zf(sign_tree)(int16_t *sig, inner_shake256_context *rng, - const fpr *restrict expanded_key, - const uint16_t *hm, unsigned logn, uint8_t *tmp) -{ - fpr *ftmp; - - ftmp = (fpr *)tmp; - for (;;) { - /* - * Signature produces short vectors s1 and s2. The - * signature is acceptable only if the aggregate vector - * s1,s2 is short; we must use the same bound as the - * verifier. - * - * If the signature is acceptable, then we return only s2 - * (the verifier recomputes s1 from s2, the hashed message, - * and the public key). - */ - sampler_context spc; - samplerZ samp; - void *samp_ctx; - - /* - * Normal sampling. We use a fast PRNG seeded from our - * SHAKE context ('rng'). - */ - spc.sigma_min = (logn == 10) - ? fpr_sigma_min_10 - : fpr_sigma_min_9; - Zf(prng_init)(&spc.p, rng); - samp = Zf(sampler); - samp_ctx = &spc; - - /* - * Do the actual signature. 
- */ - if (do_sign_tree(samp, samp_ctx, sig, - expanded_key, hm, logn, ftmp)) - { - break; - } - } -} - -/* see inner.h */ -void -Zf(sign_dyn)(int16_t *sig, inner_shake256_context *rng, - const int8_t *restrict f, const int8_t *restrict g, - const int8_t *restrict F, const int8_t *restrict G, - const uint16_t *hm, unsigned logn, uint8_t *tmp) -{ - fpr *ftmp; - - ftmp = (fpr *)tmp; - for (;;) { - /* - * Signature produces short vectors s1 and s2. The - * signature is acceptable only if the aggregate vector - * s1,s2 is short; we must use the same bound as the - * verifier. - * - * If the signature is acceptable, then we return only s2 - * (the verifier recomputes s1 from s2, the hashed message, - * and the public key). - */ - sampler_context spc; - samplerZ samp; - void *samp_ctx; - - /* - * Normal sampling. We use a fast PRNG seeded from our - * SHAKE context ('rng'). - */ - spc.sigma_min = (logn == 10) - ? fpr_sigma_min_10 - : fpr_sigma_min_9; - Zf(prng_init)(&spc.p, rng); - samp = Zf(sampler); - samp_ctx = &spc; - - /* - * Do the actual signature. - */ - if (do_sign_dyn(samp, samp_ctx, sig, - f, g, F, G, hm, logn, ftmp)) - { - break; - } - } -} diff --git a/crypto_sign/falcon-512/m4-ct/vrfy.c b/crypto_sign/falcon-512/m4-ct/vrfy.c deleted file mode 100644 index c74a3dd3..00000000 --- a/crypto_sign/falcon-512/m4-ct/vrfy.c +++ /dev/null @@ -1,871 +0,0 @@ -/* - * Falcon signature verification. 
- * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include "inner.h" - -/* ===================================================================== */ -/* - * Constants for NTT. - * - * n = 2^logn (2 <= n <= 1024) - * phi = X^n + 1 - * q = 12289 - * q0i = -1/q mod 2^16 - * R = 2^16 mod q - * R2 = 2^32 mod q - */ - -#define Q 12289 -#define Q0I 12287 -#define R 4091 -#define R2 10952 - -/* - * Table for NTT, binary case: - * GMb[x] = R*(g^rev(x)) mod q - * where g = 7 (it is a 2048-th primitive root of 1 modulo q) - * and rev() is the bit-reversal function over 10 bits. 
- */ -static const uint16_t GMb[] = { - 4091, 7888, 11060, 11208, 6960, 4342, 6275, 9759, - 1591, 6399, 9477, 5266, 586, 5825, 7538, 9710, - 1134, 6407, 1711, 965, 7099, 7674, 3743, 6442, - 10414, 8100, 1885, 1688, 1364, 10329, 10164, 9180, - 12210, 6240, 997, 117, 4783, 4407, 1549, 7072, - 2829, 6458, 4431, 8877, 7144, 2564, 5664, 4042, - 12189, 432, 10751, 1237, 7610, 1534, 3983, 7863, - 2181, 6308, 8720, 6570, 4843, 1690, 14, 3872, - 5569, 9368, 12163, 2019, 7543, 2315, 4673, 7340, - 1553, 1156, 8401, 11389, 1020, 2967, 10772, 7045, - 3316, 11236, 5285, 11578, 10637, 10086, 9493, 6180, - 9277, 6130, 3323, 883, 10469, 489, 1502, 2851, - 11061, 9729, 2742, 12241, 4970, 10481, 10078, 1195, - 730, 1762, 3854, 2030, 5892, 10922, 9020, 5274, - 9179, 3604, 3782, 10206, 3180, 3467, 4668, 2446, - 7613, 9386, 834, 7703, 6836, 3403, 5351, 12276, - 3580, 1739, 10820, 9787, 10209, 4070, 12250, 8525, - 10401, 2749, 7338, 10574, 6040, 943, 9330, 1477, - 6865, 9668, 3585, 6633, 12145, 4063, 3684, 7680, - 8188, 6902, 3533, 9807, 6090, 727, 10099, 7003, - 6945, 1949, 9731, 10559, 6057, 378, 7871, 8763, - 8901, 9229, 8846, 4551, 9589, 11664, 7630, 8821, - 5680, 4956, 6251, 8388, 10156, 8723, 2341, 3159, - 1467, 5460, 8553, 7783, 2649, 2320, 9036, 6188, - 737, 3698, 4699, 5753, 9046, 3687, 16, 914, - 5186, 10531, 4552, 1964, 3509, 8436, 7516, 5381, - 10733, 3281, 7037, 1060, 2895, 7156, 8887, 5357, - 6409, 8197, 2962, 6375, 5064, 6634, 5625, 278, - 932, 10229, 8927, 7642, 351, 9298, 237, 5858, - 7692, 3146, 12126, 7586, 2053, 11285, 3802, 5204, - 4602, 1748, 11300, 340, 3711, 4614, 300, 10993, - 5070, 10049, 11616, 12247, 7421, 10707, 5746, 5654, - 3835, 5553, 1224, 8476, 9237, 3845, 250, 11209, - 4225, 6326, 9680, 12254, 4136, 2778, 692, 8808, - 6410, 6718, 10105, 10418, 3759, 7356, 11361, 8433, - 6437, 3652, 6342, 8978, 5391, 2272, 6476, 7416, - 8418, 10824, 11986, 5733, 876, 7030, 2167, 2436, - 3442, 9217, 8206, 4858, 5964, 2746, 7178, 1434, - 7389, 8879, 10661, 11457, 4220, 
1432, 10832, 4328, - 8557, 1867, 9454, 2416, 3816, 9076, 686, 5393, - 2523, 4339, 6115, 619, 937, 2834, 7775, 3279, - 2363, 7488, 6112, 5056, 824, 10204, 11690, 1113, - 2727, 9848, 896, 2028, 5075, 2654, 10464, 7884, - 12169, 5434, 3070, 6400, 9132, 11672, 12153, 4520, - 1273, 9739, 11468, 9937, 10039, 9720, 2262, 9399, - 11192, 315, 4511, 1158, 6061, 6751, 11865, 357, - 7367, 4550, 983, 8534, 8352, 10126, 7530, 9253, - 4367, 5221, 3999, 8777, 3161, 6990, 4130, 11652, - 3374, 11477, 1753, 292, 8681, 2806, 10378, 12188, - 5800, 11811, 3181, 1988, 1024, 9340, 2477, 10928, - 4582, 6750, 3619, 5503, 5233, 2463, 8470, 7650, - 7964, 6395, 1071, 1272, 3474, 11045, 3291, 11344, - 8502, 9478, 9837, 1253, 1857, 6233, 4720, 11561, - 6034, 9817, 3339, 1797, 2879, 6242, 5200, 2114, - 7962, 9353, 11363, 5475, 6084, 9601, 4108, 7323, - 10438, 9471, 1271, 408, 6911, 3079, 360, 8276, - 11535, 9156, 9049, 11539, 850, 8617, 784, 7919, - 8334, 12170, 1846, 10213, 12184, 7827, 11903, 5600, - 9779, 1012, 721, 2784, 6676, 6552, 5348, 4424, - 6816, 8405, 9959, 5150, 2356, 5552, 5267, 1333, - 8801, 9661, 7308, 5788, 4910, 909, 11613, 4395, - 8238, 6686, 4302, 3044, 2285, 12249, 1963, 9216, - 4296, 11918, 695, 4371, 9793, 4884, 2411, 10230, - 2650, 841, 3890, 10231, 7248, 8505, 11196, 6688, - 4059, 6060, 3686, 4722, 11853, 5816, 7058, 6868, - 11137, 7926, 4894, 12284, 4102, 3908, 3610, 6525, - 7938, 7982, 11977, 6755, 537, 4562, 1623, 8227, - 11453, 7544, 906, 11816, 9548, 10858, 9703, 2815, - 11736, 6813, 6979, 819, 8903, 6271, 10843, 348, - 7514, 8339, 6439, 694, 852, 5659, 2781, 3716, - 11589, 3024, 1523, 8659, 4114, 10738, 3303, 5885, - 2978, 7289, 11884, 9123, 9323, 11830, 98, 2526, - 2116, 4131, 11407, 1844, 3645, 3916, 8133, 2224, - 10871, 8092, 9651, 5989, 7140, 8480, 1670, 159, - 10923, 4918, 128, 7312, 725, 9157, 5006, 6393, - 3494, 6043, 10972, 6181, 11838, 3423, 10514, 7668, - 3693, 6658, 6905, 11953, 10212, 11922, 9101, 8365, - 5110, 45, 2400, 1921, 4377, 2720, 1695, 51, - 
2808, 650, 1896, 9997, 9971, 11980, 8098, 4833, - 4135, 4257, 5838, 4765, 10985, 11532, 590, 12198, - 482, 12173, 2006, 7064, 10018, 3912, 12016, 10519, - 11362, 6954, 2210, 284, 5413, 6601, 3865, 10339, - 11188, 6231, 517, 9564, 11281, 3863, 1210, 4604, - 8160, 11447, 153, 7204, 5763, 5089, 9248, 12154, - 11748, 1354, 6672, 179, 5532, 2646, 5941, 12185, - 862, 3158, 477, 7279, 5678, 7914, 4254, 302, - 2893, 10114, 6890, 9560, 9647, 11905, 4098, 9824, - 10269, 1353, 10715, 5325, 6254, 3951, 1807, 6449, - 5159, 1308, 8315, 3404, 1877, 1231, 112, 6398, - 11724, 12272, 7286, 1459, 12274, 9896, 3456, 800, - 1397, 10678, 103, 7420, 7976, 936, 764, 632, - 7996, 8223, 8445, 7758, 10870, 9571, 2508, 1946, - 6524, 10158, 1044, 4338, 2457, 3641, 1659, 4139, - 4688, 9733, 11148, 3946, 2082, 5261, 2036, 11850, - 7636, 12236, 5366, 2380, 1399, 7720, 2100, 3217, - 10912, 8898, 7578, 11995, 2791, 1215, 3355, 2711, - 2267, 2004, 8568, 10176, 3214, 2337, 1750, 4729, - 4997, 7415, 6315, 12044, 4374, 7157, 4844, 211, - 8003, 10159, 9290, 11481, 1735, 2336, 5793, 9875, - 8192, 986, 7527, 1401, 870, 3615, 8465, 2756, - 9770, 2034, 10168, 3264, 6132, 54, 2880, 4763, - 11805, 3074, 8286, 9428, 4881, 6933, 1090, 10038, - 2567, 708, 893, 6465, 4962, 10024, 2090, 5718, - 10743, 780, 4733, 4623, 2134, 2087, 4802, 884, - 5372, 5795, 5938, 4333, 6559, 7549, 5269, 10664, - 4252, 3260, 5917, 10814, 5768, 9983, 8096, 7791, - 6800, 7491, 6272, 1907, 10947, 6289, 11803, 6032, - 11449, 1171, 9201, 7933, 2479, 7970, 11337, 7062, - 8911, 6728, 6542, 8114, 8828, 6595, 3545, 4348, - 4610, 2205, 6999, 8106, 5560, 10390, 9321, 2499, - 2413, 7272, 6881, 10582, 9308, 9437, 3554, 3326, - 5991, 11969, 3415, 12283, 9838, 12063, 4332, 7830, - 11329, 6605, 12271, 2044, 11611, 7353, 11201, 11582, - 3733, 8943, 9978, 1627, 7168, 3935, 5050, 2762, - 7496, 10383, 755, 1654, 12053, 4952, 10134, 4394, - 6592, 7898, 7497, 8904, 12029, 3581, 10748, 5674, - 10358, 4901, 7414, 8771, 710, 6764, 8462, 7193, - 5371, 7274, 
11084, 290, 7864, 6827, 11822, 2509, - 6578, 4026, 5807, 1458, 5721, 5762, 4178, 2105, - 11621, 4852, 8897, 2856, 11510, 9264, 2520, 8776, - 7011, 2647, 1898, 7039, 5950, 11163, 5488, 6277, - 9182, 11456, 633, 10046, 11554, 5633, 9587, 2333, - 7008, 7084, 5047, 7199, 9865, 8997, 569, 6390, - 10845, 9679, 8268, 11472, 4203, 1997, 2, 9331, - 162, 6182, 2000, 3649, 9792, 6363, 7557, 6187, - 8510, 9935, 5536, 9019, 3706, 12009, 1452, 3067, - 5494, 9692, 4865, 6019, 7106, 9610, 4588, 10165, - 6261, 5887, 2652, 10172, 1580, 10379, 4638, 9949 -}; - -/* - * Table for inverse NTT, binary case: - * iGMb[x] = R*((1/g)^rev(x)) mod q - * Since g = 7, 1/g = 8778 mod 12289. - */ -static const uint16_t iGMb[] = { - 4091, 4401, 1081, 1229, 2530, 6014, 7947, 5329, - 2579, 4751, 6464, 11703, 7023, 2812, 5890, 10698, - 3109, 2125, 1960, 10925, 10601, 10404, 4189, 1875, - 5847, 8546, 4615, 5190, 11324, 10578, 5882, 11155, - 8417, 12275, 10599, 7446, 5719, 3569, 5981, 10108, - 4426, 8306, 10755, 4679, 11052, 1538, 11857, 100, - 8247, 6625, 9725, 5145, 3412, 7858, 5831, 9460, - 5217, 10740, 7882, 7506, 12172, 11292, 6049, 79, - 13, 6938, 8886, 5453, 4586, 11455, 2903, 4676, - 9843, 7621, 8822, 9109, 2083, 8507, 8685, 3110, - 7015, 3269, 1367, 6397, 10259, 8435, 10527, 11559, - 11094, 2211, 1808, 7319, 48, 9547, 2560, 1228, - 9438, 10787, 11800, 1820, 11406, 8966, 6159, 3012, - 6109, 2796, 2203, 1652, 711, 7004, 1053, 8973, - 5244, 1517, 9322, 11269, 900, 3888, 11133, 10736, - 4949, 7616, 9974, 4746, 10270, 126, 2921, 6720, - 6635, 6543, 1582, 4868, 42, 673, 2240, 7219, - 1296, 11989, 7675, 8578, 11949, 989, 10541, 7687, - 7085, 8487, 1004, 10236, 4703, 163, 9143, 4597, - 6431, 12052, 2991, 11938, 4647, 3362, 2060, 11357, - 12011, 6664, 5655, 7225, 5914, 9327, 4092, 5880, - 6932, 3402, 5133, 9394, 11229, 5252, 9008, 1556, - 6908, 4773, 3853, 8780, 10325, 7737, 1758, 7103, - 11375, 12273, 8602, 3243, 6536, 7590, 8591, 11552, - 6101, 3253, 9969, 9640, 4506, 3736, 6829, 10822, - 9130, 9948, 
3566, 2133, 3901, 6038, 7333, 6609, - 3468, 4659, 625, 2700, 7738, 3443, 3060, 3388, - 3526, 4418, 11911, 6232, 1730, 2558, 10340, 5344, - 5286, 2190, 11562, 6199, 2482, 8756, 5387, 4101, - 4609, 8605, 8226, 144, 5656, 8704, 2621, 5424, - 10812, 2959, 11346, 6249, 1715, 4951, 9540, 1888, - 3764, 39, 8219, 2080, 2502, 1469, 10550, 8709, - 5601, 1093, 3784, 5041, 2058, 8399, 11448, 9639, - 2059, 9878, 7405, 2496, 7918, 11594, 371, 7993, - 3073, 10326, 40, 10004, 9245, 7987, 5603, 4051, - 7894, 676, 11380, 7379, 6501, 4981, 2628, 3488, - 10956, 7022, 6737, 9933, 7139, 2330, 3884, 5473, - 7865, 6941, 5737, 5613, 9505, 11568, 11277, 2510, - 6689, 386, 4462, 105, 2076, 10443, 119, 3955, - 4370, 11505, 3672, 11439, 750, 3240, 3133, 754, - 4013, 11929, 9210, 5378, 11881, 11018, 2818, 1851, - 4966, 8181, 2688, 6205, 6814, 926, 2936, 4327, - 10175, 7089, 6047, 9410, 10492, 8950, 2472, 6255, - 728, 7569, 6056, 10432, 11036, 2452, 2811, 3787, - 945, 8998, 1244, 8815, 11017, 11218, 5894, 4325, - 4639, 3819, 9826, 7056, 6786, 8670, 5539, 7707, - 1361, 9812, 2949, 11265, 10301, 9108, 478, 6489, - 101, 1911, 9483, 3608, 11997, 10536, 812, 8915, - 637, 8159, 5299, 9128, 3512, 8290, 7068, 7922, - 3036, 4759, 2163, 3937, 3755, 11306, 7739, 4922, - 11932, 424, 5538, 6228, 11131, 7778, 11974, 1097, - 2890, 10027, 2569, 2250, 2352, 821, 2550, 11016, - 7769, 136, 617, 3157, 5889, 9219, 6855, 120, - 4405, 1825, 9635, 7214, 10261, 11393, 2441, 9562, - 11176, 599, 2085, 11465, 7233, 6177, 4801, 9926, - 9010, 4514, 9455, 11352, 11670, 6174, 7950, 9766, - 6896, 11603, 3213, 8473, 9873, 2835, 10422, 3732, - 7961, 1457, 10857, 8069, 832, 1628, 3410, 4900, - 10855, 5111, 9543, 6325, 7431, 4083, 3072, 8847, - 9853, 10122, 5259, 11413, 6556, 303, 1465, 3871, - 4873, 5813, 10017, 6898, 3311, 5947, 8637, 5852, - 3856, 928, 4933, 8530, 1871, 2184, 5571, 5879, - 3481, 11597, 9511, 8153, 35, 2609, 5963, 8064, - 1080, 12039, 8444, 3052, 3813, 11065, 6736, 8454, - 2340, 7651, 1910, 10709, 2117, 9637, 
6402, 6028, - 2124, 7701, 2679, 5183, 6270, 7424, 2597, 6795, - 9222, 10837, 280, 8583, 3270, 6753, 2354, 3779, - 6102, 4732, 5926, 2497, 8640, 10289, 6107, 12127, - 2958, 12287, 10292, 8086, 817, 4021, 2610, 1444, - 5899, 11720, 3292, 2424, 5090, 7242, 5205, 5281, - 9956, 2702, 6656, 735, 2243, 11656, 833, 3107, - 6012, 6801, 1126, 6339, 5250, 10391, 9642, 5278, - 3513, 9769, 3025, 779, 9433, 3392, 7437, 668, - 10184, 8111, 6527, 6568, 10831, 6482, 8263, 5711, - 9780, 467, 5462, 4425, 11999, 1205, 5015, 6918, - 5096, 3827, 5525, 11579, 3518, 4875, 7388, 1931, - 6615, 1541, 8708, 260, 3385, 4792, 4391, 5697, - 7895, 2155, 7337, 236, 10635, 11534, 1906, 4793, - 9527, 7239, 8354, 5121, 10662, 2311, 3346, 8556, - 707, 1088, 4936, 678, 10245, 18, 5684, 960, - 4459, 7957, 226, 2451, 6, 8874, 320, 6298, - 8963, 8735, 2852, 2981, 1707, 5408, 5017, 9876, - 9790, 2968, 1899, 6729, 4183, 5290, 10084, 7679, - 7941, 8744, 5694, 3461, 4175, 5747, 5561, 3378, - 5227, 952, 4319, 9810, 4356, 3088, 11118, 840, - 6257, 486, 6000, 1342, 10382, 6017, 4798, 5489, - 4498, 4193, 2306, 6521, 1475, 6372, 9029, 8037, - 1625, 7020, 4740, 5730, 7956, 6351, 6494, 6917, - 11405, 7487, 10202, 10155, 7666, 7556, 11509, 1546, - 6571, 10199, 2265, 7327, 5824, 11396, 11581, 9722, - 2251, 11199, 5356, 7408, 2861, 4003, 9215, 484, - 7526, 9409, 12235, 6157, 9025, 2121, 10255, 2519, - 9533, 3824, 8674, 11419, 10888, 4762, 11303, 4097, - 2414, 6496, 9953, 10554, 808, 2999, 2130, 4286, - 12078, 7445, 5132, 7915, 245, 5974, 4874, 7292, - 7560, 10539, 9952, 9075, 2113, 3721, 10285, 10022, - 9578, 8934, 11074, 9498, 294, 4711, 3391, 1377, - 9072, 10189, 4569, 10890, 9909, 6923, 53, 4653, - 439, 10253, 7028, 10207, 8343, 1141, 2556, 7601, - 8150, 10630, 8648, 9832, 7951, 11245, 2131, 5765, - 10343, 9781, 2718, 1419, 4531, 3844, 4066, 4293, - 11657, 11525, 11353, 4313, 4869, 12186, 1611, 10892, - 11489, 8833, 2393, 15, 10830, 5003, 17, 565, - 5891, 12177, 11058, 10412, 8885, 3974, 10981, 7130, - 5840, 10482, 
8338, 6035, 6964, 1574, 10936, 2020, - 2465, 8191, 384, 2642, 2729, 5399, 2175, 9396, - 11987, 8035, 4375, 6611, 5010, 11812, 9131, 11427, - 104, 6348, 9643, 6757, 12110, 5617, 10935, 541, - 135, 3041, 7200, 6526, 5085, 12136, 842, 4129, - 7685, 11079, 8426, 1008, 2725, 11772, 6058, 1101, - 1950, 8424, 5688, 6876, 12005, 10079, 5335, 927, - 1770, 273, 8377, 2271, 5225, 10283, 116, 11807, - 91, 11699, 757, 1304, 7524, 6451, 8032, 8154, - 7456, 4191, 309, 2318, 2292, 10393, 11639, 9481, - 12238, 10594, 9569, 7912, 10368, 9889, 12244, 7179, - 3924, 3188, 367, 2077, 336, 5384, 5631, 8596, - 4621, 1775, 8866, 451, 6108, 1317, 6246, 8795, - 5896, 7283, 3132, 11564, 4977, 12161, 7371, 1366, - 12130, 10619, 3809, 5149, 6300, 2638, 4197, 1418, - 10065, 4156, 8373, 8644, 10445, 882, 8158, 10173, - 9763, 12191, 459, 2966, 3166, 405, 5000, 9311, - 6404, 8986, 1551, 8175, 3630, 10766, 9265, 700, - 8573, 9508, 6630, 11437, 11595, 5850, 3950, 4775, - 11941, 1446, 6018, 3386, 11470, 5310, 5476, 553, - 9474, 2586, 1431, 2741, 473, 11383, 4745, 836, - 4062, 10666, 7727, 11752, 5534, 312, 4307, 4351, - 5764, 8679, 8381, 8187, 5, 7395, 4363, 1152, - 5421, 5231, 6473, 436, 7567, 8603, 6229, 8230 -}; - -/* - * Reduce a small signed integer modulo q. The source integer MUST - * be between -q/2 and +q/2. - */ -static inline uint32_t -mq_conv_small(int x) -{ - /* - * If x < 0, the cast to uint32_t will set the high bit to 1. - */ - uint32_t y; - - y = (uint32_t)x; - y += Q & -(y >> 31); - return y; -} - -/* - * Addition modulo q. Operands must be in the 0..q-1 range. - */ -static inline uint32_t -mq_add(uint32_t x, uint32_t y) -{ - /* - * We compute x + y - q. If the result is negative, then the - * high bit will be set, and 'd >> 31' will be equal to 1; - * thus '-(d >> 31)' will be an all-one pattern. Otherwise, - * it will be an all-zero pattern. In other words, this - * implements a conditional addition of q. 
- */ - uint32_t d; - - d = x + y - Q; - d += Q & -(d >> 31); - return d; -} - -/* - * Subtraction modulo q. Operands must be in the 0..q-1 range. - */ -static inline uint32_t -mq_sub(uint32_t x, uint32_t y) -{ - /* - * As in mq_add(), we use a conditional addition to ensure the - * result is in the 0..q-1 range. - */ - uint32_t d; - - d = x - y; - d += Q & -(d >> 31); - return d; -} - -/* - * Division by 2 modulo q. Operand must be in the 0..q-1 range. - */ -static inline uint32_t -mq_rshift1(uint32_t x) -{ - x += Q & -(x & 1); - return (x >> 1); -} - -/* - * Montgomery multiplication modulo q. If we set R = 2^16 mod q, then - * this function computes: x * y / R mod q - * Operands must be in the 0..q-1 range. - */ -static inline uint32_t -mq_montymul(uint32_t x, uint32_t y) -{ - uint32_t z, w; - - /* - * We compute x*y + k*q with a value of k chosen so that the 16 - * low bits of the result are 0. We can then shift the value. - * After the shift, result may still be larger than q, but it - * will be lower than 2*q, so a conditional subtraction works. - */ - - z = x * y; - w = ((z * Q0I) & 0xFFFF) * Q; - - /* - * When adding z and w, the result will have its low 16 bits - * equal to 0. Since x, y and z are lower than q, the sum will - * be no more than (2^15 - 1) * q + (q - 1)^2, which will - * fit on 29 bits. - */ - z = (z + w) >> 16; - - /* - * After the shift, analysis shows that the value will be less - * than 2q. We do a subtraction then conditional subtraction to - * ensure the result is in the expected range. - */ - z -= Q; - z += Q & -(z >> 31); - return z; -} - -/* - * Montgomery squaring (computes (x^2)/R). - */ -static inline uint32_t -mq_montysqr(uint32_t x) -{ - return mq_montymul(x, x); -} - -/* - * Divide x by y modulo q = 12289. - */ -static inline uint32_t -mq_div_12289(uint32_t x, uint32_t y) -{ - /* - * We invert y by computing y^(q-2) mod q. 
- * - * We use the following addition chain for exponent e = 12287: - * - * e0 = 1 - * e1 = 2 * e0 = 2 - * e2 = e1 + e0 = 3 - * e3 = e2 + e1 = 5 - * e4 = 2 * e3 = 10 - * e5 = 2 * e4 = 20 - * e6 = 2 * e5 = 40 - * e7 = 2 * e6 = 80 - * e8 = 2 * e7 = 160 - * e9 = e8 + e2 = 163 - * e10 = e9 + e8 = 323 - * e11 = 2 * e10 = 646 - * e12 = 2 * e11 = 1292 - * e13 = e12 + e9 = 1455 - * e14 = 2 * e13 = 2910 - * e15 = 2 * e14 = 5820 - * e16 = e15 + e10 = 6143 - * e17 = 2 * e16 = 12286 - * e18 = e17 + e0 = 12287 - * - * Additions on exponents are converted to Montgomery - * multiplications. We define all intermediate results as so - * many local variables, and let the C compiler work out which - * must be kept around. - */ - uint32_t y0, y1, y2, y3, y4, y5, y6, y7, y8, y9; - uint32_t y10, y11, y12, y13, y14, y15, y16, y17, y18; - - y0 = mq_montymul(y, R2); - y1 = mq_montysqr(y0); - y2 = mq_montymul(y1, y0); - y3 = mq_montymul(y2, y1); - y4 = mq_montysqr(y3); - y5 = mq_montysqr(y4); - y6 = mq_montysqr(y5); - y7 = mq_montysqr(y6); - y8 = mq_montysqr(y7); - y9 = mq_montymul(y8, y2); - y10 = mq_montymul(y9, y8); - y11 = mq_montysqr(y10); - y12 = mq_montysqr(y11); - y13 = mq_montymul(y12, y9); - y14 = mq_montysqr(y13); - y15 = mq_montysqr(y14); - y16 = mq_montymul(y15, y10); - y17 = mq_montysqr(y16); - y18 = mq_montymul(y17, y0); - - /* - * Final multiplication with x, which is not in Montgomery - * representation, computes the correct division result. - */ - return mq_montymul(y18, x); -} - -/* - * Compute NTT on a ring element. 
- */ -static void -mq_NTT(uint16_t *a, unsigned logn) -{ - size_t n, t, m; - - n = (size_t)1 << logn; - t = n; - for (m = 1; m < n; m <<= 1) { - size_t ht, i, j1; - - ht = t >> 1; - for (i = 0, j1 = 0; i < m; i ++, j1 += t) { - size_t j, j2; - uint32_t s; - - s = GMb[m + i]; - j2 = j1 + ht; - for (j = j1; j < j2; j ++) { - uint32_t u, v; - - u = a[j]; - v = mq_montymul(a[j + ht], s); - a[j] = (uint16_t)mq_add(u, v); - a[j + ht] = (uint16_t)mq_sub(u, v); - } - } - t = ht; - } -} - -/* - * Compute the inverse NTT on a ring element, binary case. - */ -static void -mq_iNTT(uint16_t *a, unsigned logn) -{ - size_t n, t, m; - uint32_t ni; - - n = (size_t)1 << logn; - t = 1; - m = n; - while (m > 1) { - size_t hm, dt, i, j1; - - hm = m >> 1; - dt = t << 1; - for (i = 0, j1 = 0; i < hm; i ++, j1 += dt) { - size_t j, j2; - uint32_t s; - - j2 = j1 + t; - s = iGMb[hm + i]; - for (j = j1; j < j2; j ++) { - uint32_t u, v, w; - - u = a[j]; - v = a[j + t]; - a[j] = (uint16_t)mq_add(u, v); - w = mq_sub(u, v); - a[j + t] = (uint16_t) - mq_montymul(w, s); - } - } - t = dt; - m = hm; - } - - /* - * To complete the inverse NTT, we must now divide all values by - * n (the vector size). We thus need the inverse of n, i.e. we - * need to divide 1 by 2 logn times. But we also want it in - * Montgomery representation, i.e. we also want to multiply it - * by R = 2^16. In the common case, this should be a simple right - * shift. The loop below is generic and works also in corner cases; - * its computation time is negligible. - */ - ni = R; - for (m = n; m > 1; m >>= 1) { - ni = mq_rshift1(ni); - } - for (m = 0; m < n; m ++) { - a[m] = (uint16_t)mq_montymul(a[m], ni); - } -} - -/* - * Convert a polynomial (mod q) to Montgomery representation. 
- */ -static void -mq_poly_tomonty(uint16_t *f, unsigned logn) -{ - size_t u, n; - - n = (size_t)1 << logn; - for (u = 0; u < n; u ++) { - f[u] = (uint16_t)mq_montymul(f[u], R2); - } -} - -/* - * Multiply two polynomials together (NTT representation, and using - * a Montgomery multiplication). Result f*g is written over f. - */ -static void -mq_poly_montymul_ntt(uint16_t *f, const uint16_t *g, unsigned logn) -{ - size_t u, n; - - n = (size_t)1 << logn; - for (u = 0; u < n; u ++) { - f[u] = (uint16_t)mq_montymul(f[u], g[u]); - } -} - -/* - * Subtract polynomial g from polynomial f. - */ -static void -mq_poly_sub(uint16_t *f, const uint16_t *g, unsigned logn) -{ - size_t u, n; - - n = (size_t)1 << logn; - for (u = 0; u < n; u ++) { - f[u] = (uint16_t)mq_sub(f[u], g[u]); - } -} - -/* ===================================================================== */ - -/* see inner.h */ -void -Zf(to_ntt_monty)(uint16_t *h, unsigned logn) -{ - mq_NTT(h, logn); - mq_poly_tomonty(h, logn); -} - -/* see inner.h */ -int -Zf(verify_raw)(const uint16_t *c0, const int16_t *s2, - const uint16_t *h, unsigned logn, uint8_t *tmp) -{ - size_t u, n; - uint16_t *tt; - - n = (size_t)1 << logn; - tt = (uint16_t *)tmp; - - /* - * Reduce s2 elements modulo q ([0..q-1] range). - */ - for (u = 0; u < n; u ++) { - uint32_t w; - - w = (uint32_t)s2[u]; - w += Q & -(w >> 31); - tt[u] = (uint16_t)w; - } - - /* - * Compute -s1 = s2*h - c0 mod phi mod q (in tt[]). - */ - mq_NTT(tt, logn); - mq_poly_montymul_ntt(tt, h, logn); - mq_iNTT(tt, logn); - mq_poly_sub(tt, c0, logn); - - /* - * Normalize -s1 elements into the [-q/2..q/2] range. - */ - for (u = 0; u < n; u ++) { - int32_t w; - - w = (int32_t)tt[u]; - w -= (int32_t)(Q & -(((Q >> 1) - (uint32_t)w) >> 31)); - ((int16_t *)tt)[u] = (int16_t)w; - } - - /* - * Signature is valid if and only if the aggregate (-s1,s2) vector - * is short enough. 
- */ - return Zf(is_short)((int16_t *)tt, s2, logn); -} - -/* see inner.h */ -int -Zf(compute_public)(uint16_t *h, - const int8_t *f, const int8_t *g, unsigned logn, uint8_t *tmp) -{ - size_t u, n; - uint16_t *tt; - - n = (size_t)1 << logn; - tt = (uint16_t *)tmp; - for (u = 0; u < n; u ++) { - tt[u] = (uint16_t)mq_conv_small(f[u]); - h[u] = (uint16_t)mq_conv_small(g[u]); - } - mq_NTT(h, logn); - mq_NTT(tt, logn); - for (u = 0; u < n; u ++) { - if (tt[u] == 0) { - return 0; - } - h[u] = (uint16_t)mq_div_12289(h[u], tt[u]); - } - mq_iNTT(h, logn); - return 1; -} - -/* see inner.h */ -int -Zf(complete_private)(int8_t *G, - const int8_t *f, const int8_t *g, const int8_t *F, - unsigned logn, uint8_t *tmp) -{ - size_t u, n; - uint16_t *t1, *t2; - - n = (size_t)1 << logn; - t1 = (uint16_t *)tmp; - t2 = t1 + n; - for (u = 0; u < n; u ++) { - t1[u] = (uint16_t)mq_conv_small(g[u]); - t2[u] = (uint16_t)mq_conv_small(F[u]); - } - mq_NTT(t1, logn); - mq_NTT(t2, logn); - mq_poly_tomonty(t1, logn); - mq_poly_montymul_ntt(t1, t2, logn); - for (u = 0; u < n; u ++) { - t2[u] = (uint16_t)mq_conv_small(f[u]); - } - mq_NTT(t2, logn); - for (u = 0; u < n; u ++) { - if (t2[u] == 0) { - return 0; - } - t1[u] = (uint16_t)mq_div_12289(t1[u], t2[u]); - } - mq_iNTT(t1, logn); - for (u = 0; u < n; u ++) { - uint32_t w; - int32_t gi; - - w = t1[u]; - w -= (Q & ~-((w - (Q >> 1)) >> 31)); - gi = *(int32_t *)&w; - if (gi < -127 || gi > +127) { - return 0; - } - G[u] = (int8_t)gi; - } - return 1; -} - -/* see inner.h */ -int -Zf(is_invertible)( - const int16_t *s2, unsigned logn, uint8_t *tmp) -{ - size_t u, n; - uint16_t *tt; - uint32_t r; - - n = (size_t)1 << logn; - tt = (uint16_t *)tmp; - for (u = 0; u < n; u ++) { - uint32_t w; - - w = (uint32_t)s2[u]; - w += Q & -(w >> 31); - tt[u] = (uint16_t)w; - } - mq_NTT(tt, logn); - r = 0; - for (u = 0; u < n; u ++) { - r |= (uint32_t)(tt[u] - 1); - } - return (int)(1u - (r >> 31)); -} - -/* see inner.h */ -int -Zf(verify_recover)(uint16_t *h, - const 
uint16_t *c0, const int16_t *s1, const int16_t *s2, - unsigned logn, uint8_t *tmp) -{ - size_t u, n; - uint16_t *tt; - uint32_t r; - - n = (size_t)1 << logn; - - /* - * Reduce elements of s1 and s2 modulo q; then write s2 into tt[] - * and c0 - s1 into h[]. - */ - tt = (uint16_t *)tmp; - for (u = 0; u < n; u ++) { - uint32_t w; - - w = (uint32_t)s2[u]; - w += Q & -(w >> 31); - tt[u] = (uint16_t)w; - - w = (uint32_t)s1[u]; - w += Q & -(w >> 31); - w = mq_sub(c0[u], w); - h[u] = (uint16_t)w; - } - - /* - * Compute h = (c0 - s1) / s2. If one of the coefficients of s2 - * is zero (in NTT representation) then the operation fails. We - * keep that information into a flag so that we do not deviate - * from strict constant-time processing; if all coefficients of - * s2 are non-zero, then the high bit of r will be zero. - */ - mq_NTT(tt, logn); - mq_NTT(h, logn); - r = 0; - for (u = 0; u < n; u ++) { - r |= (uint32_t)(tt[u] - 1); - h[u] = (uint16_t)mq_div_12289(h[u], tt[u]); - } - mq_iNTT(h, logn); - - /* - * Signature is acceptable if and only if it is short enough, - * and s2 was invertible mod phi mod q. The caller must still - * check that the rebuilt public key matches the expected - * value (e.g. through a hash). 
- */ - r = ~r & (uint32_t)-Zf(is_short)(s1, s2, logn); - return (int)(r >> 31); -} - -/* see inner.h */ -int -Zf(count_nttzero)(const int16_t *sig, unsigned logn, uint8_t *tmp) -{ - uint16_t *s2; - size_t u, n; - uint32_t r; - - n = (size_t)1 << logn; - s2 = (uint16_t *)tmp; - for (u = 0; u < n; u ++) { - uint32_t w; - - w = (uint32_t)sig[u]; - w += Q & -(w >> 31); - s2[u] = (uint16_t)w; - } - mq_NTT(s2, logn); - r = 0; - for (u = 0; u < n; u ++) { - uint32_t w; - - w = (uint32_t)s2[u] - 1u; - r += (w >> 31); - } - return (int)r; -} diff --git a/crypto_sign/fndsa_provisional-1024/m4f/LICENSE b/crypto_sign/fndsa_provisional-1024/m4f/LICENSE new file mode 120000 index 00000000..a8b0b647 --- /dev/null +++ b/crypto_sign/fndsa_provisional-1024/m4f/LICENSE @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/LICENSE \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/api.c b/crypto_sign/fndsa_provisional-1024/m4f/api.c new file mode 120000 index 00000000..925a7669 --- /dev/null +++ b/crypto_sign/fndsa_provisional-1024/m4f/api.c @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/api.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/api.h b/crypto_sign/fndsa_provisional-1024/m4f/api.h new file mode 120000 index 00000000..a128187e --- /dev/null +++ b/crypto_sign/fndsa_provisional-1024/m4f/api.h @@ -0,0 +1 @@ +../../../mupq/crypto_sign/fndsa_provisional-1024/ref/api.h \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/archflags.h b/crypto_sign/fndsa_provisional-1024/m4f/archflags.h new file mode 120000 index 00000000..5995cb74 --- /dev/null +++ b/crypto_sign/fndsa_provisional-1024/m4f/archflags.h @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/archflags.h \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/codec.c b/crypto_sign/fndsa_provisional-1024/m4f/codec.c new file mode 120000 index 00000000..f24de227 --- /dev/null +++ 
b/crypto_sign/fndsa_provisional-1024/m4f/codec.c @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/codec.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/codec_cm4.s b/crypto_sign/fndsa_provisional-1024/m4f/codec_cm4.s new file mode 120000 index 00000000..0c44b8f4 --- /dev/null +++ b/crypto_sign/fndsa_provisional-1024/m4f/codec_cm4.s @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/codec_cm4.s \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/fndsa.h b/crypto_sign/fndsa_provisional-1024/m4f/fndsa.h new file mode 120000 index 00000000..9b987bbd --- /dev/null +++ b/crypto_sign/fndsa_provisional-1024/m4f/fndsa.h @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/fndsa.h \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/inner.h b/crypto_sign/fndsa_provisional-1024/m4f/inner.h new file mode 120000 index 00000000..ccf790d2 --- /dev/null +++ b/crypto_sign/fndsa_provisional-1024/m4f/inner.h @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/inner.h \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/kgen.c b/crypto_sign/fndsa_provisional-1024/m4f/kgen.c new file mode 120000 index 00000000..905593b2 --- /dev/null +++ b/crypto_sign/fndsa_provisional-1024/m4f/kgen.c @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/kgen.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/kgen_fxp.c b/crypto_sign/fndsa_provisional-1024/m4f/kgen_fxp.c new file mode 120000 index 00000000..5a42ccfd --- /dev/null +++ b/crypto_sign/fndsa_provisional-1024/m4f/kgen_fxp.c @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/kgen_fxp.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/kgen_gauss.c b/crypto_sign/fndsa_provisional-1024/m4f/kgen_gauss.c new file mode 120000 index 00000000..56f96573 --- /dev/null +++ b/crypto_sign/fndsa_provisional-1024/m4f/kgen_gauss.c @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/kgen_gauss.c \ No newline at end 
of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/kgen_inner.h b/crypto_sign/fndsa_provisional-1024/m4f/kgen_inner.h new file mode 120000 index 00000000..1fc8ceef --- /dev/null +++ b/crypto_sign/fndsa_provisional-1024/m4f/kgen_inner.h @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/kgen_inner.h \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/kgen_mp31.c b/crypto_sign/fndsa_provisional-1024/m4f/kgen_mp31.c new file mode 120000 index 00000000..f8c33ba5 --- /dev/null +++ b/crypto_sign/fndsa_provisional-1024/m4f/kgen_mp31.c @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/kgen_mp31.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/kgen_ntru.c b/crypto_sign/fndsa_provisional-1024/m4f/kgen_ntru.c new file mode 120000 index 00000000..ba0388c9 --- /dev/null +++ b/crypto_sign/fndsa_provisional-1024/m4f/kgen_ntru.c @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/kgen_ntru.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/kgen_poly.c b/crypto_sign/fndsa_provisional-1024/m4f/kgen_poly.c new file mode 120000 index 00000000..05fd362f --- /dev/null +++ b/crypto_sign/fndsa_provisional-1024/m4f/kgen_poly.c @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/kgen_poly.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/kgen_zint31.c b/crypto_sign/fndsa_provisional-1024/m4f/kgen_zint31.c new file mode 120000 index 00000000..3cb38053 --- /dev/null +++ b/crypto_sign/fndsa_provisional-1024/m4f/kgen_zint31.c @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/kgen_zint31.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/mq.c b/crypto_sign/fndsa_provisional-1024/m4f/mq.c new file mode 120000 index 00000000..a351025f --- /dev/null +++ b/crypto_sign/fndsa_provisional-1024/m4f/mq.c @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/mq.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/mq_cm4.s 
b/crypto_sign/fndsa_provisional-1024/m4f/mq_cm4.s new file mode 120000 index 00000000..f0f60a44 --- /dev/null +++ b/crypto_sign/fndsa_provisional-1024/m4f/mq_cm4.s @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/mq_cm4.s \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/sha3.c b/crypto_sign/fndsa_provisional-1024/m4f/sha3.c new file mode 120000 index 00000000..933b2bda --- /dev/null +++ b/crypto_sign/fndsa_provisional-1024/m4f/sha3.c @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/sha3.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/sha3_cm4.s b/crypto_sign/fndsa_provisional-1024/m4f/sha3_cm4.s new file mode 120000 index 00000000..d1d08be8 --- /dev/null +++ b/crypto_sign/fndsa_provisional-1024/m4f/sha3_cm4.s @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/sha3_cm4.s \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/sign.c b/crypto_sign/fndsa_provisional-1024/m4f/sign.c new file mode 120000 index 00000000..3dbf03a2 --- /dev/null +++ b/crypto_sign/fndsa_provisional-1024/m4f/sign.c @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/sign.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/sign_core.c b/crypto_sign/fndsa_provisional-1024/m4f/sign_core.c new file mode 120000 index 00000000..5bb17b4b --- /dev/null +++ b/crypto_sign/fndsa_provisional-1024/m4f/sign_core.c @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/sign_core.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/sign_fpoly.c b/crypto_sign/fndsa_provisional-1024/m4f/sign_fpoly.c new file mode 120000 index 00000000..a0b083ce --- /dev/null +++ b/crypto_sign/fndsa_provisional-1024/m4f/sign_fpoly.c @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/sign_fpoly.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/sign_fpr.c b/crypto_sign/fndsa_provisional-1024/m4f/sign_fpr.c new file mode 120000 index 00000000..dc3b5cd0 --- /dev/null +++ 
b/crypto_sign/fndsa_provisional-1024/m4f/sign_fpr.c @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/sign_fpr.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/sign_fpr_cm4.s b/crypto_sign/fndsa_provisional-1024/m4f/sign_fpr_cm4.s new file mode 120000 index 00000000..4cecbb3b --- /dev/null +++ b/crypto_sign/fndsa_provisional-1024/m4f/sign_fpr_cm4.s @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/sign_fpr_cm4.s \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/sign_inner.h b/crypto_sign/fndsa_provisional-1024/m4f/sign_inner.h new file mode 120000 index 00000000..3a34addb --- /dev/null +++ b/crypto_sign/fndsa_provisional-1024/m4f/sign_inner.h @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/sign_inner.h \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/sign_sampler.c b/crypto_sign/fndsa_provisional-1024/m4f/sign_sampler.c new file mode 120000 index 00000000..f402644f --- /dev/null +++ b/crypto_sign/fndsa_provisional-1024/m4f/sign_sampler.c @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/sign_sampler.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/sign_sampler_cm4.s b/crypto_sign/fndsa_provisional-1024/m4f/sign_sampler_cm4.s new file mode 120000 index 00000000..bfde9bed --- /dev/null +++ b/crypto_sign/fndsa_provisional-1024/m4f/sign_sampler_cm4.s @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/sign_sampler_cm4.s \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/sysrng.c b/crypto_sign/fndsa_provisional-1024/m4f/sysrng.c new file mode 120000 index 00000000..dfb4edcb --- /dev/null +++ b/crypto_sign/fndsa_provisional-1024/m4f/sysrng.c @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/sysrng.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/util.c b/crypto_sign/fndsa_provisional-1024/m4f/util.c new file mode 120000 index 00000000..bcc36e58 --- /dev/null +++ 
b/crypto_sign/fndsa_provisional-1024/m4f/util.c @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/util.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/vrfy.c b/crypto_sign/fndsa_provisional-1024/m4f/vrfy.c new file mode 120000 index 00000000..9b515e7e --- /dev/null +++ b/crypto_sign/fndsa_provisional-1024/m4f/vrfy.c @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/vrfy.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-512/m4f/LICENSE b/crypto_sign/fndsa_provisional-512/m4f/LICENSE new file mode 120000 index 00000000..53ea2086 --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/LICENSE @@ -0,0 +1 @@ +../../../mupq/crypto_sign/fndsa_provisional-512/ref/LICENSE \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-512/m4f/api.c b/crypto_sign/fndsa_provisional-512/m4f/api.c new file mode 120000 index 00000000..42027e55 --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/api.c @@ -0,0 +1 @@ +../../../mupq/crypto_sign/fndsa_provisional-512/ref/api.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-512/m4f/api.h b/crypto_sign/fndsa_provisional-512/m4f/api.h new file mode 120000 index 00000000..7377f660 --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/api.h @@ -0,0 +1 @@ +../../../mupq/crypto_sign/fndsa_provisional-512/ref/api.h \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-512/m4f/archflags.h b/crypto_sign/fndsa_provisional-512/m4f/archflags.h new file mode 100644 index 00000000..99a4b205 --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/archflags.h @@ -0,0 +1,2 @@ +/* Architecture-specific flags (if any). 
*/ +#define FNDSA_ASM_CORTEXM4 1 diff --git a/crypto_sign/fndsa_provisional-512/m4f/codec.c b/crypto_sign/fndsa_provisional-512/m4f/codec.c new file mode 120000 index 00000000..706495e7 --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/codec.c @@ -0,0 +1 @@ +../../../mupq/crypto_sign/fndsa_provisional-512/ref/codec.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-512/m4f/codec_cm4.s b/crypto_sign/fndsa_provisional-512/m4f/codec_cm4.s new file mode 100644 index 00000000..cb80767d --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/codec_cm4.s @@ -0,0 +1,203 @@ + .syntax unified + .cpu cortex-m4 + .file "mq_cm4.s" + .text + + .equ Q, 12289 + +@ ======================================================================= +@ size_t fndsa_mqpoly_decode(unsigned logn, const uint8_t *f, uint16_t *h) +@ ======================================================================= + + .align 2 + .global fndsa_mqpoly_decode + .thumb + .thumb_func + .type fndsa_mqpoly_decode, %function +fndsa_mqpoly_decode: + push { r4, r5, r6, r7, r8, r10, r11 } + + @ ASSUMPTIONS: + @ - logn >= 2 (hence, n is a multiple of 4) + @ - output buffer is 32-bit aligned + @ We process input by chunks of 7 bytes, to produce 4 values. + @ TODO: try using chunks of 28 bytes when source is aligned; it + @ would avoid most unaligned penalties and save 1/8 of reads. + + @ r0 <- n = 2^logn + movs r3, #1 + lsl r0, r3, r0 + @ r11 <- original source pointer + mov r11, r1 + + @ r3 <- 0x3FFF:0x3FFF + movw r3, 0x3FFF + movt r3, 0x3FFF + @ r10 <- q:q + movw r10, #Q + movt r10, #Q + @ r12 <- 0xFFFFFFFF + @ If any value overflows, then bit 15 or 31 of r12 will be cleared. + mov r12, #0xFFFFFFFF + +fndsa_mqpoly_decode__L1: + @ Get next 7-byte value as integer r7:r5 with big-endian + @ interpretation. 
+ ldr r5, [r1], #3 + ldr r4, [r1], #4 + lsls r5, #8 + rev r5, r5 + rev r4, r4 + @ We assemble the 4 values in r6:r7 (packed 16-bit): + @ x0: r6<0,13> <- r5<10,23> + @ x1: r6<16,19> <- r4<28,31>, r6<20,29> <- r5<0,9> + @ x2: r7<0,13> <- r4<14,27> + @ x3: r7<16,29> <- r4<0,13> + ubfx r6, r5, #10, #14 + bfi r6, r5, #20, #10 + lsrs r7, r4, #28 + orr r6, r6, r7, lsl #16 + lsrs r7, r4, #14 + bfi r7, r4, #16, #14 + ands r6, r3 + ands r7, r3 + @ Update the overflow mask. + usub16 r8, r6, r10 + and r12, r12, r8 + usub16 r8, r7, r10 + and r12, r12, r8 + @ Store the extracted values. + strd r6, r7, [r2], #8 + @ Loop until all values have been decoded. + subs r0, #4 + bne fndsa_mqpoly_decode__L1 + + @ Get output value (number of consumed bytes). + @ Clamp it to 0 on overflow. + sub r0, r1, r11 + and r12, r12, r12, lsl #16 + and r0, r0, r12, asr #31 + + pop { r4, r5, r6, r7, r8, r10, r11 } + bx lr + .size fndsa_mqpoly_decode,.-fndsa_mqpoly_decode + +@ ======================================================================= +@ size_t fndsa_comp_decode(unsigned logn, +@ const uint8_t *d, size_t dlen, int16_t *s) +@ ======================================================================= + + .align 2 + .global fndsa_comp_decode + .thumb + .thumb_func + .type fndsa_comp_decode, %function +fndsa_comp_decode: + push { r4, r5, r6, r7 } + + @ r0 <- n = 2^logn + movs r4, #1 + lsl r0, r4, r0 + @ r2 <- upper bound for d + adds r2, r1 + + @ r4 acc + @ r5 acc_ptr + @ Unprocessed bits are in the top bits of acc. First unprocessed bit + @ is at index acc_ptr + 8. + eors r4, r4 + movs r5, #24 + +fndsa_comp_decode__L1: + @ Invariant: acc_ptr >= 17 (i.e. there are at most 7 unprocessed bits). + + @ Get next 8 bits. + cmp r1, r2 + beq fndsa_comp_decode__Lerr + ldrb r6, [r1], #1 + lsls r6, r5 + orrs r4, r6 + + @ r6 <- low 7 absolute value bits + @ r12 <- sign (word-extended) + ubfx r6, r4, #24, #7 + asr r12, r4, #31 + lsls r4, #8 + + @ We injected 8 bits then consumed 8 bits: acc_ptr is unmodified. 
+ + @ Locate next bit of value 1. If necessary, read one or two + @ extra bytes. Heuristically, values are small, so the fast + @ path is that the extra bit is already there. + cbz r4, fndsa_comp_decode__Lzb1 + clz r7, r4 +fndsa_comp_decode__L2: + @ There are r7 zeros, then a one. r7 <= 15. + add r6, r6, r7, lsl #7 + @ Consume the zeros and the final one. + adds r7, #1 + lsls r4, r7 + adds r5, r7 + @ Mantissa is in r6, sign in r12. Reject "minus zero" encoding, + @ i.e. r6 = 0 and r12 = -1 + orn r7, r6, r12 + cbz r7, fndsa_comp_decode__Lerr + @ We assemble the value in r6 + eor r6, r6, r12 + sub r6, r6, r12 + strh r6, [r3], #2 + + @ Loop until all values have been obtained. + subs r0, #1 + bne fndsa_comp_decode__L1 + + @ Check that remaining unused bits are zero (accumulator and + @ all unused bytes). + movs r0, #1 + cbnz r4, fndsa_comp_decode__Lerr + cmp r1, r2 + beq fndsa_comp_decode__Lexit +fndsa_comp_decode__L3: + ldrb r6, [r1], #1 + cbnz r6, fndsa_comp_decode__Lerr + cmp r1, r2 + bne fndsa_comp_decode__L3 +fndsa_comp_decode__Lexit: + pop { r4, r5, r6, r7 } + bx lr + +fndsa_comp_decode__Lzb1: + @ All currently buffered bits are zero, we must get an extra byte. + @ Get next byte. + cmp r1, r2 + beq fndsa_comp_decode__Lerr + ldrb r7, [r1], #1 + lsls r7, r5 + orrs r4, r7 + cbz r4, fndsa_comp_decode__Lzb2 + subs r5, #8 + clz r7, r4 + b fndsa_comp_decode__L2 + +fndsa_comp_decode__Lzb2: + @ All currently buffered bits are zero, and the next byte was + @ all-zeros too; we must get another byte. + cmp r1, r2 + beq fndsa_comp_decode__Lerr + ldrb r7, [r1], #1 + subs r5, #8 + lsls r7, r5 + orrs r4, r7 + cbz r4, fndsa_comp_decode__Lerr + subs r5, #8 + clz r7, r4 + @ Since we added two bytes and the accumulator already contained + @ up to 7 bits, then we may have up to 23 bits at this point, + @ hence r7 can be up to 22. Values greater than 15 are invalid. + cmp r7, #15 + bls fndsa_comp_decode__L2 + @ Fall through to error sequence. 
+fndsa_comp_decode__Lerr: + eors r0, r0 + b fndsa_comp_decode__Lexit + .size fndsa_comp_decode,.-fndsa_comp_decode diff --git a/crypto_sign/fndsa_provisional-512/m4f/fndsa.h b/crypto_sign/fndsa_provisional-512/m4f/fndsa.h new file mode 120000 index 00000000..52ec0735 --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/fndsa.h @@ -0,0 +1 @@ +../../../mupq/crypto_sign/fndsa_provisional-512/ref/fndsa.h \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-512/m4f/inner.h b/crypto_sign/fndsa_provisional-512/m4f/inner.h new file mode 120000 index 00000000..85121904 --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/inner.h @@ -0,0 +1 @@ +../../../mupq/crypto_sign/fndsa_provisional-512/ref/inner.h \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-512/m4f/kgen.c b/crypto_sign/fndsa_provisional-512/m4f/kgen.c new file mode 120000 index 00000000..74038217 --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/kgen.c @@ -0,0 +1 @@ +../../../mupq/crypto_sign/fndsa_provisional-512/ref/kgen.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-512/m4f/kgen_fxp.c b/crypto_sign/fndsa_provisional-512/m4f/kgen_fxp.c new file mode 120000 index 00000000..aba35701 --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/kgen_fxp.c @@ -0,0 +1 @@ +../../../mupq/crypto_sign/fndsa_provisional-512/ref/kgen_fxp.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-512/m4f/kgen_gauss.c b/crypto_sign/fndsa_provisional-512/m4f/kgen_gauss.c new file mode 120000 index 00000000..50228b17 --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/kgen_gauss.c @@ -0,0 +1 @@ +../../../mupq/crypto_sign/fndsa_provisional-512/ref/kgen_gauss.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-512/m4f/kgen_inner.h b/crypto_sign/fndsa_provisional-512/m4f/kgen_inner.h new file mode 120000 index 00000000..3359c80c --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/kgen_inner.h @@ -0,0 +1 
@@ +../../../mupq/crypto_sign/fndsa_provisional-512/ref/kgen_inner.h \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-512/m4f/kgen_mp31.c b/crypto_sign/fndsa_provisional-512/m4f/kgen_mp31.c new file mode 120000 index 00000000..0b7b5e55 --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/kgen_mp31.c @@ -0,0 +1 @@ +../../../mupq/crypto_sign/fndsa_provisional-512/ref/kgen_mp31.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-512/m4f/kgen_ntru.c b/crypto_sign/fndsa_provisional-512/m4f/kgen_ntru.c new file mode 120000 index 00000000..d58fdd5a --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/kgen_ntru.c @@ -0,0 +1 @@ +../../../mupq/crypto_sign/fndsa_provisional-512/ref/kgen_ntru.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-512/m4f/kgen_poly.c b/crypto_sign/fndsa_provisional-512/m4f/kgen_poly.c new file mode 120000 index 00000000..7af990e2 --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/kgen_poly.c @@ -0,0 +1 @@ +../../../mupq/crypto_sign/fndsa_provisional-512/ref/kgen_poly.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-512/m4f/kgen_zint31.c b/crypto_sign/fndsa_provisional-512/m4f/kgen_zint31.c new file mode 120000 index 00000000..b8f3907a --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/kgen_zint31.c @@ -0,0 +1 @@ +../../../mupq/crypto_sign/fndsa_provisional-512/ref/kgen_zint31.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-512/m4f/mq.c b/crypto_sign/fndsa_provisional-512/m4f/mq.c new file mode 120000 index 00000000..b881f59c --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/mq.c @@ -0,0 +1 @@ +../../../mupq/crypto_sign/fndsa_provisional-512/ref/mq.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-512/m4f/mq_cm4.s b/crypto_sign/fndsa_provisional-512/m4f/mq_cm4.s new file mode 100644 index 00000000..e9ba11df --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/mq_cm4.s @@ -0,0 
+1,758 @@ + .syntax unified + .cpu cortex-m4 + .file "mq_cm4.s" + .text + + .equ Q, 12289 + .equ Q1I, 4143984639 + .equ R, 10952 + .equ R2, 5664 + +@ ======================================================================= +@ void fndsa_mqpoly_small_to_int(unsigned logn, const int8_t *f, uint16_t *d) +@ ======================================================================= + + .align 2 + .global fndsa_mqpoly_small_to_int + .thumb + .thumb_func + .type fndsa_mqpoly_small_to_int, %function +fndsa_mqpoly_small_to_int: + @ This implementation actually normalizes to [1,q] (strict internal + @ representation). + push { r4, r5, r6, r7 } + @ Set r3 to n = 2^logn + movs r3, #1 + lsls r3, r0 + @ Set both halves of r0 to Q + movw r0, #Q + movt r0, #Q + @ Set both halves of r7 to 1 + mov r7, #0x00010001 +fndsa_mqpoly_small_to_int__L1: + @ Get next four source bytes. + ldr r5, [r1], #4 + @ Expand bytes to 16-bit each; for each byte whose value is negative + @ or zero, we need to add Q. + sxtb16 r4, r5 + sadd16 r6, r4, r0 + ssub16 r12, r4, r7 + sel r4, r4, r6 + sxtb16 r5, r5, ror #8 + sadd16 r6, r5, r0 + ssub16 r12, r5, r7 + sel r5, r5, r6 + @ We need to interleave the values to get them in the right order. + pkhtb r6, r5, r4, asr #16 + pkhbt r5, r4, r5, lsl #16 + @ We can use strd because the caller ensured that the output is + @ aligned. + strd r5, r6, [r2], #8 + subs r3, #4 + bne fndsa_mqpoly_small_to_int__L1 + + pop { r4, r5, r6, r7 } + bx lr + .size fndsa_mqpoly_small_to_int,.-fndsa_mqpoly_small_to_int + +@ ======================================================================= +@ void fndsa_mqpoly_signed_to_int(unsigned logn, uint16_t *d) +@ ======================================================================= + + .align 2 + .global fndsa_mqpoly_signed_to_int + .thumb + .thumb_func + .type fndsa_mqpoly_signed_to_int, %function +fndsa_mqpoly_signed_to_int: + @ This implementation actually normalizes to [1,q] (strict internal + @ representation). 
+ push { r4, r5, r6 } + movs r3, #1 + lsls r3, r0 + @ Set both halves of r0 to Q + movw r0, #Q + movt r0, #Q + @ Set both halves of r2 to 1 + mov r2, #0x00010001 +fndsa_mqpoly_signed_to_int__L1: + @ We can use ldrd because the caller ensured that the input is + @ aligned. + ldrd r4, r5, [r1] + @ For each word half, we want to add q if the value is negative or 0. + sadd16 r6, r4, r0 + ssub16 r12, r4, r2 + sel r4, r4, r6 + sadd16 r6, r5, r0 + ssub16 r12, r5, r2 + sel r5, r5, r6 + strd r4, r5, [r1], #8 + subs r3, #4 + bne fndsa_mqpoly_signed_to_int__L1 + + pop { r4, r5, r6 } + bx lr + .size fndsa_mqpoly_signed_to_int,.-fndsa_mqpoly_signed_to_int + +@ ======================================================================= +@ void fndsa_mqpoly_int_to_ext(unsigned logn, uint16_t *d) +@ ======================================================================= + + .align 2 + .global fndsa_mqpoly_int_to_ext + .thumb + .thumb_func + .type fndsa_mqpoly_int_to_ext, %function +fndsa_mqpoly_int_to_ext: + push.w { r4, r5, r6 } + movs r3, #1 + lsls r3, r0 + @ Set both halves of r0 to Q + movw r0, #Q + movt r0, #Q + @ Set both halves of r2 to 0xFFFF + mov r2, #0xFFFFFFFF + @ Set r6 to zero + movw r6, #0 +fndsa_mqpoly_int_to_ext__L1: + @ We can use ldrd because the caller ensured that the input is + @ aligned. + ldrd r4, r5, [r1] + @ Each word half equal to q must be set to 0; others are untouched. 
+ ssub16 r12, r4, r0 + sel r4, r6, r4 + ssub16 r12, r5, r0 + sel r5, r6, r5 + strd r4, r5, [r1], #8 + subs r3, #4 + bne fndsa_mqpoly_int_to_ext__L1 + + pop { r4, r5, r6 } + bx lr + .size fndsa_mqpoly_int_to_ext,.-fndsa_mqpoly_int_to_ext + +@ ======================================================================= +@ void fndsa_mqpoly_mul_ntt(unsigned logn, uint16_t *a, const uint16_t *b) +@ ======================================================================= + + .align 2 + .global fndsa_mqpoly_mul_ntt + .thumb + .thumb_func + .type fndsa_mqpoly_mul_ntt, %function +fndsa_mqpoly_mul_ntt: + push.w { r4, r5, r6, r7, r8, r10, lr } + movs r3, #1 + lsls r3, r0 + + @ r10 <- q + movw r10, #Q + @ r14 <- -1/q mod 2^32 + movw r14, #(Q1I & 0xFFFF) + movt r14, #(Q1I >> 16) + @ r0 <- 2^64 mod q + movw r0, #R2 + + @ r12 is a temporary + +fndsa_mqpoly_mul_ntt__L1: + @ A sequence of four ldr is faster than two ldrd or two ldm. + ldr r5, [r1] + ldr r6, [r1, #4] + ldr.w r7, [r2], #4 + ldr.w r8, [r2], #4 + + @ First pair of words (r5 and r7) + @ Products over integers. + smulbb r4, r5, r7 + smultt r5, r5, r7 + @ Montgomery reduction. + mul r12, r4, r14 + umaal r14, r4, r12, r10 + mul r12, r5, r14 + umaal r14, r5, r12, r10 + @ Conversion Montgomery -> normal + muls r4, r0 + muls r5, r0 + mul r12, r4, r14 + umaal r14, r4, r12, r10 + mul r12, r5, r14 + umaal r14, r5, r12, r10 + @ Repack the two output values and write word + pkhbt r7, r4, r5, lsl #16 + str.w r7, [r1], #4 + + @ Second pair of words (r6 and r8) + @ Products over integers. + smulbb r4, r6, r8 + smultt r5, r6, r8 + @ Montgomery reduction. + mul r12, r4, r14 + umaal r14, r4, r12, r10 + mul r12, r5, r14 + umaal r14, r5, r12, r10 + @ Conversion Montgomery -> normal + muls r4, r0 + muls r5, r0 + mul r12, r4, r14 + umaal r14, r4, r12, r10 + mul r12, r5, r14 + umaal r14, r5, r12, r10 + @ Repack the two output values and write word + pkhbt r8, r4, r5, lsl #16 + str.w r8, [r1], #4 + + @ Store the four output values. 
+ subs r3, #4 + bne fndsa_mqpoly_mul_ntt__L1 + + pop { r4, r5, r6, r7, r8, r10, pc } + .size fndsa_mqpoly_mul_ntt,.-fndsa_mqpoly_mul_ntt + +@ ======================================================================= +@ void fndsa_mqpoly_sub(unsigned logn, uint16_t *a, const uint16_t *b) +@ ======================================================================= + + .align 2 + .global fndsa_mqpoly_sub + .thumb + .thumb_func + .type fndsa_mqpoly_sub, %function +fndsa_mqpoly_sub: + push.w { r4, r5, r6, r7, lr } + movs r3, #1 + lsls r3, r0 + + @ r0 <- 0 + movw r0, #0 + @ r14 <- q (both halves) + movw r14, #Q + movt r14, #Q + +fndsa_mqpoly_sub__L1: + @ Four ldr are faster than two ldrd or two ldm. + ldr r4, [r1] + ldr r5, [r1, #4] + ldr.w r6, [r2], #4 + ldr.w r7, [r2], #4 + + @ We do the subtraction over the integers, then add q back if + @ the result is negative. + ssub16 r4, r4, r6 + sel r12, r0, r14 + sadd16 r4, r4, r12 + str.w r4, [r1], #4 + ssub16 r5, r5, r7 + sel r12, r0, r14 + sadd16 r5, r5, r12 + str.w r5, [r1], #4 + + subs r3, #4 + bne fndsa_mqpoly_sub__L1 + + pop { r4, r5, r6, r7, pc } + .size fndsa_mqpoly_sub,.-fndsa_mqpoly_sub + +@ ======================================================================= +@ uint32_t fndsa_mqpoly_sqnorm_signed(unsigned logn, const uint16_t *a) +@ ======================================================================= + + .align 2 + .global fndsa_mqpoly_sqnorm_signed + .thumb + .thumb_func + .type fndsa_mqpoly_sqnorm_signed, %function +fndsa_mqpoly_sqnorm_signed: + movs r3, #1 + lsls r3, r0 + + movw r0, #0 +fndsa_mqpoly_sqnorm_signed__L1: + @ We can use ldrd because the caller ensured that the input is + @ aligned. 
+ ldrd r2, r12, [r1], #8 + smlad r0, r2, r2, r0 + smlad r0, r12, r12, r0 + @ The whole operation cannot overflow in unsigned convention, + @ since signed values are at most 2047 (in absolute value) and + @ there are at most 1024 of them, hence a maximum squared norm + @ of 1024*2047*2047 = 4290774016, which fits on 32 bits. + subs r3, #4 + bne fndsa_mqpoly_sqnorm_signed__L1 + + bx lr + .size fndsa_mqpoly_sqnorm_signed,.-fndsa_mqpoly_sqnorm_signed + +@ ======================================================================= +@ uint32_t fndsa_mqpoly_sqnorm_ext(unsigned logn, const uint16_t *a) +@ ======================================================================= + + .align 2 + .global fndsa_mqpoly_sqnorm_ext + .thumb + .thumb_func + .type fndsa_mqpoly_sqnorm_ext, %function +fndsa_mqpoly_sqnorm_ext: + push.w { r4, r5, r6, r7 } + movs r3, #1 + lsls r3, r0 + + @ r5 <- q (in both halves) + movw r5, #Q + movt r5, #Q + @ r6 <- ceil(q/2) (in both halves) + movw r6, #((Q + 1) >> 1) + movt r6, #((Q + 1) >> 1) + + movw r0, #0 + @ We clear the Q flag, which we will use to detect overflows. + msr APSR_nzcvq, r0 +fndsa_mqpoly_sqnorm_ext__L1: + @ We can use ldrd because the caller ensured that the input is + @ aligned. + ldrd r2, r4, [r1], #8 + + @ Normalize values to [-q/2,+q/2] + ssub16 r7, r2, r5 + ssub16 r12, r2, r6 + sel r2, r7, r2 + ssub16 r7, r4, r5 + ssub16 r12, r4, r6 + sel r4, r7, r4 + @ If any addition overflows (signed interpretation), then the Q + @ flag will be set. 
+ smlad r0, r2, r2, r0 + smlad r0, r4, r4, r0 + subs r3, #4 + bne fndsa_mqpoly_sqnorm_ext__L1 + + @ If the Q flag is set, saturate the returned value to 0xFFFFFFFF + mrs r1, APSR + sbfx r1, r1, #27, #1 + orrs r0, r1 + + pop { r4, r5, r6, r7 } + bx lr + .size fndsa_mqpoly_sqnorm_ext,.-fndsa_mqpoly_sqnorm_ext + +@ ======================================================================= +@ void fndsa_mqpoly_int_to_ntt(unsigned logn, uint16_t *d) +@ ======================================================================= + + .align 2 + .global fndsa_mqpoly_int_to_ntt + .thumb + .thumb_func + .type fndsa_mqpoly_int_to_ntt, %function +fndsa_mqpoly_int_to_ntt: + push.w { r4, r5, r6, r7, r8, r10, r11, lr } + + @ ASSUMPTION: logn >= 2 + + @ State: + @ r0 0 + @ r1 &d[j1] + @ r2 t = ht*2 + @ r3 middle loop counter + @ r6 s + @ r7 innermost loop counter + @ r8 &mq_GM[i + m] + @ r10 q + @ r11 q:q + @ r12 scratch + @ r14 -1/q mod 2^32 + @ + @ s2 d + @ s3 m + + vmov s2, r1 @ original &d[0] + movs r2, #1 + lsls r2, r0 @ r2 <- t = ht*2 (initially equal to n) + movw r0, #1 @ m <- 1 + vmov s3, r0 + + @ Constants. + @ r0 <- 0 + movw r0, #0 + @ r10 <- q + movw r10, #Q + @ r11 <- q (both halves) + orr r11, r10, r10, lsl #16 + @ r14 <- -1/q mod 2^32 + movw r14, #(Q1I & 0xFFFF) + movt r14, #(Q1I >> 16) + + @ r8 <- &mq_GM[1] + adr r8, fndsa_mqpoly_int_to_ntt__gmaddr_plus1 + ldr r8, [r8] + + @ If n = 4, then skip directly to the specialized code for the + @ last two iterations. + cmp r2, #4 + beq fndsa_mqpoly_int_to_ntt__L4 + +fndsa_mqpoly_int_to_ntt__L1: + @ Middle loop has m iterations. + vmov r3, s3 + lsl r6, r3, #1 @ prepare m for next iteration + vmov s3, r6 +fndsa_mqpoly_int_to_ntt__L2: + ldrh r6, [r8], #2 @ s <- mq_GM[i + m] + lsr r7, r2, #1 @ r7 <- ht +fndsa_mqpoly_int_to_ntt__L3: + @ Each inner loop iteration processes two pairs (x1,x2) and (y1,y2). 
+ ldr.w r4, [r1, r2] @ r4 <- x2:y2 + + @ r5 <- mmul(y2, s) + smultb r5, r4, r6 + mul r12, r5, r14 + umaal r14, r5, r12, r10 + @ r4 <- mmul(x2, s) + smulbb r4, r4, r6 + mul r12, r4, r14 + umaal r14, r4, r12, r10 + @ r5 <- x2:y2 + pkhbt r5, r4, r5, lsl #16 + + @ r4 <- x1:y1 + ldr.w r4, [r1] @ r4 <- x1:y1 + + @ d[j1] <- x1+x2 : y1+y2 + @ d[j2] <- x1-x2 : y1-y2 + sadd16 r12, r4, r5 + ssub16 r5, r4, r5 + sel r4, r0, r11 + sadd16 r5, r5, r4 + str.w r5, [r1, r2] + ssub16 r4, r12, r11 + sel r4, r4, r12 + str.w r4, [r1], #4 + + @ loop ht/2 times + subs r7, #2 + bne fndsa_mqpoly_int_to_ntt__L3 + + @ --------------------------- + + @ j0 <- j0 + t + @ j0 is implicit in r1, which has been increased for ht elements, + @ hence we add ht here (ht*2, since elements are 2-byte values) + add.w r1, r1, r2 + @ We loop m times + subs r3, #1 + bne fndsa_mqpoly_int_to_ntt__L2 + + @ r1 now contains &d[n], we must reset it to &d[0] for the next + @ iteration. + vmov r1, s2 + + @ replace t with ht + @ Loop until t reaches 2 + lsr r2, r2, #1 + cmp r2, #4 + bne fndsa_mqpoly_int_to_ntt__L1 + +fndsa_mqpoly_int_to_ntt__L4: + @ Last two outer iterations use specialized code. + @ m = n/4 + @ t = 4 + @ We do n/4 inner iterations, each processing four consecutive values. + + @ Loop counter (m = n/4). + vmov r3, s3 + + @ We need two pointers to read s values; we use r8 and r7. + @ At this point, r8 is correct (&mq_GM[m]) and we set r7 to + @ &mq_GM[2*m] by adding 2*m (in bytes) to r8. + add r7, r8, r3, lsl #1 + + @ r2 is free, since we know it contains 4. 
+ +fndsa_mqpoly_int_to_ntt__L5: + @ Next-to-last outer iteration: the four values are, in RAM order: + @ x1 y1 x2 y2 + @ We load x2:y2 (into r5) and s (into r6) + ldr.w r5, [r1, #4] + ldrh r6, [r8], #2 + + @ r4 <- mmul(x2, s) + smulbb r4, r5, r6 + mul r12, r4, r14 + umaal r14, r4, r12, r10 + @ r5 <- mmul(y2, s) + smultb r5, r5, r6 + mul r12, r5, r14 + umaal r14, r5, r12, r10 + @ r5 <- mmul(x2, s) : mmul(y2, s) + pkhbt r5, r4, r5, lsl #16 + + @ Load x1:y1 (into r4) + ldr.w r4, [r1] + + @ r4 <- (x1+mmul(x2,s)):(y1+mmul(y2,s)) + @ r5 <- (x1-mmul(x2,s)):(y1-mmul(y2,s)) + sadd16 r12, r4, r5 + ssub16 r5, r4, r5 + sel r4, r0, r11 + sadd16 r5, r5, r4 + ssub16 r4, r12, r11 + sel r4, r4, r12 + + @ Last iteration: the four values are, in RAM order: x1 x2 y1 y2 + @ The values have not been really written to RAM, though; they + @ are in r4 (x1:x2) and r5 (y1:y2). + @ Get the two relevant s values into r6. + ldr r6, [r7], #4 + + @ r2 <- x1:y1 + pkhbt r2, r4, r5, lsl #16 + @ r5 <- mmul(x2,s):mmul(y2,s) + smultb r4, r4, r6 + mul r12, r4, r14 + umaal r14, r4, r12, r10 + smultt r5, r5, r6 + mul r12, r5, r14 + umaal r14, r5, r12, r10 + pkhbt r5, r4, r5, lsl #16 + @ r4 <- (x1+mmul(x2,s):(y1+mmul(y2,s)) + sadd16 r4, r2, r5 + ssub16 r12, r4, r11 + sel r4, r12, r4 + @ r5 <- (x1-mmul(x2,s):(y1-mmul(y2,s)) + ssub16 r5, r2, r5 + sel r12, r0, r11 + sadd16 r5, r5, r12 + + @ We write the four final values in x1 x2 y1 y2 order. 
+ pkhbt r12, r4, r5, lsl #16 + str.w r12, [r1], #4 + pkhtb r12, r5, r4, asr #16 + str.w r12, [r1], #4 + + subs r3, #1 + bne fndsa_mqpoly_int_to_ntt__L5 + +fndsa_mqpoly_int_to_ntt__Lend: + pop { r4, r5, r6, r7, r8, r10, r11, pc } + .align 2 +fndsa_mqpoly_int_to_ntt__gmaddr_plus1: + .word fndsa_mq_GM + 2 + .size fndsa_mqpoly_int_to_ntt,.-fndsa_mqpoly_int_to_ntt + +@ ======================================================================= +@ void fndsa_mqpoly_ntt_to_int(unsigned logn, uint16_t *d) +@ ======================================================================= + + .align 2 + .global fndsa_mqpoly_ntt_to_int + .thumb + .thumb_func + .type fndsa_mqpoly_ntt_to_int, %function +fndsa_mqpoly_ntt_to_int: + push.w { r4, r5, r6, r7, r8, r10, r11, lr } + + @ ASSUMPTION: logn >= 2 + + @ State: + @ r0 scratch + @ r1 &d[j1] + @ r2 dt = 2*t + @ r3 middle loop counter + @ r4 x1 + @ r5 x2 + @ r6 s + @ r7 innermost loop counter + @ r8 &mq_GM[i + hm] + @ r10 q + @ r11 q:q + @ r12 scratch + @ r14 -1/q mod 2^32 + @ + @ s2 d + @ s3 m + + @ We save the original d in s2. + vmov s2, r1 + @ m = n initially; we save m/2 to s3, and set r8 to &mq_iGM[m/2] + adr r8, fndsa_mqpoly_ntt_to_int__igmaddr + ldr r8, [r8] + movs r3, #1 + subs r0, #1 + lsl r0, r3, r0 @ r0 <- n/2 = 2^(logn-1) + add.w r8, r8, r0 @ r8 <- &mq_iGM[n/4] + lsr r3, r0, #1 + vmov s3, r3 @ s3 <- n/4 + + @ r0 <- 0 + movw r0, #0 + @ r10 <- q + movw r10, #Q + @ r11 <- q:q + orr r11, r10, r10, lsl #16 + @ r14 <- -1/q mod 2^32 + movw r14, #(Q1I & 0xFFFF) + movt r14, #(Q1I >> 16) + + @ r8 is the pointer into mq_GM[] for the second outer iteration. + @ r7 is the pointer into mq_GM[] for the first outer iteration. + add r7, r8, r3, lsl #1 + + @ r3 is the loop counter. r10, r11 and r14 are constants used for + @ modular reduction. r2, r4, r5 and r12 are scratch. + + @ First two iterations are specialized. +fndsa_mqpoly_ntt_to_int__L0: + @ First iteration: values are in x1 x2 y1 y2 order. 
+ ldr r2, [r1] @ r2 <- x1:x2 + ldr r5, [r1, #4] @ r5 <- y1:y2 + ldr.w r6, [r7], #4 @ r6 <- s1:s2 + + @ r4 <- x1:y1 + pkhbt r4, r2, r5, lsl #16 + @ r5 <- x2:y2 + pkhtb r5, r5, r2, asr #16 + @ r2 <- (x1+x2)/2:(y1+y2)/2 + sadd16 r2, r4, r5 + ssub16 r12, r2, r11 + sel r2, r12, r2 + and r12, r2, #0x00010001 + umlal r2, r12, r12, r10 + lsr.w r2, r2, #1 + @ r5 <- (x1-x2):(y1-y2) + ssub16 r5, r4, r5 + sel r12, r0, r11 + sadd16 r5, r5, r12 + @ r4 <- mmul(x1-x2,s) + smulbb r4, r5, r6 + mul r12, r4, r14 + umaal r14, r4, r12, r10 + @ r5 <- mmul(y1-y2,s) + smultt r5, r5, r6 + mul r12, r5, r14 + umaal r14, r5, r12, r10 + + @ Second iteration. Normally we get x1 y1 x2 y2 from RAM; here, + @ we have x1:x2 in r2, y1 in r4 and y2 in r5. + @ Reorganize the values: + pkhbt r4, r2, r4, lsl #16 @ r4 <- x1:y1 + lsl.w r5, r5, #16 + orr r5, r5, r2, lsr #16 @ r5 <- x2:y2 + @ Read s for the second iteration. + ldrh r6, [r8], #2 + + @ r2 <- (x1+x2)/2:(y1+y2)/2 + sadd16 r2, r4, r5 + ssub16 r12, r2, r11 + sel r2, r12, r2 + and r12, r2, #0x00010001 + umlal r2, r12, r12, r10 + lsr.w r2, r2, #1 + @ r5 <- (x1-x2):(y1-y2) + ssub16 r5, r4, r5 + sel r12, r0, r11 + sadd16 r5, r5, r12 + @ r4 <- mmul(x1-x2,s) + smulbb r4, r5, r6 + mul r12, r4, r14 + umaal r14, r4, r12, r10 + @ r5 <- mmul(y1-y2,s) + smultb r5, r5, r6 + mul r12, r5, r14 + umaal r14, r5, r12, r10 + + @ Repack values, to write them in x1 y1 x2 y2 order. + str.w r2, [r1], #4 + pkhbt r4, r4, r5, lsl #16 + str.w r4, [r1], #4 + + @ Loop n/4 times. + subs r3, #1 + bne fndsa_mqpoly_ntt_to_int__L0 + + @ Prepare for remaining iterations. + @ r2 <- -2*t = -8 + movs r2, #8 + rsbs r2, #0 + @ r3 <- m (for next iteration) + vmov r3, s3 + + @ If logn=2 then m=1 and we are finished. + cmp r3, #1 + beq fndsa_mqpoly_ntt_to_int__Lend + +fndsa_mqpoly_ntt_to_int__L1: + @ Rewind r1 to start of array. + vmov r1, s2 + + @ m is in r3. r8 was left at &mq_iGM[2*m]; we need to adjust it + @ to &mq_iGM[m/2], by subtracting 3*m (each element is two bytes). 
+ sub r8, r8, r3, lsl #1 + sub.w r8, r8, r3 + + @ Middle loop has m/2 iterations (r3 is used as counter). +fndsa_mqpoly_ntt_to_int__L2: + ldrh r6, [r8], #2 @ s <- mq_iGM[i + m/2] + + asrs r7, r2, #1 @ r7 <- -t + @ We use r1 to point to the second pair (x2:y2); r2 is negative. + @ The inner loop will inherently adjust r1 to point to the start + @ of the next chunk for the next middle loop iteration. + subs r1, r1, r2 + +fndsa_mqpoly_ntt_to_int__L3: + @ Each inner loop iteration processes two pairs (x1,x2) and (y1,y2). + ldr r4, [r1, r2] @ r4 <- x1:y1 + ldr r5, [r1] @ r5 <- x2:y2 + + @ r4 <- (x1+x2):(y1+y2) + @ r5 <- (x1-x2):(y1-x2) + sadd16 r12, r4, r5 + ssub16 r5, r4, r5 + sel r4, r0, r11 + sadd16 r5, r5, r4 + ssub16 r4, r12, r11 + sel r4, r4, r12 + @ r4 <- (x1+x2)/2:(y1+y2)/2 + and r12, r4, #0x00010001 + umlal r4, r12, r12, r10 + lsr.w r4, r4, #1 + @ Write first output word + str.w r4, [r1, r2] + + @ r5 <- mmul(x1-x2,s):mmul(y1-y2,s) + smulbb r4, r5, r6 + mul r12, r4, r14 + umaal r14, r4, r12, r10 + smultb r5, r5, r6 + mul r12, r5, r14 + umaal r14, r5, r12, r10 + pkhbt r5, r4, r5, lsl #16 + @ Write second output word + str.w r5, [r1], #4 + + @ We should do t iterations, but since we process a pair of elements + @ each time, we only do t/2 iterations. Take care that the r7 counter + @ is negative. + adds r7, #2 + bne fndsa_mqpoly_ntt_to_int__L3 + + @ We loop m/2 times + subs r3, #2 + bne fndsa_mqpoly_ntt_to_int__L2 + + @ Replace -t with -dt = 2*(-t) + lsl.w r2, r2, #1 + + @ Replace m with m/2. We are finished when m becomes 1. 
+ vmov r3, s3 + lsr.w r3, r3, #1 + vmov s3, r3 + cmp r3, #1 + bne fndsa_mqpoly_ntt_to_int__L1 + +fndsa_mqpoly_ntt_to_int__Lend: + pop { r4, r5, r6, r7, r8, r10, r11, pc } + .align 2 +fndsa_mqpoly_ntt_to_int__igmaddr: + .word fndsa_mq_iGM + .size fndsa_mqpoly_ntt_to_int,.-fndsa_mqpoly_ntt_to_int diff --git a/crypto_sign/fndsa_provisional-512/m4f/sha3.c b/crypto_sign/fndsa_provisional-512/m4f/sha3.c new file mode 120000 index 00000000..07f53388 --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/sha3.c @@ -0,0 +1 @@ +../../../mupq/crypto_sign/fndsa_provisional-512/ref/sha3.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-512/m4f/sha3_cm4.s b/crypto_sign/fndsa_provisional-512/m4f/sha3_cm4.s new file mode 100644 index 00000000..fce3fd00 --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/sha3_cm4.s @@ -0,0 +1,1061 @@ + .syntax unified + .cpu cortex-m4 + .file "sha3_cm4.s" + .text + +@ ======================================================================= +@ void fndsa_sha3_inject_chunk(void *dst, const void *src, size_t len) +@ ======================================================================= + + .align 2 + .global fndsa_sha3_inject_chunk + .thumb + .thumb_func + .type fndsa_sha3_inject_chunk, %function +fndsa_sha3_inject_chunk: + push { r4, r5 } + + @ If less than 8 bytes to inject, do it byte-by-byte. + cmp r2, #8 + blo fndsa_sha3_inject_chunk__L4 + + @ Process some bytes until the destination is aligned. + rsbs r5, r0, #0 + ands r5, r5, #3 + beq fndsa_sha3_inject_chunk__L2 + subs r2, r5 +fndsa_sha3_inject_chunk__L1: + ldrb.w r3, [r0] + ldrb r4, [r1], #1 + eor r3, r3, r4 + strb r3, [r0], #1 + subs r5, #1 + bne fndsa_sha3_inject_chunk__L1 + +fndsa_sha3_inject_chunk__L2: + @ Destination is aligned. Source might be unaligned, but the + @ Cortex-M4 tolerates unaligns accesses with a penalty which is + @ lower than doing word reassembly in software. 
+ lsr r5, r2, #2 +fndsa_sha3_inject_chunk__L3: + ldr.w r3, [r0] + ldr r4, [r1], #4 + eor r3, r3, r4 + str r3, [r0], #4 + subs r5, #1 + bne fndsa_sha3_inject_chunk__L3 + + @ We may have a remaining tail of up to 3 bytes. + ands r2, r2, #3 + beq.w fndsa_sha3_inject_chunk__L5 + +fndsa_sha3_inject_chunk__L4: + @ Byte-by-byte processing for the data tail. + ldrb.w r3, [r0] + ldrb r4, [r1], #1 + eor r3, r3, r4 + strb r3, [r0], #1 + subs r2, #1 + bne fndsa_sha3_inject_chunk__L4 + +fndsa_sha3_inject_chunk__L5: + pop { r4, r5 } + bx lr + .size fndsa_sha3_inject_chunk,.-fndsa_sha3_inject_chunk + +@ ======================================================================= +@ bit_split_5(uint64_t x0, uint64_t x1, uint64_t x2, uint64_t x3, uint64_t x4) +@ Split inputs x0 to x4 into even-indexed and odd-indexed bits. +@ Internal function only; non-standard ABI: +@ input: +@ r0:r1 x0 +@ r2:r3 x1 +@ r4:r5 x2 +@ r6:r7 x3 +@ r10:r11 x4 +@ ASPR.GE flags must have pattern 0110. +@ +@ output: +@ r0 even-indexed bits of x0 +@ r1 odd-indexed bits of x0 +@ r2 even-indexed bits of x1 +@ r3 odd-indexed bits of x1 +@ r4 even-indexed bits of x2 +@ r5 odd-indexed bits of x2 +@ r6 even-indexed bits of x3 +@ r7 odd-indexed bits of x3 +@ r10 even-indexed bits of x4 +@ r11 odd-indexed bits of x4 +@ clobbers: +@ r8, r14 +@ +@ bit_split_1, bit_split_2, bit_split_3 and bit_split_4 are alternate +@ entry points that process only the first 1, 2, 3 or 4 words. +@ ======================================================================= + + @ This macro splits a word (input register xx) into its + @ even-indexed bits (into the low half of output register dd) + @ and odd-indexed bits (high half of dd). + @ This macro assumes that the ASPR.GE flags have the 0110 pattern. + @ dd and xx cannot be the same register. xx is consumed. 
+.macro BIT_SPLIT_32 xx, dd + eor \dd, \xx, \xx, lsr #1 + and \dd, \dd, #0x22222222 + eor \xx, \xx, \dd + eor \xx, \xx, \dd, lsl #1 + eor \dd, \xx, \xx, lsr #2 + and \dd, \dd, #0x0C0C0C0C + eor \xx, \xx, \dd + eor \xx, \xx, \dd, lsl #2 + eor \dd, \xx, \xx, lsr #4 + and \dd, \dd, #0x00F000F0 + eor \xx, \xx, \dd + eor \xx, \xx, \dd, lsl #4 + rev \dd, \xx + sel \dd, \dd, \xx +.endm + + @ Split a 64-bit value x0:x1 into its even-indexed bits (into x0) + @ and high-indexed bits (into x1). xt is a scratch register. + @ This macro assumes that the ASPR.GE flags have the 0110 pattern. +.macro BIT_SPLIT_64 x0, x1, xt + BIT_SPLIT_32 \x0, \xt + BIT_SPLIT_32 \x1, \x0 + pkhtb \x1, \x0, \xt, asr #16 + pkhbt \x0, \xt, \x0, lsl #16 +.endm + + .align 2 + .thumb + .thumb_func + .type bit_split_5, %function +bit_split_5: + BIT_SPLIT_64 r10, r11, r8 +bit_split_4: + BIT_SPLIT_64 r6, r7, r8 +bit_split_3: + BIT_SPLIT_64 r4, r5, r8 +bit_split_2: + BIT_SPLIT_64 r2, r3, r8 +bit_split_1: + BIT_SPLIT_64 r0, r1, r8 + bx lr + .size bit_split_5, .-bit_split_5 + +@ ======================================================================= +@ bit_merge_5(uint64_t x0, uint64_t x1, uint64_t x2, uint64_t x3, uint64_t x4) +@ Merge inputs x0 to x4 with bit interleaving. For i = 0 to 4, the +@ low word of x_i contains the even-indexed bits, and the high word +@ contains the odd-indexed bits. +@ Internal function only; non-standard ABI: +@ input: +@ r0:r1 x0 +@ r2:r3 x1 +@ r4:r5 x2 +@ r6:r7 x3 +@ r10:r11 x4 +@ ASPR.GE flags must have pattern 0110. +@ +@ output: +@ r0:r1 merged x0 +@ r2:r3 merged x1 +@ r4:r5 merged x2 +@ r6:r7 merged x3 +@ r10:r11 merged x4 +@ clobbers: +@ r8, r14 +@ +@ bit_merge_1, bit_merge_2, bit_merge_3 and bit_merge_4 are alternate +@ entry points that process only the first 1, 2, 3 or 4 words. 
+@ ======================================================================= + + @ This macro merges a word (input register xx): low half yields + @ the even-indexed bits, and hight half provides the odd-indexed + @ bits. Output is written into register dd. + @ This macro assumes that the ASPR.GE flags have the 0110 pattern. + @ dd and xx cannot be the same register. xx is consumed. +.macro BIT_MERGE_32 xx, dd + rev \dd, \xx + sel \xx, \dd, \xx + eor \dd, \xx, \xx, lsr #4 + and \dd, \dd, #0x00F000F0 + eor \xx, \xx, \dd + eor \xx, \xx, \dd, lsl #4 + eor \dd, \xx, \xx, lsr #2 + and \dd, \dd, #0x0C0C0C0C + eor \xx, \xx, \dd + eor \xx, \xx, \dd, lsl #2 + eor \dd, \xx, \xx, lsr #1 + and \dd, \dd, #0x22222222 + eor \xx, \xx, \dd + eor \dd, \xx, \dd, lsl #1 +.endm + + @ BIT_MERGE_64 interleaves the bits from x0 and from x1, result + @ is written back to x0:x1. xt is a scratch register. + @ This macro assumes that the ASPR.GE flags have the 0110 pattern. +.macro BIT_MERGE_64 x0, x1, xt + pkhtb \xt, \x1, \x0, asr #16 + pkhbt \x1, \x0, \x1, lsl #16 + BIT_MERGE_32 \x1, \x0 + BIT_MERGE_32 \xt, \x1 +.endm + + .align 2 + .thumb + .thumb_func + .type bit_merge_5, %function +bit_merge_5: + BIT_MERGE_64 r10, r11, r8 +bit_merge_4: + BIT_MERGE_64 r6, r7, r8 +bit_merge_3: + BIT_MERGE_64 r4, r5, r8 +bit_merge_2: + BIT_MERGE_64 r2, r3, r8 +bit_merge_1: + BIT_MERGE_64 r0, r1, r8 + bx lr + .size bit_merge_5, .-bit_merge_5 + +@ ======================================================================= +@ void fndsa_sha3_process_block(uint64_t *A, unsigned r) +@ ======================================================================= + + .align 2 + .global fndsa_sha3_process_block + .thumb + .thumb_func + .type fndsa_sha3_process_block, %function +fndsa_sha3_process_block: + push.w { r4, r5, r6, r7, r8, r10, r11, lr } + vpush.64 { d8, d9, d10, d11, d12, d13, d14, d15 } + + @ Source state is read from the provided buffer. 
The first r + @ words (with "word" being a 64-bit state element) are split + @ into even-indexed and odd-indexed bits. The state is loaded + @ into FP registers and a stack buffer: + @ d0 to d15 receive A[0] to A[15] + @ sp[] receives A[i] at offset 8*(i-16) for i >= 16 + + @ TODO: with the split, most of each round is really two separate + @ sequences, each working on 25 32-bit values; they communicate + @ with each other only through the rotation of the XOR of the + @ lanes (step 2 of theta, section 3.2.1 in FIPS 202) and when + @ further lane rotations use an odd count (rho function, section + @ 3.2.2). We might be able to leverage that to improve locality, + @ i.e. keep more values in integer registers and reduce traffic + @ with storage (FP registers and stack). The state layout in FP + @ registers and the stack would have to change so that accesses + @ to FP can still be done with the double-width vmov most of + @ the time. + + @ Stack: + @ off size + @ 0 72 state words 16 to 24 + @ 72 4 pointer to state array + @ 76 4 number of data words + @ 80 8 temporary for one state word + sub sp, #88 + str r0, [sp, #72] @ Save state pointer + str r1, [sp, #76] @ Save rate (in 64-bit words) + mov.n r14, r1 + + @ Read word A[idx] into the specified register pair. 
+.macro A_LD x0, x1, idx + .if ((\idx) == 0) + vmov \x0, \x1, s0, s1 + .elseif ((\idx) == 1) + vmov \x0, \x1, s2, s3 + .elseif ((\idx) == 2) + vmov \x0, \x1, s4, s5 + .elseif ((\idx) == 3) + vmov \x0, \x1, s6, s7 + .elseif ((\idx) == 4) + vmov \x0, \x1, s8, s9 + .elseif ((\idx) == 5) + vmov \x0, \x1, s10, s11 + .elseif ((\idx) == 6) + vmov \x0, \x1, s12, s13 + .elseif ((\idx) == 7) + vmov \x0, \x1, s14, s15 + .elseif ((\idx) == 8) + vmov \x0, \x1, s16, s17 + .elseif ((\idx) == 9) + vmov \x0, \x1, s18, s19 + .elseif ((\idx) == 10) + vmov \x0, \x1, s20, s21 + .elseif ((\idx) == 11) + vmov \x0, \x1, s22, s23 + .elseif ((\idx) == 12) + vmov \x0, \x1, s24, s25 + .elseif ((\idx) == 13) + vmov \x0, \x1, s26, s27 + .elseif ((\idx) == 14) + vmov \x0, \x1, s28, s29 + .elseif ((\idx) == 15) + vmov \x0, \x1, s30, s31 + .else + ldrd \x0, \x1, [sp, #(8 * ((\idx) - 16))] + .endif +.endm + + @ Like A_LD, except that it uses two ldr opcodes instead of one + @ ldrd for the words which are on the stack. This allows that + @ load to pipeline with a previous load. + @ WARNING: the two destination registers shall be both low + @ (r0 to r7) or both high (r8 to r14), otherwise misalignment + @ may occur. When the two registers are high, the footprint is + @ 8 bytes, while A_LD would use 4 bytes. +.macro A_LDX x0, x1, idx + .if (\idx) <= 15 + A_LD \x0, \x1, \idx + .else + ldr \x0, [sp, #(8 * ((\idx) - 16))] + ldr \x1, [sp, #(8 * ((\idx) - 16) + 4)] + .endif +.endm + + @ Write into word A[idx] from the specified register pair. + @ WARNING: the two destination registers shall be both low + @ (r0 to r7) or both high (r8 to r14), otherwise misalignment + @ may occur. 
+.macro A_ST x0, x1, idx + .if ((\idx) == 0) + vmov s0, s1, \x0, \x1 + .elseif ((\idx) == 1) + vmov s2, s3, \x0, \x1 + .elseif ((\idx) == 2) + vmov s4, s5, \x0, \x1 + .elseif ((\idx) == 3) + vmov s6, s7, \x0, \x1 + .elseif ((\idx) == 4) + vmov s8, s9, \x0, \x1 + .elseif ((\idx) == 5) + vmov s10, s11, \x0, \x1 + .elseif ((\idx) == 6) + vmov s12, s13, \x0, \x1 + .elseif ((\idx) == 7) + vmov s14, s15, \x0, \x1 + .elseif ((\idx) == 8) + vmov s16, s17, \x0, \x1 + .elseif ((\idx) == 9) + vmov s18, s19, \x0, \x1 + .elseif ((\idx) == 10) + vmov s20, s21, \x0, \x1 + .elseif ((\idx) == 11) + vmov s22, s23, \x0, \x1 + .elseif ((\idx) == 12) + vmov s24, s25, \x0, \x1 + .elseif ((\idx) == 13) + vmov s26, s27, \x0, \x1 + .elseif ((\idx) == 14) + vmov s28, s29, \x0, \x1 + .elseif ((\idx) == 15) + vmov s30, s31, \x0, \x1 + .else + @ Two str opcodes will pair and run in 2 cycles (as long as there + @ is no stall from another memory access immediately before or + @ after); strd would be shorter (one instruction) but use 3 cycles. + str \x0, [sp, #(8 * ((\idx) - 16))] + str \x1, [sp, #(8 * ((\idx) - 16) + 4)] + .endif +.endm + + @ Rotate-right registers x0 and x1 by e0 and e1 bits, respectively; + @ rotation counts must be in [0,31]. Rotation is skipped when the + @ rotation count is zero. +.macro ROR_WORD x0, x1, e0, e1 + .if (\e0) != 0 + ror \x0, \x0, #(\e0) + .endif + .if (\e1) != 0 + ror \x1, \x1, #(\e1) + .endif +.endm + + @ XOR right-rotated registers xa into registers xd. Rotation count + @ must be in [0,31]. +.macro XOR_ROR_WORD xd0, xd1, xa0, xa1, e0, e1 + .if (\e0) == 0 + eor \xd0, \xd0, \xa0 + .else + eor \xd0, \xd0, \xa0, ror #(\e0) + .endif + .if (\e1) == 0 + eor \xd1, \xd1, \xa1 + .else + eor \xd1, \xd1, \xa1, ror #(\e1) + .endif +.endm + + @ Prepare the ASPR.GE flags with pattern 0110. + @ All operations in the complete routine preserve these flags. This + @ flag pattern is used in bit_split_5 and bit_merge_5. 
+ movw r7, #0xFF00 + movt r7, #0x00FF + uadd8 r7, r7, r7 + + @ For the initial input, we must split even/odd bits from data word + @ (the non-data words are assumed to be already split), and + @ pre-rotate all words in the way the loop expects. + @ + @ Possible rate values (in 64-bit words): + @ SHAKE128 21 + @ SHA3-224 18 + @ SHAKE256, SHA3-256 17 + @ SHA3-384 13 + @ SHA3-512 9 + @ + @ We fast-path the case r = 17, which corresponds to SHAKE256 and + @ SHA3-256. + + @ A[0] to A[5] + mov.w r12, r0 @ move state pointer to r12 + ldm r12!, { r0, r1, r2, r3, r4, r5, r6, r7, r10, r11 } + bl bit_split_5 + ROR_WORD r0, r1, 0, 0 + ROR_WORD r2, r3, 22, 22 + ROR_WORD r4, r5, 22, 21 + ROR_WORD r6, r7, 11, 10 + ROR_WORD r10, r11, 7, 7 + A_ST r0, r1, 0 + A_ST r2, r3, 1 + A_ST r4, r5, 2 + A_ST r6, r7, 3 + A_ST r10, r11, 4 + @ A[5] to A[10] + ldm r12!, { r0, r1, r2, r3, r4, r5, r6, r7, r10, r11 } + @ If r = 9 then we split only four words; otherwise, r >= 13 + @ and we split 5 words. + cmp r14, #9 + bhi.w fndsa_sha3_process_block__L1 + bl bit_split_4 + b.w fndsa_sha3_process_block__L2 +fndsa_sha3_process_block__L1: + bl bit_split_5 +fndsa_sha3_process_block__L2: + ROR_WORD r0, r1, 14, 14 + ROR_WORD r2, r3, 10, 10 + ROR_WORD r4, r5, 2, 1 + ROR_WORD r6, r7, 23, 22 + ROR_WORD r10, r11, 31, 30 + A_ST r0, r1, 5 + A_ST r2, r3, 6 + A_ST r4, r5, 7 + A_ST r6, r7, 8 + A_ST r10, r11, 9 + @ A[10] to A[14] + ldm r12!, { r0, r1, r2, r3, r4, r5, r6, r7, r10, r11 } + @ If r = 9 then we do not split any word. + @ If r = 13 then we split three words. + @ Otherwise, r >= 17 and we split 5 words. 
+ cmp r14, #13 + bhi.w fndsa_sha3_process_block__L3 + cmp r14, #10 + bls.w fndsa_sha3_process_block__L4 + bl bit_split_3 + b.w fndsa_sha3_process_block__L4 +fndsa_sha3_process_block__L3: + bl bit_split_5 +fndsa_sha3_process_block__L4: + ROR_WORD r0, r1, 1, 0 + ROR_WORD r2, r3, 3, 3 + ROR_WORD r4, r5, 13, 12 + ROR_WORD r6, r7, 4, 4 + ROR_WORD r10, r11, 9, 9 + A_ST r0, r1, 10 + A_ST r2, r3, 11 + A_ST r4, r5, 12 + A_ST r6, r7, 13 + A_ST r10, r11, 14 + @ A[15] to A[19] + ldm r12!, { r0, r1, r2, r3, r4, r5, r6, r7, r10, r11 } + @ If r = 9 or 13, then we do not split any word. + @ If r = 17 then we split two words. + @ If r = 18 then we split three words. + @ Otherwise, r = 21 and we split 5 words. + cmp r14, #17 + beq.w fndsa_sha3_process_block__L6 + cmp r14, #18 + beq.w fndsa_sha3_process_block__L5 + cmp r14, #15 + bls.w fndsa_sha3_process_block__L7 + bl bit_split_5 + b.w fndsa_sha3_process_block__L7 +fndsa_sha3_process_block__L5: + bl bit_split_3 + b.w fndsa_sha3_process_block__L7 +fndsa_sha3_process_block__L6: + bl bit_split_2 +fndsa_sha3_process_block__L7: + ROR_WORD r0, r1, 14, 13 + ROR_WORD r2, r3, 18, 18 + ROR_WORD r4, r5, 5, 5 + ROR_WORD r6, r7, 8, 7 + ROR_WORD r10, r11, 28, 28 + A_ST r0, r1, 15 + A_ST r2, r3, 16 + A_ST r4, r5, 17 + A_ST r6, r7, 18 + A_ST r10, r11, 19 + @ A[20] to A[24] + ldm r12!, { r0, r1, r2, r3, r4, r5, r6, r7, r10, r11 } + @ If r = 9, 13, 17 or 18, then no split happens here. + @ Otherwise, r = 21 and we split word A[20]. + cmp r14, #20 + bls.w fndsa_sha3_process_block__L8 + bl bit_split_1 +fndsa_sha3_process_block__L8: + @ We have the split but not rotated words in the registers; we + @ want to keep them that way, hence we have to copy through r8:r12 + @ for the pre-rotation. 
+ mov r8, r0 + mov r12, r1 + ROR_WORD r8, r12, 31, 31 + A_ST r8, r12, 20 + mov r8, r2 + mov r12, r3 + ROR_WORD r8, r12, 28, 27 + A_ST r8, r12, 21 + mov r8, r4 + mov r12, r5 + ROR_WORD r8, r12, 20, 19 + A_ST r8, r12, 22 + mov r8, r6 + mov r12, r7 + ROR_WORD r8, r12, 21, 20 + A_ST r8, r12, 23 + mov r8, r10 + mov r12, r11 + ROR_WORD r8, r12, 1, 1 + A_ST r8, r12, 24 + + @ Here begins the preamble for the first iteration (XORing the + @ words into t0..t4). Afterwards, that operation is done at the + @ end of each iteration (in preparation for the next one) so + @ this sequence is done only once. + + @ xor(A[5*i+0]) -> r0:r1 + @ xor(A[5*i+1]) -> r2:r3 + @ xor(A[5*i+2]) -> r4:r5 + @ xor(A[5*i+3]) -> r6:r7 + @ xor(A[5*i+4]) -> r10:r11 + + @ Previous code left A[20..24] into the registers, we do not have + @ to read them again. + @add r12, sp, #32 + @ldm r12, { r0, r1, r2, r3, r4, r5, r6, r7, r10, r11 } + A_LD r8, r12, 0 + XOR_ROR_WORD r0, r1, r8, r12, 0, 0 + A_LD r8, r12, 5 + XOR_ROR_WORD r0, r1, r8, r12, 18, 18 + A_LD r8, r12, 10 + XOR_ROR_WORD r0, r1, r8, r12, 31, 0 + A_LD r8, r12, 15 + XOR_ROR_WORD r0, r1, r8, r12, 18, 19 + + A_LD r8, r12, 1 + XOR_ROR_WORD r2, r3, r8, r12, 10, 10 + A_LD r8, r12, 6 + XOR_ROR_WORD r2, r3, r8, r12, 22, 22 + A_LD r8, r12, 11 + XOR_ROR_WORD r2, r3, r8, r12, 29, 29 + A_LD r8, r12, 16 + XOR_ROR_WORD r2, r3, r8, r12, 14, 14 + + A_LD r8, r12, 2 + XOR_ROR_WORD r4, r5, r8, r12, 10, 11 + A_LD r8, r12, 7 + XOR_ROR_WORD r4, r5, r8, r12, 30, 31 + A_LD r8, r12, 12 + XOR_ROR_WORD r4, r5, r8, r12, 19, 20 + A_LD r8, r12, 17 + XOR_ROR_WORD r4, r5, r8, r12, 27, 27 + + A_LD r8, r12, 3 + XOR_ROR_WORD r6, r7, r8, r12, 21, 22 + A_LD r8, r12, 8 + XOR_ROR_WORD r6, r7, r8, r12, 9, 10 + A_LD r8, r12, 13 + XOR_ROR_WORD r6, r7, r8, r12, 28, 28 + A_LD r8, r12, 18 + XOR_ROR_WORD r6, r7, r8, r12, 24, 25 + + A_LD r8, r12, 4 + XOR_ROR_WORD r10, r11, r8, r12, 25, 25 + A_LD r8, r12, 9 + XOR_ROR_WORD r10, r11, r8, r12, 1, 2 + A_LD r8, r12, 14 + XOR_ROR_WORD r10, r11, r8, 
r12, 23, 23 + A_LD r8, r12, 19 + XOR_ROR_WORD r10, r11, r8, r12, 4, 4 + + @ We will perform 24 rounds. Each loop iteration performs one round. + @ We keep minus eight times the current round counter in r14 (i.e. a + @ multiple of 8, from -192 to -8). + mvn r14, #0xBF + +fndsa_sha3_process_block__loop_step2: + @ The A[] words have delayed rotations from the previous round: + @ A[ 0] er0: 0 er1: 0 + @ A[ 1] er0: 10 er1: 10 + @ A[ 2] er0: 10 er1: 11 + @ A[ 3] er0: 21 er1: 22 + @ A[ 4] er0: 25 er1: 25 + @ A[ 5] er0: 18 er1: 18 + @ A[ 6] er0: 22 er1: 22 + @ A[ 7] er0: 30 er1: 31 + @ A[ 8] er0: 9 er1: 10 + @ A[ 9] er0: 1 er1: 2 + @ A[10] er0: 31 er1: 0 + @ A[11] er0: 29 er1: 29 + @ A[12] er0: 19 er1: 20 + @ A[13] er0: 28 er1: 28 + @ A[14] er0: 23 er1: 23 + @ A[15] er0: 18 er1: 19 + @ A[16] er0: 14 er1: 14 + @ A[17] er0: 27 er1: 27 + @ A[18] er0: 24 er1: 25 + @ A[19] er0: 4 er1: 4 + @ A[20] er0: 1 er1: 1 + @ A[21] er0: 4 er1: 5 + @ A[22] er0: 12 er1: 13 + @ A[23] er0: 11 er1: 12 + @ A[24] er0: 31 er1: 31 + + @ t0 = xor(A[5*i+4]) ^ rotl1(xor(A[5*i+1])) -> r8:r12 + @ t1 = xor(A[5*i+0]) ^ rotl1(xor(A[5*i+2])) -> r0:r1 + @ t2 = xor(A[5*i+1]) ^ rotl1(xor(A[5*i+3])) -> r2:r3 + @ t3 = xor(A[5*i+2]) ^ rotl1(xor(A[5*i+4])) -> r4:r5 + @ t4 = xor(A[5*i+3]) ^ rotl1(xor(A[5*i+0])) -> r6:r7 + + eor r12, r11, r2 + eor r8, r10, r3, ror #31 + eor r3, r3, r6 + eor r2, r2, r7, ror #31 + eor r7, r7, r0 + eor r6, r6, r1, ror #31 + eor r1, r1, r4 + eor r0, r0, r5, ror #31 + eor r5, r5, r10 + eor r4, r4, r11, ror #31 + + @ XOR each t_i into A[5*j+i] (for j = 0 to 4). + @ t0:t1 value t_i (register pair) + @ idx index of A[] word + @ e0, e1 delayed rotations for the A[] word + @ swap non-zero for a register swap + @ The delayed rotations (from the previous round) are absorbed here. + @ New delayed rotations are created here; only the register swap + @ is performed (if the new rotation count, over 64 bits, is odd). 
+ @ Clobbers: r10, r11 + +.macro XOR_T t0, t1, idx, e0, e1, swap + A_LD r10, r11, \idx + .if (\e0) == 0 + eor r10, \t0, r10 + .else + eor r10, \t0, r10, ror #(\e0) + .endif + .if (\e1) == 0 + eor r11, \t1, r11 + .else + eor r11, \t1, r11, ror #(\e1) + .endif + .if (\swap) != 0 + A_ST r11, r10, \idx + .else + A_ST r10, r11, \idx + .endif +.endm + + @ We process all words except 0, 6, 12, 18 and 24, which come last + @ with a special sequence. + @ We also interleave "high" (16+) and "low" (0 to 15) words, so as + @ to avoid str + ldrd sequences which create memory stalls. + + XOR_T r0, r1, 1, 10, 10, 1 + XOR_T r0, r1, 16, 14, 14, 1 + XOR_T r2, r3, 2, 10, 11, 0 + XOR_T r2, r3, 17, 27, 27, 1 + XOR_T r4, r5, 3, 21, 22, 0 + XOR_T r6, r7, 19, 4, 4, 0 + XOR_T r6, r7, 4, 25, 25, 1 + XOR_T r8, r12, 20, 1, 1, 0 + XOR_T r8, r12, 5, 18, 18, 0 + XOR_T r0, r1, 21, 4, 5, 0 + XOR_T r2, r3, 7, 30, 31, 0 + XOR_T r2, r3, 22, 12, 13, 1 + XOR_T r4, r5, 8, 9, 10, 1 + XOR_T r4, r5, 23, 11, 12, 0 + XOR_T r6, r7, 9, 1, 2, 0 + XOR_T r8, r12, 10, 31, 0, 1 + XOR_T r0, r1, 11, 29, 29, 0 + XOR_T r4, r5, 13, 28, 28, 1 + XOR_T r6, r7, 14, 23, 23, 1 + XOR_T r8, r12, 15, 18, 19, 1 + + @ For words 0, 6, 12, 18 and 24, we omit writing back to storage + @ because we'll need them right away in the first KHI_STEP. 
+ A_LD r10, r11, 24 + eor r10, r6, r10, ror #31 + eor r11, r7, r11, ror #31 + A_LD r7, r6, 18 + eor r7, r4, r7, ror #24 + eor r6, r5, r6, ror #25 + A_LD r5, r4, 12 + eor r5, r2, r5, ror #19 + eor r4, r3, r4, ror #20 + A_LD r2, r3, 6 + eor r2, r0, r2, ror #22 + eor r3, r1, r3, ror #22 + A_LD r0, r1, 0 + eor r0, r8, r0 + eor r1, r12, r1 + + @ Delayed right-rotations on low (even) and high (odd) words: + @ A[ 0] dr0: 0 dr1: 0 + @ A[ 1] dr0: 31 dr1: 0 + @ A[ 2] dr0: 1 dr1: 1 + @ A[ 3] dr0: 18 dr1: 18 + @ A[ 4] dr0: 18 dr1: 19 + @ A[ 5] dr0: 14 dr1: 14 + @ A[ 6] dr0: 10 dr1: 10 + @ A[ 7] dr0: 29 dr1: 29 + @ A[ 8] dr0: 4 dr1: 5 + @ A[ 9] dr0: 22 dr1: 22 + @ A[10] dr0: 30 dr1: 31 + @ A[11] dr0: 27 dr1: 27 + @ A[12] dr0: 10 dr1: 11 + @ A[13] dr0: 19 dr1: 20 + @ A[14] dr0: 12 dr1: 13 + @ A[15] dr0: 11 dr1: 12 + @ A[16] dr0: 9 dr1: 10 + @ A[17] dr0: 24 dr1: 25 + @ A[18] dr0: 21 dr1: 22 + @ A[19] dr0: 28 dr1: 28 + @ A[20] dr0: 23 dr1: 23 + @ A[21] dr0: 31 dr1: 31 + @ A[22] dr0: 1 dr1: 2 + @ A[23] dr0: 4 dr1: 4 + @ A[24] dr0: 25 dr1: 25 + +@ Apply operation 'op' (Boolean bitwise opcode) on values xa0:xa1 +@ and xb0:xb1 (register pairs), then XOR with xd0:xd1 and write +@ result at index j. Each register comes with a "delayed rotation" count +@ which is applied here. 
+.macro KHI_OP xa0, da0, xa1, da1, xb0, db0, xb1, db1, xd0, dd0, xd1, dd1, j
+ @ 'op' on xa and xb
+ .if (\da0) == (\db0)
+ bic r8, \xb0, \xa0
+ .else
+ bic r8, \xb0, \xa0, ror #((32 + (\da0) - (\db0)) & 31)
+ .endif
+ .if (\da1) == (\db1)
+ bic r12, \xb1, \xa1
+ .else
+ bic r12, \xb1, \xa1, ror #((32 + (\da1) - (\db1)) & 31)
+ .endif
+ @ XOR with xd, result back in r8:r12 (xd itself is unmodified)
+ @ r8 and r12 have delayed rotations by da0 and da1, respectively
+ .if (\db0) == (\dd0)
+ eor r8, \xd0, r8
+ .else
+ eor r8, \xd0, r8, ror #((32 + (\db0) - (\dd0)) & 31)
+ .endif
+ .if (\db1) == (\dd1)
+ eor r12, \xd1, r12
+ .else
+ eor r12, \xd1, r12, ror #((32 + (\db1) - (\dd1)) & 31)
+ .endif
+ @ Store back XOR result
+ A_ST r8, r12, \j
+.endm
+
+@ Apply Khi on five words. Word indexes are i0 to i4. Each word comes with
+@ its two "delayed rotation" counts (e.g. e00 and e01 for word i0: the
+@ delayed rotations of its low/even and high/odd halves, respectively),
+@ which are taken into account when the words are combined.
+.macro KHI_STEP i0, e00, e01, i1, e10, e11, i2, e20, e21, i3, e30, e31, i4, e40, e41
+ @ Load all five state words.
+ A_LDX r0, r1, \i0
+ A_LDX r2, r3, \i1
+ A_LDX r4, r5, \i2
+ A_LDX r6, r7, \i3
+ A_LDX r10, r11, \i4
+ @ Apply operations.
+ KHI_OP r2, \e10, r3, \e11, r4, \e20, r5, \e21, r0, \e00, r1, \e01, \i0
+ KHI_OP r4, \e20, r5, \e21, r6, \e30, r7, \e31, r2, \e10, r3, \e11, \i1
+ KHI_OP r6, \e30, r7, \e31, r10, \e40, r11, \e41, r4, \e20, r5, \e21, \i2
+ KHI_OP r10, \e40, r11, \e41, r0, \e00, r1, \e01, r6, \e30, r7, \e31, \i3
+ KHI_OP r0, \e00, r1, \e01, r2, \e10, r3, \e11, r10, \e40, r11, \e41, \i4
+.endm
+
+@ Special case for first KHI_STEP:
+@ Words are received already loaded in registers (permuted).
+.macro KHI_STEP_1 i0, e00, e01, i1, e10, e11, i2, e20, e21, i3, e30, e31, i4, e40, e41, op0, op1, op2, op3, op4 + KHI_OP r2, \e10, r3, \e11, r4, \e20, r5, \e21, r0, \e00, r1, \e01, \i0 + KHI_OP r4, \e20, r5, \e21, r6, \e30, r7, \e31, r2, \e10, r3, \e11, \i1 + KHI_OP r6, \e30, r7, \e31, r10, \e40, r11, \e41, r4, \e20, r5, \e21, \i2 + KHI_OP r10, \e40, r11, \e41, r0, \e00, r1, \e01, r6, \e30, r7, \e31, \i3 + KHI_OP r0, \e00, r1, \e01, r2, \e10, r3, \e11, r10, \e40, r11, \e41, \i4 +.endm + + @ 0, 6, 12, 18, 24 + KHI_STEP_1 0, 0, 0, 6, 10, 10, 12, 10, 11, 18, 21, 22, 24, 25, 25 + + @ 3, 9, 10, 16, 22 + KHI_STEP 3, 18, 18, 9, 22, 22, 10, 30, 31, 16, 9, 10, 22, 1, 2 + + @ 1, 7, 13, 19, 20 + KHI_STEP 1, 31, 0, 7, 29, 29, 13, 19, 20, 19, 28, 28, 20, 23, 23 + + @ 4, 5, 11, 17, 23 + KHI_STEP 4, 18, 19, 5, 14, 14, 11, 27, 27, 17, 24, 25, 23, 4, 4 + + @ 2, 8, 14, 15, 21 + KHI_STEP 2, 1, 1, 8, 4, 5, 14, 12, 13, 15, 11, 12, 21, 31, 31 + + @ XOR next round constant into A[0] + adr.w r5, process_block_RC__end + add.w r5, r14 + ldrd r2, r3, [r5] + A_LD r0, r1, 0 + eors r0, r2 + eors r1, r3 + @ Increment counter for next iteration. Since the counter starts at + @ -192, it reaches 0 when 24 rounds have been completed. + adds r14, #8 + beq.w fndsa_sha3_process_block__final + + @ We store back the modified A[0] only if looping (exit sequence + @ uses the r0:r1 registers directly). + A_ST r0, r1, 0 + + @ Permute the state words for next round. + @ 6 -> 1 + @ 1 -> 10 + @ 10 -> 7 + @ 7 -> 11 + @ 11 -> 17 + @ 17 -> 18 + @ 18 -> 3 + @ 3 -> 5 + @ 5 -> 16 + @ 16 -> 8 + @ 8 -> 21 + @ 21 -> 24 + @ 24 -> 4 + @ 4 -> 15 + @ 15 -> 23 + @ 23 -> 19 + @ 19 -> 13 + @ 13 -> 12 + @ 12 -> 2 + @ 2 -> 20 + @ 20 -> 14 + @ 14 -> 22 + @ 22 -> 9 + @ 9 -> 6 + @ Word 0 is not permuted. + @ We compute the XOR of the permuted words, as would normally + @ be done at the start of the next iteration. For that computation, + @ we need to take the delayed rotations into account. 
+
+ @ Load word i, store in j, and also rotate the in-register copy
+ @ to absorb the specified delayed rotations.
+.macro A_LD_ST i, j, e0, e1
+ .if (\j) % 5 == 0
+ A_LD r0, r1, \i
+ A_ST r0, r1, \j
+ ROR_WORD r0, r1, \e0, \e1
+ .elseif (\j) % 5 == 1
+ A_LD r2, r3, \i
+ A_ST r2, r3, \j
+ ROR_WORD r2, r3, \e0, \e1
+ .elseif (\j) % 5 == 2
+ A_LD r4, r5, \i
+ A_ST r4, r5, \j
+ ROR_WORD r4, r5, \e0, \e1
+ .elseif (\j) % 5 == 3
+ A_LD r6, r7, \i
+ A_ST r6, r7, \j
+ ROR_WORD r6, r7, \e0, \e1
+ .else
+ A_LD r10, r11, \i
+ A_ST r10, r11, \j
+ ROR_WORD r10, r11, \e0, \e1
+ .endif
+.endm
+
+ @ Load word i, store in j, and also XOR that word into the
+ @ appropriate registers (based on j mod 5), applying the specified
+ @ delayed rotations.
+.macro A_LD_XOR_ST i, j, e0, e1
+ A_LD r8, r12, \i
+ .if (\j) % 5 == 0
+ XOR_ROR_WORD r0, r1, r8, r12, \e0, \e1
+ .elseif (\j) % 5 == 1
+ XOR_ROR_WORD r2, r3, r8, r12, \e0, \e1
+ .elseif (\j) % 5 == 2
+ XOR_ROR_WORD r4, r5, r8, r12, \e0, \e1
+ .elseif (\j) % 5 == 3
+ XOR_ROR_WORD r6, r7, r8, r12, \e0, \e1
+ .else
+ XOR_ROR_WORD r10, r11, r8, r12, \e0, \e1
+ .endif
+ A_ST r8, r12, \j
+.endm
+
+ @ r0:r1 still contains A[0], for whom the delayed rotations are zero.
+ A_LD r2, r3, 6
+ str r2, [sp, #80]
+ str r3, [sp, #84]
+ ror r2, r2, #10
+ ror r3, r3, #10
+ A_LD_XOR_ST 9, 6, 22, 22
+ A_LD_ST 22, 9, 1, 2
+ A_LD_ST 14, 22, 12, 13
+ A_LD_XOR_ST 20, 14, 23, 23
+ A_LD_XOR_ST 2, 20, 1, 1
+ A_LD_XOR_ST 12, 2, 10, 11
+ A_LD_XOR_ST 13, 12, 19, 20
+ A_LD_ST 19, 13, 28, 28
+ A_LD_XOR_ST 23, 19, 4, 4
+ A_LD_XOR_ST 15, 23, 11, 12
+ A_LD_XOR_ST 4, 15, 18, 19
+ A_LD_XOR_ST 24, 4, 25, 25
+ A_LD_XOR_ST 21, 24, 31, 31
+ A_LD_XOR_ST 8, 21, 4, 5
+ A_LD_XOR_ST 16, 8, 9, 10
+ A_LD_XOR_ST 5, 16, 14, 14
+ A_LD_XOR_ST 3, 5, 18, 18
+ A_LD_XOR_ST 18, 3, 21, 22
+ A_LD_XOR_ST 17, 18, 24, 25
+ A_LD_XOR_ST 11, 17, 27, 27
+ A_LD_XOR_ST 7, 11, 29, 29
+ A_LD_XOR_ST 10, 7, 30, 31
+ A_LD_XOR_ST 1, 10, 31, 0
+ ldrd r8, r12, [sp, #80]
+ A_ST r8, r12, 1
+
+ b.w fndsa_sha3_process_block__loop_step2
+
+fndsa_sha3_process_block__final:
+ @ Recombine even-indexed and odd-indexed bits.
+ @ Everything is written back into the original state array.
+ @ Words are still in permuted state, and have delayed rotations
+ @ that should be applied here.
+ ldr r12, [sp, #72] @ Pointer to state array
+ ldr r14, [sp, #76] @ Rate (9, 13, 17, 18 or 21)
+
+ @ Load word i into the given registers, and apply the specified
+ @ rotations.
+.macro A_LDROR x0, x1, i, e0, e1 + A_LD \x0, \x1, \i + ROR_WORD \x0, \x1, \e0, \e1 +.endm + + @ r0:r1 was set in the last iteration and has no delayed rotation + @A_LD r0, r1, 0 + A_LDROR r2, r3, 6, 10, 10 + A_LDROR r4, r5, 12, 10, 11 + A_LDROR r6, r7, 18, 21, 22 + A_LDROR r10, r11, 24, 25, 25 + bl bit_merge_5 + stm r12!, { r0, r1, r2, r3, r4, r5, r6, r7, r10, r11 } + A_LDROR r0, r1, 3, 18, 18 + A_LDROR r2, r3, 9, 22, 22 + A_LDROR r4, r5, 10, 30, 31 + A_LDROR r6, r7, 16, 9, 10 + A_LDROR r10, r11, 22, 1, 2 + cmp r14, #9 + bhi.w fndsa_sha3_process_block__L10 + bl bit_merge_4 + b.w fndsa_sha3_process_block__L11 +fndsa_sha3_process_block__L10: + bl bit_merge_5 +fndsa_sha3_process_block__L11: + stm r12!, { r0, r1, r2, r3, r4, r5, r6, r7, r10, r11 } + A_LDROR r0, r1, 1, 31, 0 + A_LDROR r2, r3, 7, 29, 29 + A_LDROR r4, r5, 13, 19, 20 + A_LDROR r6, r7, 19, 28, 28 + A_LDROR r10, r11, 20, 23, 23 + cmp r14, #13 + bhi.w fndsa_sha3_process_block__L12 + cmp r14, #10 + bls.w fndsa_sha3_process_block__L13 + bl bit_merge_3 + b.w fndsa_sha3_process_block__L13 +fndsa_sha3_process_block__L12: + bl bit_merge_5 +fndsa_sha3_process_block__L13: + stm r12!, { r0, r1, r2, r3, r4, r5, r6, r7, r10, r11 } + A_LDROR r0, r1, 4, 18, 19 + A_LDROR r2, r3, 5, 14, 14 + A_LDROR r4, r5, 11, 27, 27 + A_LDROR r6, r7, 17, 24, 25 + A_LDROR r10, r11, 23, 4, 4 + cmp r14, #17 + beq.w fndsa_sha3_process_block__L15 + cmp r14, #18 + beq.w fndsa_sha3_process_block__L14 + cmp r14, #15 + bls.w fndsa_sha3_process_block__L16 + bl bit_merge_5 + b.w fndsa_sha3_process_block__L16 +fndsa_sha3_process_block__L14: + bl bit_merge_3 + b.w fndsa_sha3_process_block__L16 +fndsa_sha3_process_block__L15: + bl bit_merge_2 +fndsa_sha3_process_block__L16: + stm r12!, { r0, r1, r2, r3, r4, r5, r6, r7, r10, r11 } + A_LDROR r0, r1, 2, 1, 1 + A_LDROR r2, r3, 8, 4, 5 + A_LDROR r4, r5, 14, 12, 13 + A_LDROR r6, r7, 15, 11, 12 + A_LDROR r10, r11, 21, 31, 31 + cmp r14, #20 + bls.w fndsa_sha3_process_block__L17 + bl bit_merge_1 
+fndsa_sha3_process_block__L17: + stm r12!, { r0, r1, r2, r3, r4, r5, r6, r7, r10, r11 } + + add sp, #88 + vpop.64 { d8, d9, d10, d11, d12, d13, d14, d15 } + pop { r4, r5, r6, r7, r8, r10, r11, pc } + + .align 2 +process_block_RC: + .word 0x00000001, 0x00000000 + .word 0x00000000, 0x00000089 + .word 0x00000000, 0x8000008B + .word 0x00000000, 0x80008080 + .word 0x00000001, 0x0000008B + .word 0x00000001, 0x00008000 + .word 0x00000001, 0x80008088 + .word 0x00000001, 0x80000082 + .word 0x00000000, 0x0000000B + .word 0x00000000, 0x0000000A + .word 0x00000001, 0x00008082 + .word 0x00000000, 0x00008003 + .word 0x00000001, 0x0000808B + .word 0x00000001, 0x8000000B + .word 0x00000001, 0x8000008A + .word 0x00000001, 0x80000081 + .word 0x00000000, 0x80000081 + .word 0x00000000, 0x80000008 + .word 0x00000000, 0x00000083 + .word 0x00000000, 0x80008003 + .word 0x00000001, 0x80008088 + .word 0x00000000, 0x80000088 + .word 0x00000001, 0x00008000 + .word 0x00000000, 0x80008082 +process_block_RC__end: + + .size fndsa_sha3_process_block,.-fndsa_sha3_process_block diff --git a/crypto_sign/fndsa_provisional-512/m4f/sign.c b/crypto_sign/fndsa_provisional-512/m4f/sign.c new file mode 120000 index 00000000..c0dd0533 --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/sign.c @@ -0,0 +1 @@ +../../../mupq/crypto_sign/fndsa_provisional-512/ref/sign.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-512/m4f/sign_core.c b/crypto_sign/fndsa_provisional-512/m4f/sign_core.c new file mode 120000 index 00000000..879d614a --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/sign_core.c @@ -0,0 +1 @@ +../../../mupq/crypto_sign/fndsa_provisional-512/ref/sign_core.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-512/m4f/sign_fpoly.c b/crypto_sign/fndsa_provisional-512/m4f/sign_fpoly.c new file mode 120000 index 00000000..b5d1b080 --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/sign_fpoly.c @@ -0,0 +1 @@ 
+../../../mupq/crypto_sign/fndsa_provisional-512/ref/sign_fpoly.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-512/m4f/sign_fpr.c b/crypto_sign/fndsa_provisional-512/m4f/sign_fpr.c new file mode 120000 index 00000000..26216c14 --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/sign_fpr.c @@ -0,0 +1 @@ +../../../mupq/crypto_sign/fndsa_provisional-512/ref/sign_fpr.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-512/m4f/sign_fpr_cm4.s b/crypto_sign/fndsa_provisional-512/m4f/sign_fpr_cm4.s new file mode 100644 index 00000000..d8e85a83 --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/sign_fpr_cm4.s @@ -0,0 +1,1020 @@ + .syntax unified + .cpu cortex-m4 + .file "sign_fpr_cm4.s" + .text + +@ ======================================================================= +@ fpr fndsa_fpr_scaled(int64_t i, int sc) +@ ======================================================================= + + .align 2 + .global fndsa_fpr_scaled + .thumb + .thumb_func + .type fndsa_fpr_scaled, %function +fndsa_fpr_scaled: + @push { r4, r5 } + vmov s0, s1, r4, r5 + + @ Get absolute value into r0:r5. + eor r0, r0, r1, asr #31 + eor r5, r1, r1, asr #31 + subs r0, r0, r1, asr #31 + sbc r5, r5, r1, asr #31 + + @ Count leading zeros of r0:r5 (into r3). + @ r12 = 1 if r3 >= 32, 0 otherwise. + clz r3, r5 + clz r4, r0 + lsrs r12, r3, #5 + umlal r3, r4, r4, r12 + + @ Normalize absolute value to [2^63,2^64-1]: we shift-left the value + @ by r3 bits. We also adjust the scaling (sc, in r2) accordingly. + subs r2, r2, r3 + + @ At this point, r12 = 1 if r3 >= 32, 0 otherwise. + rsbs r12, #0 @ r1 <- -1 if r12 = 1, 0 otherwise + umlal r0, r5, r0, r12 @ if r5 = 0 then r0:r5 <- 0:r0 + and r12, r3, #31 + movs r4, #1 + lsls r4, r12 + umull r0, r12, r0, r4 + umlal r12, r4, r5, r4 + + @ Normalized absolute value is now in r0:r12. + @ If the source integer was zero, then r0:r12 = 0 at this point. 
+ @ Since the pre-normalized absolute value was at most 2^63-1, the + @ lowest bit of r0 is necessarily zero. + + @ Adjust exponent. The mantissa will spill an extra 1 into the + @ exponent. + addw r2, r2, #1085 + + @ Shrink mantissa to [2^52,2^53-1] with rounding. + @ See fpr_add() for details. Since we can only guarantee that the + @ lowest bit is 0, the method involves adding 0x7FE00000, which + @ cannot fit in a representable constant for add; we have to + @ use movw and a shift. + lsls r2, r2, #20 @ exponent + lsls r4, r0, #21 + lsrs r0, r0, #11 + bfi r4, r0, #21, #1 + movw r5, #0x7FE0 + adds r4, r4, r5, lsl #16 + adcs r0, r0, r12, lsl #21 + adcs r12, r2, r12, lsr #11 + + @ If the source value was zero then the mantissa is still zero, + @ but the exponent field is wrong and must be adjusted. We still + @ have the count of leading zeros in r3; source was 0 if and only + @ if r3 = 64. + sbfx r3, r3, #6, #1 + bics r12, r3 + + @ Insert back the sign. + bfi r1, r12, #0, #31 + + @pop { r4, r5 } + vmov r4, r5, s0, s1 + bx lr + .size fndsa_fpr_scaled,.-fndsa_fpr_scaled + +@ ======================================================================= +@ fpr fndsa_fpr_add(fpr x, fpr y) +@ ======================================================================= + + .align 2 + .global fndsa_fpr_add + .thumb + .thumb_func + .type fndsa_fpr_add, %function +fndsa_fpr_add: + @push { r4, r5, r6, r7, r8 } + vmov s0, s1, r4, r5 + vmov s2, s3, r6, r7 + vmov s4, r8 + + @ Operands are in r0:r1 and r2:r3. We want to conditionally swap + @ them, so that x (r0:r1) has the greater absolute value of the two; + @ if both have the same absolute value and different signs, then + @ x should be positive. This ensures that the exponent of y is not + @ greater than that of x, and the result has the sign of x. + @ + @ To ignore the sign bit in the comparison, we left-shift the high + @ word of both operands by 1 bit (this does not change the order of + @ the absolute values). 
To cover the case of two equal absolute + @ values, we inject the sign of x as an initial borrow (thus, if + @ the absolute values are equal but x is negative, then the + @ comparison will decide that x is "lower" and do the swap). We + @ leverage the fact that r1 cannot be 0xFFFFFFFF (it would mean that + @ x is a NaN), and therefore subtracting the word-extended sign bit + @ will produce the expected borrow. + lsls r7, r1, #1 @ Left-shift high word of x + subs r6, r1, r1, asr #31 @ Initial borrow if x is negative + sbcs r6, r0, r2 @ Sub: low words + sbcs r6, r7, r3, lsl #1 @ Sub: high words (with shift of y) + sbcs r4, r4 @ r4 is set to 0xFFFFFFFF for a swap + uadd8 r4, r4, r4 + sel r6, r2, r0 + sel r7, r3, r1 + sel r2, r0, r2 + sel r3, r1, r3 + + @ Now x is in r6:r7, and y is in r2:r3. + + @ Extract mantissa of x into r6:r7, exponent in r4, sign in r5. + @ For the mantissa, we must set bit 52 to 1, except if the (encoded) + @ exponent is zero; in the latter case, the whole value must be zero + @ or minus zero (we do not support subnormals). + asrs r5, r7, #31 @ Sign bit (extended to whole word) + ubfx r4, r7, #20, #11 @ Exponent in r4 (without sign) + addw r8, r4, #2047 @ r8 >= 2048 if and only if r4 != 0 + lsrs r8, r8, #11 @ r8 = 1 except if r4 = 0 + bfi r7, r8, #20, #12 @ Set high mantissa bits + + @ Extract mantissa of y into r2:r3, exponent in r0. + @ r1 receives the xor of the signs of x and y (extended). + eor r1, r5, r3, asr #31 + ubfx r0, r3, #20, #11 @ Exponent in r0 (without sign) + addw r8, r0, #2047 @ r8 >= 2048 if and only if r0 != 0 + lsrs r8, r8, #11 @ r8 = 1 except if r0 = 0 + bfi r3, r8, #20, #12 @ Set high mantissa bits + + @ Scale mantissas up by three bits (i.e. multiply both by 8). 
+ mov r8, #7 + lsls r7, #3 + umlal r6, r7, r6, r8 + lsls r3, #3 + umlal r2, r3, r2, r8 + + @ x: exponent=r4, sign=r5, mantissa=r6:r7 (scaled up 3 bits) + @ y: exponent=r0, sign-xor=r1, mantissa=r2:r3 (scaled up 3 bits) + + @ At that point, the exponent of x (in r4) is larger than that + @ of y (in r0). The difference is the amount of shifting that + @ should be done on y. If that amount is larger than 59 then + @ we clamp y to 0. We won't need y's exponent beyond that point, + @ so we store that shift count in r0. + subs r0, r4, r0 + subs r8, r0, #60 + ands r2, r2, r8, asr #31 + ands r3, r3, r8, asr #31 + + @ Shift right r2:r3 by r0 bits (with result in r3:r0). The + @ shift count is in the 0..59 range. r12 will be non-zero if and + @ only if some non-zero bits were dropped. + + @ If r0 >= 32, then right-shift by 32 bits; r12 is set to the + @ dropped bits (or 0 if r0 < 32). + sbfx r8, r0, #5, #1 + and r12, r2, r8 + bic r2, r2, r8 + umlal r3, r2, r3, r8 + @ Right-shift by r0 mod 32 bits; dropped bits (from r3) are + @ accumulated into r12 (with OR). + and r0, r0, #31 + mov r8, #0xFFFFFFFF + lsr r8, r0 @ r8 <- 2^(32-sc) - 1 + eors r0, r0 + umlal r3, r0, r3, r8 + umlal r2, r3, r2, r8 + orr r12, r12, r2 + + @ If r12 is non-zero then some non-zero bit was dropped and the + @ low bit of r2 must be forced to 1 ('sticky bit'). + rsbs r2, r12, #0 + orrs r2, r2, r12 + orrs r3, r3, r2, lsr #31 + + @ x: exponent=r4, sign=r5, mantissa=r6:r7 (scaled up 3 bits) + @ y: sign=r1, value=r3:r0 (scaled to same exponent as x) + + @ If x and y have the same sign (r1 = 0), then we add r3:r0 to r6:r7. + @ Otherwise (r1 = -1), we subtract r3:r0 from r6:r7. Both values are + @ less than 2^56, and output cannot be negative. + orr r2, r1, #1 @ r2 = 1 if r1 = 0, or -1 if r1 = -1 + umlal r6, r7, r3, r2 + muls r3, r1 + umaal r7, r3, r0, r2 + + @ result: exponent=r4, sign=r5, mantissa=r6:r7 (scaled up 3 bits) + @ Value in r6:r7 is necessarily less than 2^57. 
+ + @ Normalize the result with some left-shifting to full 64-bit + @ width. Shift count goes to r2, and exponent (r4) is adjusted. + clz r2, r7 + clz r3, r6 + sbfx r0, r2, #5, #1 + umlal r3, r2, r3, r0 + subs r4, r4, r2 + + @ Shift r6:r7 to the left by r2 bits. + @ If r2 >= 32, then r7 = 0 and r0 = -1, and we set: r6:r7 <- 0:r6 + umlal r6, r7, r6, r0 + @ Left-shift by r2 mod 32 + and r2, #31 + movs r1, #1 + lsls r1, r2 + umull r6, r12, r6, r1 + umlal r12, r7, r7, r1 + + @ Normalized mantissa is now in r6:r12 + @ Since the mantissa was at most 57-bit pre-normalization, the low + @ 7 bits of r6 must be zero. + + @ The exponent of x was in r4. The left-shift operation has + @ subtracted some value from it, 8 in case the result has the + @ same exponent as x. However, the high bit of the mantissa will + @ add 1 to the exponent, so we only add back 7 (the exponent is + @ added in because rounding might have produced a carry, which + @ should then spill into the exponent). + adds r4, #7 + + @ If the new mantissa is non-zero, then its bit 63 is non-zero + @ (thanks to the normalizing shift). Otherwise, that bit is + @ zero, and we should then set the exponent to zero as well. + ands r4, r4, r12, asr #31 + + @ We have a 64-bit value which we must shrink down to 53 bits, i.e. + @ removing the low 11 bits. Rounding must be applied. The low 12 + @ bits of r6 (in high-to-low order) are: + @ b4 b3 b2 b1 b0 0000000 + @ (as mentioned earlier, the lowest 7 bits must be zero) + @ After a strict right shift, b4 is the lowest bit. Rounding will + @ add +1 to the value if and only if: + @ - b4 = 0 and b3:b2:b1:b0 >= 1001 + @ - b4 = 1 and b3:b2:b1:b0 >= 1000 + @ Equivalently, we must add +1 after the shift if and only if: + @ b3:b2:b1:b0:b4 + 01111 >= 100000 + lsls r5, #31 @ sign of output is sign of x + orr r1, r5, r4, lsl #20 @ exponent and sign + lsls r3, r6, #21 @ top(r3) = b3:b2:b1:b0:00... + lsrs r0, r6, #11 + bfi r3, r0, #27, #1 @ top(r3) = b3:b2:b1:b0:b4:00... 
+ adds r3, r3, #0x78000000 @ add 01111 to top bits, carry is adjust + adcs r0, r0, r12, lsl #21 + adcs r1, r1, r12, lsr #11 + + @ If the mantissa in r6:r7 was zero, then r0:r1 contains zero at + @ this point, and the exponent r4 was cleared before, so there is + @ not need for further correcting actions. + + @pop { r4, r5, r6, r7, r8 } + vmov r4, r5, s0, s1 + vmov r6, r7, s2, s3 + vmov r8, s4 + bx lr + .size fndsa_fpr_add,.-fndsa_fpr_add + +@ ======================================================================= +@ fpr*2 fndsa_fpr_add_sub(fpr x, fpr y) +@ This function returns two 64-bit values: x+y in r0:r1, and x-y in r2:r3 +@ +@ This does not follow the AAPCS, hence the caller must be custom (inline) +@ assembly that specifies clobbers and dispatches the two results +@ appropriately. +@ Clobbers: r4, r5, r6, r7, r8, r10, r11, r12, r14, s15, flags +@ ======================================================================= + + .align 2 + .global fndsa_fpr_add_sub + .thumb + .thumb_func + .type fndsa_fpr_add_sub, %function +fndsa_fpr_add_sub: + @ Operands are in r0:r1 and r2:r3. We want to conditionally swap + @ them, so that x (r0:r1) has the greater absolute value of the two; + @ if both have the same absolute value and different signs, then + @ x should be positive. This ensures that the exponent of y is not + @ greater than that of x, and the result of the addition has the + @ sign of x. We must still remember whether a swap occurred, because + @ in that case the subtraction will compute y-x instead of x-y, + @ and we will have to negate the second output. + @ + @ Signs for zeros: for any z, z + (-z) and z - z should be +0, + @ never -0. 
The exact process is: + @ + @ swap <- false + @ if abs(x) < abs(y): + @ swap <- true + @ elif abs(x) == abs(y): + @ if is_neg(x): + @ swap <- true + @ if swap: + @ (x, y) <- (y, x) + @ a <- abs(x) + abs(y) + @ b <- abs(x) - abs(y) + @ sign(a) <- sign(x) + @ if swap: + @ sign(b) <- sign(x) + @ else: + @ sign(b) <- sign(-x) + @ + @ Indeed, if abs(x) = abs(y): + @ x y x+y x-y + @ + + + + no swap + @ + - + + no swap + @ - + + - swap + @ - - - + swap + @ + @ To ignore the sign bit in the comparison, we left-shift the high + @ word of both operands by 1 bit (this does not change the order of + @ the absolute values). To cover the case of two equal absolute + @ values, we inject the sign of x as an initial borrow (thus, if + @ the absolute values are equal but x is negative, then the + @ comparison will decide that x is "lower" and do the swap). We + @ leverage the fact that r1 cannot be 0xFFFFFFFF (it would mean that + @ x is a NaN), and therefore subtracting the word-extended sign bit + @ will produce the expected borrow. + lsls r7, r1, #1 @ Left-shift high word of x + subs r6, r1, r1, asr #31 @ Initial borrow if x is negative + sbcs r6, r0, r2 @ Sub: low words + sbcs r6, r7, r3, lsl #1 @ Sub: high words (with shift of y) + sbc r11, r11 @ r11 is set to 0xFFFFFFFF for a swap + uadd8 r4, r11, r11 + sel r6, r2, r0 + sel r7, r3, r1 + sel r2, r0, r2 + sel r3, r1, r3 + + @ Now x is in r6:r7, and y is in r2:r3. + + @ Extract mantissa of x into r6:r7, exponent in r4, sign in r5. + @ For the mantissa, we must set bit 52 to 1, except if the (encoded) + @ exponent is zero; in the latter case, the whole value must be zero + @ or minus zero (we do not support subnormals). + asrs r5, r7, #31 @ Sign bit (extended to whole word) + ubfx r4, r7, #20, #11 @ Exponent in r4 (without sign) + addw r8, r4, #2047 @ r8 >= 2048 if and only if r4 != 0 + lsrs r8, r8, #11 @ r8 = 1 except if r4 = 0 + bfi r7, r8, #20, #12 @ Set high mantissa bits + + @ Extract mantissa of y into r2:r3, exponent in r0. 
+ @ r1 receives the xor of the signs of x and y (extended). + eor r1, r5, r3, asr #31 + ubfx r0, r3, #20, #11 @ Exponent in r0 (without sign) + addw r8, r0, #2047 @ r8 >= 2048 if and only if r0 != 0 + lsrs r8, r8, #11 @ r8 = 1 except if r0 = 0 + bfi r3, r8, #20, #12 @ Set high mantissa bits + + @ Scale mantissas up by three bits (i.e. multiply both by 8). + mov r8, #7 + lsls r7, #3 + umlal r6, r7, r6, r8 + lsls r3, #3 + umlal r2, r3, r2, r8 + + @ x: exponent=r4, sign=r5, mantissa=r6:r7 (scaled up 3 bits) + @ y: exponent=r0, sign-xor=r1, mantissa=r2:r3 (scaled up 3 bits) + + @ At that point, the exponent of x (in r4) is larger than that + @ of y (in r0). The difference is the amount of shifting that + @ should be done on y. If that amount is larger than 59 then + @ we clamp y to 0. We won't need y's exponent beyond that point, + @ so we store that shift count in r0. + subs r0, r4, r0 + subs r8, r0, #60 + ands r2, r2, r8, asr #31 + ands r3, r3, r8, asr #31 + + @ Shift right r2:r3 by r0 bits (with result in r3:r0). The + @ shift count is in the 0..59 range. r12 will be non-zero if and + @ only if some non-zero bits were dropped. + + @ If r0 >= 32, then right-shift by 32 bits; r12 is set to the + @ dropped bits (or 0 if r0 < 32). + sbfx r8, r0, #5, #1 + and r12, r2, r8 + bic r2, r2, r8 + umlal r3, r2, r3, r8 + @ Right-shift by r0 mod 32 bits; dropped bits (from r3) are + @ accumulated into r12 (with OR). + and r0, r0, #31 + mov r8, #0xFFFFFFFF + lsr r8, r0 @ r8 <- 2^(32-sc) - 1 + eors r0, r0 + umlal r3, r0, r3, r8 + umlal r2, r3, r2, r8 + orr r12, r12, r2 + + @ If r12 is non-zero then some non-zero bit was dropped and the + @ low bit of r2 must be forced to 1 ('sticky bit'). + rsbs r2, r12, #0 + orrs r2, r2, r12 + orrs r3, r3, r2, lsr #31 + + @ x: exponent=r4, sign=r5, mantissa=r6:r7 (scaled up 3 bits) + @ y: sign=r1, value=r3:r0 (scaled to same exponent as x) + + @ Compute the sum (into r6:r7) and the difference (into r12:r8). 
+ subs r12, r6, r3 + sbcs r8, r7, r0 + adds r6, r6, r3 + adcs r7, r7, r0 + + @ Swap the values if r1 = -1. Second output goes to: r10:r12 + uadd8 r10, r1, r1 + sel r10, r6, r12 + sel r6, r12, r6 + sel r12, r7, r8 + sel r7, r8, r7 + + @ Save high word of second output (low word is kept in r10). + vmov s15, r12 + + @ Post-processing for first output + @ -------------------------------- + + @ result: exponent=r4, sign=r5, mantissa=r6:r7 (scaled up 3 bits) + @ Value in r6:r7 is necessarily less than 2^57. + + @ Normalize the result with some left-shifting to full 64-bit + @ width. Shift count goes to r2, and exponent (r4) is adjusted. + @ The adjusted exponent goes to r8 (we want to keep r4 untouched). + clz r2, r7 + clz r3, r6 + sbfx r0, r2, #5, #1 + umlal r3, r2, r3, r0 + sub r8, r4, r2 + + @ Shift r6:r7 to the left by r2 bits. + @ If r2 >= 32, then r7 = 0 and r0 = -1, and we set: r6:r7 <- 0:r6 + umlal r6, r7, r6, r0 + @ Left-shift by r2 mod 32 + and r2, #31 + movs r1, #1 + lsls r1, r2 + umull r6, r12, r6, r1 + umlal r12, r7, r7, r1 + + @ Normalized mantissa is now in r6:r12 + @ Since the mantissa was at most 57-bit pre-normalization, the low + @ 7 bits of r6 must be zero. + + @ The exponent of x was in r8. The left-shift operation has + @ subtracted some value from it, 8 in case the result has the + @ same exponent as x. However, the high bit of the mantissa will + @ add 1 to the exponent, so we only add back 7 (the exponent is + @ added in because rounding might have produced a carry, which + @ should then spill into the exponent). + add r8, r8, #7 + + @ If the new mantissa is non-zero, then its bit 63 is non-zero + @ (thanks to the normalizing shift). Otherwise, that bit is + @ zero, and we should then set the exponent to zero as well. + and r8, r8, r12, asr #31 + + @ We have a 64-bit value which we must shrink down to 53 bits, i.e. + @ removing the low 11 bits. Rounding must be applied. 
The low 12 + @ bits of r6 (in high-to-low order) are: + @ b4 b3 b2 b1 b0 0000000 + @ (as mentioned earlier, the lowest 7 bits must be zero) + @ After a strict right shift, b4 is the lowest bit. Rounding will + @ add +1 to the value if and only if: + @ - b4 = 0 and b3:b2:b1:b0 >= 1001 + @ - b4 = 1 and b3:b2:b1:b0 >= 1000 + @ Equivalently, we must add +1 after the shift if and only if: + @ b3:b2:b1:b0:b4 + 01111 >= 100000 + lsls r5, #31 @ sign of output is sign of x + orr r1, r5, r8, lsl #20 @ exponent and sign + lsls r3, r6, #21 @ top(r3) = b3:b2:b1:b0:00... + lsrs r0, r6, #11 + bfi r3, r0, #27, #1 @ top(r3) = b3:b2:b1:b0:b4:00... + adds r3, r3, #0x78000000 @ add 01111 to top bits, carry is adjust + adcs r0, r0, r12, lsl #21 + adcs r1, r1, r12, lsr #11 + + @ If the mantissa in r6:r7 was zero, then r0:r1 contains zero at + @ this point, and the exponent r8 was cleared before, so there is + @ not need for further correcting actions. + + @ Post-processing for second output + @ --------------------------------- + + @ Unprocessed second output is in r10:s15 + vmov r7, s15 + + @ result: exponent=r4, sign=r5 (top), mantissa=r10:r7 (scaled up 3 bits) + @ Value in r10:r7 is necessarily less than 2^57. + + @ Normalize the result with some left-shifting to full 64-bit + @ width. Shift count goes to r2, and exponent (r4) is adjusted. + clz r2, r7 + clz r3, r10 + sbfx r8, r2, #5, #1 + umlal r3, r2, r3, r8 + subs r4, r4, r2 + + @ Shift r10:r7 to the left by r2 bits (into r6:r12) + @ If r2 >= 32, then r7 = 0 and r8 = -1, and we set: r10:r7 <- 0:r10 + umlal r10, r7, r10, r8 + @ Left-shift by r2 mod 32 + and r2, #31 + movw r8, #1 + lsl r8, r2 + umull r6, r12, r10, r8 + umlal r12, r7, r7, r8 + + @ Normalized mantissa is now in r6:r12 + @ Since the mantissa was at most 57-bit pre-normalization, the low + @ 7 bits of r6 must be zero. + + @ The exponent of x was in r4. The left-shift operation has + @ subtracted some value from it, 8 in case the result has the + @ same exponent as x. 
However, the high bit of the mantissa will
+ @ add 1 to the exponent, so we only add back 7 (the exponent is
+ @ added in because rounding might have produced a carry, which
+ @ should then spill into the exponent).
+ adds r4, #7
+
+ @ If the new mantissa is non-zero, then its bit 63 is non-zero
+ @ (thanks to the normalizing shift). Otherwise, that bit is
+ @ zero, and we should then set the exponent to zero as well.
+ ands r4, r4, r12, asr #31
+
+ @ We have a 64-bit value which we must shrink down to 53 bits, i.e.
+ @ removing the low 11 bits. Rounding must be applied. The low 12
+ @ bits of r6 (in high-to-low order) are:
+ @ b4 b3 b2 b1 b0 0000000
+ @ (as mentioned earlier, the lowest 7 bits must be zero)
+ @ After a strict right shift, b4 is the lowest bit. Rounding will
+ @ add +1 to the value if and only if:
+ @ - b4 = 0 and b3:b2:b1:b0 >= 1001
+ @ - b4 = 1 and b3:b2:b1:b0 >= 1000
+ @ Equivalently, we must add +1 after the shift if and only if:
+ @ b3:b2:b1:b0:b4 + 01111 >= 100000
+ orr r7, r5, r4, lsl #20 @ exponent and sign
+ lsls r3, r6, #21 @ top(r3) = b3:b2:b1:b0:00...
+ lsr r8, r6, #11
+ bfi r3, r8, #27, #1 @ top(r3) = b3:b2:b1:b0:b4:00...
+ adds r3, r3, #0x78000000 @ add 01111 to top bits, carry is adjust
+ adcs r2, r8, r12, lsl #21
+ adcs r3, r7, r12, lsr #11
+
+ @ If there was an operand swap, then we should reverse the sign
+ @ of the second operand here. As described previously, this also
+ @ correctly handles situations involving zeros.
+ @ Swap flag (-1 or 0) is still in r11.
+ eor r3, r3, r11, lsl #31 + + bx lr + .size fndsa_fpr_add_sub,.-fndsa_fpr_add_sub + +@ ======================================================================= +@ fpr fndsa_fpr_mul(fpr x, fpr y) +@ ======================================================================= + + .align 2 + .global fndsa_fpr_mul + .thumb + .thumb_func + .type fndsa_fpr_mul, %function +fndsa_fpr_mul: + @push { r4, r5, r6, r7 } + vmov s0, s1, r4, r5 + vmov s2, s3, r6, r7 + + @ Extract mantissas: x.m = r0:r4, y.m = r2:r5 + @ We assume both operands are non-zero. + ubfx r4, r1, #0, #20 + ubfx r5, r3, #0, #20 + orr r4, r4, #0x00100000 + orr r5, r5, #0x00100000 + + @ Extract signs and exponent. We want to store the aggregate sign + @ (XOR of the two signs) in r1 (top bit, other bits cleared), + @ and in r3 the aggregate exponent. + ubfx r6, r1, #20, #11 + ubfx r7, r3, #20, #11 + eors r1, r3 + bfc r1, #0, #31 + adds r3, r6, r7 + sub r3, r3, #1024 + @ If either of the exponents is zero, then we clear the exponent + @ and the first mantissa, which will lead through all subsequent + @ computations to a zero result (except for the sign bit). + muls r6, r7 + rsbs r6, #0 + and r3, r3, r6, asr #31 + and r0, r0, r6, asr #31 + and r4, r4, r6, asr #31 + @ Move the exponent to its correct position in r1. + add r1, r1, r3, lsl #20 + @ r3 is now free. + + @ Compute mantissa product into r6:r7:r3:r0. + umull r6, r7, r0, r2 + umull r3, r0, r0, r5 + umaal r7, r3, r4, r2 + umaal r3, r0, r4, r5 + + @ r2, r4 and r5 are free. + + @ Product is in [2^104, 2^106 - 2^54 + 1]. We right-shift it + @ by 52 or 53 bits, into r5:r7, so that the output is in + @ [2^52, 2^53-1]. We must keep track of dropped bits so that we + @ may apply rounding properly. + @ Set r5 to 1 if we need to shift by 53, or to 0 otherwise. + @ If r5 is 1 then we must adjust the exponent. + lsrs r5, r0, #9 + add r1, r1, r5, lsl #20 + @ Set r4 to 2^11 (if r5 = 1) or 2^12 (if r5 = 0). 
We will use + @ it to perform a left shift by 11 or 12 bits, which is the same + @ as a right shift by 53 or 52 bits if we use the correct output + @ registers. + movw r4, #0x1000 + lsrs r4, r5 + @ r5 is now free. + @ Do the shift. Dropped bits are r6 (entire register) and r2 (top + @ bits, in order, rest of the register bits are zero). + umull r2, r5, r7, r4 + umull r7, r12, r0, r4 + umlal r5, r7, r3, r4 + + @ Rounding may need to add 1. The top bits of r2 are the top dropped + @ bits. We keep bit 31 as is, then compact all other dropped bits + @ into bit 30 ("sticky bit") and finally push a copy of the least + @ significant kept bit (lowest bit of r5) into bit 29 of r2. + orr r6, r6, r2, lsl #1 + clz r6, r6 @ 32 if all bits are 0 + mvns r6, r6, lsr #5 + bfi r2, r6, #30, #1 + bfi r2, r5, #29, #1 + @ By adding 011 to the top bits of r2, we generate the rounding + @ adjustment into the carry, which we can then apply to the + @ mantissa. + adds r2, r2, #0x60000000 + adcs r0, r5, #0 + adcs r1, r7 + + @pop { r4, r5, r6, r7 } + vmov r4, r5, s0, s1 + vmov r6, r7, s2, s3 + bx lr + .size fndsa_fpr_mul,.-fndsa_fpr_mul + +@ ======================================================================= +@ fpr fndsa_fpr_div(fpr x, fpr y) +@ ======================================================================= + + .align 2 + .global fndsa_fpr_div + .thumb + .thumb_func + .type fndsa_fpr_div, %function +fndsa_fpr_div: + push { r4, r5, r6, r7, r8, r10, r11, r14 } + + @ Save high words of inputs (signs, exponents). + vmov s0, r1 + vmov s1, r3 + + @ Extract mantissas (assuming values are non-zero). + @ r0:r1 <- x.m + @ r2:r3 <- y.m + ubfx r1, r1, #0, #20 + ubfx r3, r3, #0, #20 + orr r1, r1, #0x00100000 + orr r3, r3, #0x00100000 + + @ Bit-by-bit division of the mantissas: we run it for 55 iterations + @ then append an extra 56-th sticky bit (non-zero if the remainder + @ is not zero at this point). Quotient goes to r10:r12. 
+ eor r10, r10 + + @ For divisor mantissa y.m, we prepare the following: + @ r2:r3 y.m*2 + @ r4 hi(y.m*4) + @ r5 hi(y.m*8) + @ r6 hi(y.m*16) + @ r7:r8 -(y.m*2) + adds r2, r2 + adcs r3, r3 + adds r7, r2, r2 + adcs r4, r3, r3 + adds r7, r7 + adcs r5, r4, r4 + adds r7, r7 + adcs r6, r5, r5 + subs r7, r10, r2 + sbcs r8, r10, r3 + + mov r12, #15 +.macro DIVIDEND_MUL16 + lsls r1, #4 + umlal r0, r1, r0, r12 +.endm + mov r14, #2 + + @ Parameter sh is 1, 2, 3 or 4. + @ DIVSTEP_SH takes current dividend in r0:r1 and assumes that it + @ is left-shifted by sh bits compared to its theoretical value. + @ Divisor is subtracted (if possible), yielding the next quotient + @ bit, which is pushed into r10. After the conditional subtraction, + @ the dividend is formally left-shifted by 1 bit, but this macro + @ omits the shift. +.macro DIVSTEP_SH sh + @ Check whether the divisor can be subtracted; we must use the + @ properly shifted divisor to match the dividend shift. + subs r11, r0, r2, lsl #(\sh) + .if (\sh) == 1 + sbcs r11, r1, r3 + .elseif (\sh) == 2 + sbcs r11, r1, r4 + .elseif (\sh) == 3 + sbcs r11, r1, r5 + .else + sbcs r11, r1, r6 + .endif + @ Inject next quotient bit in r10. Also extract that bit into r11, + @ left-shifted by sh-1 bits (r7:r8 is negation of a shifted divisor). + adcs r10, r10 + .if (\sh) == 2 + and r11, r14, r10, lsl #1 + .else + and r11, r10, #1 + .if (\sh) != 1 + lsl r11, r11, #((\sh) - 1) + .endif + .endif + @ Subtract the divisor conditionally on the quotient bit. + umlal r0, r1, r7, r11 + umlal r1, r11, r8, r11 +.endm + + @ Four successive division steps. +.macro DIVSTEP4 + DIVIDEND_MUL16 + DIVSTEP_SH 4 + DIVSTEP_SH 3 + DIVSTEP_SH 2 + DIVSTEP_SH 1 +.endm + + @ Eight successive division steps. +.macro DIVSTEP8 + DIVSTEP4 + DIVSTEP4 +.endm + + @ First 24 iterations to get the upper 24 quotient bits. + DIVSTEP8 + DIVSTEP8 + DIVSTEP8 + + @ Save upper quotient bits. + vmov s2, r10 + + @ 31 iterations for the next bits.
+ DIVSTEP8 + DIVSTEP8 + DIVSTEP8 + DIVSTEP4 + DIVIDEND_MUL16 + DIVSTEP_SH 4 + DIVSTEP_SH 3 + DIVSTEP_SH 2 + + @ Current remainder is in r0:r1 (left-shifted by 1 bit). If it is + @ non-zero then we must set the last bit of the quotient (sticky bit). + subs r0, #1 + sbcs r1, #0 + adcs r10, r10 + + @ Restore upper quotient bits into r12. + vmov r12, s2 + + @ We have a quotient q in r10:r12, with value up to 2^56-1. It cannot + @ be lower than 2^54, since both operands were in [2^52, 2^53-1]. + @ This is a situation similar to that of multiplication. We + @ normalize r10:r12 to 2^54..2^55-1 (into r6:r7) with a conditional + @ shift (low bit is sticky). r5 contains -1 if the shift was done, + @ 0 otherwise. + sbfx r5, r12, #23, #1 + subs r4, r5, #1 + rors r4, #1 + eors r7, r7 + umlal r12, r7, r12, r4 + umlal r10, r12, r10, r4 + orr r6, r12, r10, lsr #31 @ dropped bit is sticky + + @ We recover source top words into r1 and r3. r5 contains the extra + @ shift flag. r6:r7 is the 55-bit output mantissa. Other registers + @ are free. + vmov r1, s0 + vmov r3, s1 + + @ Extract source exponents ex and ey (encoded) into r0 and r2. + @ Also set r4 to a negative value if x = 0, or to 0 otherwise + @ (by our assumptions, divisor y is non-zero). + ubfx r0, r1, #20, #11 + ubfx r2, r3, #20, #11 + subs r4, r0, #1 + + @ Compute aggregate exponent: ex - ey + 1022 + w + @ (where w = 1 if the conditional shift was done, 0 otherwise) + @ But we subtract 1 because the injection of the mantissa high + @ bit will increment the exponent by 1. + subs r2, r0, r2 + add r2, r2, #1021 + subs r2, r2, r5 + + @ If dividend is zero, then clamp mantissa and aggregate exponent + @ to zero. + bic r2, r2, r4, asr #31 + bic r6, r6, r4, asr #31 + bic r7, r7, r4, asr #31 + + @ Sign is the XOR of the sign of the operands. This is true in + @ all cases, including very small results (exponent underflow) + @ and zeros. + eors r1, r3 + bfc r1, #0, #31 + + @ Plug in the exponent. 
+ bfi r1, r2, #20, #11 + + @ r2 and r3 are free. + @ Shift back to the normal 53-bit mantissa, with rounding. + @ Mantissa goes into r0:r1. r1 already contains the exponent and + @ sign bit; we must do an addition, which will also cover the case + @ of a carry (from rounding) spilling into the exponent. + @ Rounding adds 1 to the shifted mantissa when the three low bits + @ of the mantissa (before the shift) are 011, 110 or 111, i.e. + @ exactly when: (bit0 and bit1) or (bit1 and bit2) = 1. + and r3, r6, r6, lsr #1 + orr r3, r3, r3, lsr #1 + and r0, r3, #1 + add r0, r0, r6, lsr #2 + adds r0, r0, r7, lsl #30 + adcs r1, r1, r7, lsr #2 + + pop { r4, r5, r6, r7, r8, r10, r11, pc } + .size fndsa_fpr_div,.-fndsa_fpr_div + +@ ======================================================================= +@ fpr fndsa_fpr_sqrt(fpr x) +@ ======================================================================= + + .align 2 + .global fndsa_fpr_sqrt + .thumb + .thumb_func + .type fndsa_fpr_sqrt, %function +fndsa_fpr_sqrt: + push { r4, r5, r6, r7, r8, r10 } + + @ Extract exponent and mantissa. By assumption, the operand is + @ non-negative, hence we ignore the sign bit (sign bit could be 1 + @ if the operand is minus zero). We also decode the exponent + @ corresponding to a mantissa between 1 and 2. + @ For now, we suppose that the source is not zero. + @ r0:r1 <- mantissa + @ r12 <- encoded exponent + @ r2 <- decoded exponent + ubfx r12, r1, #20, #11 + sub r2, r12, #1023 + bfc r1, #20, #12 + orr r1, r1, #0x00100000 + + @ If the exponent is odd, then multiply mantissa by 2 and subtract 1 + @ from the exponent. + sbfx r3, r2, #0, #1 + and r4, r0, r3 + and r5, r1, r3 + adds r0, r4 + adcs r1, r5 + adds r2, r3 + + @ Exponent is now even, we can halve it. + asrs r2, #1 + + @ Left-shift the mantissa so that it is in [2^61, 2^63-1]. This + @ allows performing the first 30 iterations with some shortcuts + @ (one-word operations). 
+ lsls r1, r1, #9 + orr r1, r1, r0, lsr #23 + lsls r0, r0, #9 + + @ r0:r1 is an integer between 1 (inclusive) and 4 (exclusive) in + @ a fixed-point notation (53 fractional bits). We compute the + @ square root bit by bit (54 iterations). We'll then append an + @ extra sticky bit. + eors r3, r3 + eors r5, r5 + +.macro SQRTSTEP_HI bit + orr r6, r5, #(1 << (\bit)) + subs r7, r1, r6 + rrx r3, r3 + and r6, r6, r3, asr #31 + subs r1, r1, r6 + lsrs r6, r3, #31 + orr r5, r5, r6, lsl #((\bit) + 1) + adds r0, r0 + adcs r1, r1 +.endm + +.macro SQRTSTEP_HI_x5 bb + SQRTSTEP_HI ((\bb) + 4) + SQRTSTEP_HI ((\bb) + 3) + SQRTSTEP_HI ((\bb) + 2) + SQRTSTEP_HI ((\bb) + 1) + SQRTSTEP_HI ((\bb) + 0) +.endm + + SQRTSTEP_HI_x5 25 + SQRTSTEP_HI_x5 20 + SQRTSTEP_HI_x5 15 + SQRTSTEP_HI_x5 10 + SQRTSTEP_HI_x5 5 + SQRTSTEP_HI_x5 0 + + @ We got top 30 bits of the result, in reverse order. + rbit r3, r3 + + @ For the next 24 iterations, we must use two-word operations. + @ First iteration is special because the potential bit goes into + @ r5, not r6. + eors r4, r4 + eors r6, r6 + + orr r7, r6, #(1 << 31) + subs r8, r0, r7 + sbcs r10, r1, r5 + rrx r4, r4 + and r7, r7, r4, asr #31 + and r8, r5, r4, asr #31 + subs r0, r0, r7 + sbcs r1, r1, r8 + lsrs r7, r4, #31 + orr r5, r5, r4, lsr #31 + adds r0, r0 + adcs r1, r1 + +.macro SQRTSTEP_LO bit + orr r7, r6, #(1 << (\bit)) + subs r8, r0, r7 + sbcs r10, r1, r5 + rrx r4, r4 + and r7, r7, r4, asr #31 + and r8, r5, r4, asr #31 + subs r0, r0, r7 + sbcs r1, r1, r8 + lsrs r7, r4, #31 + orr r6, r6, r7, lsl #((\bit) + 1) + adds r0, r0 + adcs r1, r1 +.endm + +.macro SQRTSTEP_LO_x4 bb + SQRTSTEP_LO ((\bb) + 3) + SQRTSTEP_LO ((\bb) + 2) + SQRTSTEP_LO ((\bb) + 1) + SQRTSTEP_LO ((\bb) + 0) +.endm + + SQRTSTEP_LO 30 + SQRTSTEP_LO 29 + SQRTSTEP_LO 28 + SQRTSTEP_LO_x4 24 + SQRTSTEP_LO_x4 20 + SQRTSTEP_LO_x4 16 + SQRTSTEP_LO_x4 12 + SQRTSTEP_LO_x4 8 + + @ Put low 24 bits in the right order. 
+ rbit r4, r4 + + @ We now have a 54-bit result (low 24 bits in r4, top 30 bits in r3). + @ We need to round the value; the sticky bit is implicit (it is 1 if + @ the remainder in r0:r1 is non-zero at this point). + orrs r0, r1 + rsbs r1, r0, #0 + orrs r0, r1 @ sticky bit is in r0[31] + and r0, r4, r0, lsr #31 + and r1, r4, r4, lsr #1 + orrs r0, r1 + ands r0, #1 @ r0 contains the rounding adjustment + lsrs r1, r3, #9 + add r0, r0, r4, lsr #1 + adds r0, r0, r3, lsl #23 + adcs r1, #0 + + @ We have a rounded mantissa (including its top bit). We plug the + @ exponent, which is currently in r2 in decoded format. Since the + @ mantissa top bit is present, we encode r2 by adding 1022. + add r2, #1022 + add r1, r1, r2, lsl #20 + + @ We have the result, except if the source operand was zero, in + @ which case we must clamp the value to 0. Original exponent + @ (encoded) is still in r12. + rsb r3, r12, #0 + and r0, r0, r3, asr #31 + and r1, r1, r3, asr #31 + + pop { r4, r5, r6, r7, r8, r10 } + bx lr + .size fndsa_fpr_sqrt,.-fndsa_fpr_sqrt diff --git a/crypto_sign/fndsa_provisional-512/m4f/sign_inner.h b/crypto_sign/fndsa_provisional-512/m4f/sign_inner.h new file mode 120000 index 00000000..b36c72da --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/sign_inner.h @@ -0,0 +1 @@ +../../../mupq/crypto_sign/fndsa_provisional-512/ref/sign_inner.h \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-512/m4f/sign_sampler.c b/crypto_sign/fndsa_provisional-512/m4f/sign_sampler.c new file mode 120000 index 00000000..7ed648ae --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/sign_sampler.c @@ -0,0 +1 @@ +../../../mupq/crypto_sign/fndsa_provisional-512/ref/sign_sampler.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-512/m4f/sign_sampler_cm4.s b/crypto_sign/fndsa_provisional-512/m4f/sign_sampler_cm4.s new file mode 100644 index 00000000..fe19b6a4 --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/sign_sampler_cm4.s @@ -0,0 
+1,143 @@ + .syntax unified + .cpu cortex-m4 + .file "sign_sampler_cm4.s" + .text + +@ ======================================================================= +@ int32_t fndsa_gaussian0_helper(uint64_t lo, uint32_t hi) +@ ======================================================================= + + .align 2 + .global fndsa_gaussian0_helper + .thumb + .thumb_func + .type fndsa_gaussian0_helper, %function +fndsa_gaussian0_helper: + push.w { r4, r5, r6, r7, r8 } + + adr.w r12, fndsa_gaussian0_helper__gauss0_low + + @ 0 and 1 + ldm r12!, { r4, r5, r6, r7 } + subs r8, r0, r4 + sbcs r8, r1, r5 + sbcs r8, r2, #163 @ high[0] + lsr.w r3, r8, #31 + subs r8, r0, r6 + sbcs r8, r1, r7 + sbcs r8, r2, #84 @ high[1] + add.w r3, r3, r8, lsr #31 + + @ 2 and 3 + ldm r12!, { r4, r5, r6, r7 } + subs r8, r0, r4 + sbcs r8, r1, r5 + sbcs r8, r2, #34 @ high[2] + add.w r3, r3, r8, lsr #31 + subs r8, r0, r6 + sbcs r8, r1, r7 + sbcs r8, r2, #10 @ high[3] + add.w r3, r3, r8, lsr #31 + + @ 4 and 5 + ldm r12!, { r4, r5, r6, r7 } + subs r8, r0, r4 + sbcs r8, r1, r5 + sbcs r8, r2, #2 @ high[4] + add.w r3, r3, r8, lsr #31 + subs r8, r0, r6 + sbcs r8, r1, r7 + sbcs r8, r2, #0 @ high[5] + add.w r3, r3, r8, lsr #31 + + @ 6 and 7 + ldm r12!, { r4, r5, r6, r7 } + subs r8, r0, r4 + sbcs r8, r1, r5 + sbcs r8, r2, #0 @ high[6] + add.w r3, r3, r8, lsr #31 + subs r8, r0, r6 + sbcs r8, r1, r7 + sbcs r8, r2, #0 @ high[7] + add.w r3, r3, r8, lsr #31 + + @ 8 and 9 + ldm r12!, { r4, r5, r6, r7 } + subs r8, r0, r4 + sbcs r8, r1, r5 + sbcs r8, r2, #0 @ high[8] + add.w r3, r3, r8, lsr #31 + subs r8, r0, r6 + sbcs r8, r1, r7 + sbcs r8, r2, #0 @ high[9] + add.w r3, r3, r8, lsr #31 + + @ 10, 11 and 12 + ldm r12!, { r4, r5, r6, r7 } + subs r8, r0, r4 + sbcs r8, r1, r5 + sbcs r8, r2, #0 @ high[10] + add.w r3, r3, r8, lsr #31 + subs r8, r0, r6 + sbcs r8, r1, #148 @ mid[11] + sbcs r8, r2, #0 @ high[11] + add.w r3, r3, r8, lsr #31 + subs r8, r0, r7 + sbcs r8, r1, #3 @ mid[12] + sbcs r8, r2, #0 @ high[12] + add.w r3, r3, r8, 
lsr #31 + + @ 13, 14, 15, 16 + ldm r12!, { r4, r5, r6, r7 } + subs r8, r0, r4 + sbcs r8, r1, #0 @ mid[13] + sbcs r8, r2, #0 @ high[13] + add.w r3, r3, r8, lsr #31 + subs r8, r0, r5 + sbcs r8, r1, #0 @ mid[14] + sbcs r8, r2, #0 @ high[14] + add.w r3, r3, r8, lsr #31 + subs r8, r0, r6 + sbcs r8, r1, #0 @ mid[15] + sbcs r8, r2, #0 @ high[15] + add.w r3, r3, r8, lsr #31 + subs r8, r0, r7 + sbcs r8, r1, #0 @ mid[16] + sbcs r8, r2, #0 @ high[16] + add.w r3, r3, r8, lsr #31 + + @ 17 + ldr.w r4, [r12] + subs r8, r0, r4 + sbcs r8, r1, #0 @ mid[17] + sbcs r8, r2, #0 @ high[17] + add.w r3, r3, r8, lsr #31 + + mov.w r0, r3 + pop { r4, r5, r6, r7, r8 } + bx lr + .align 3 +fndsa_gaussian0_helper__gauss0_low: + @ This is the RCDT table from the specification. Only the low 64 bits + @ of each value are stored here; the high 8 bits are provided in + @ comments but otherwise hardcoded in the instructions above. + .word 2889422850, 4159975123 @ high: 163 + .word 1065212802, 3542816799 @ high: 84 + .word 1210696191, 2110640275 @ high: 34 + .word 3348712164, 3514123127 @ high: 10 + .word 4081000303, 2508483758 @ high: 2 + .word 3983850847, 2001389396 @ high: 0 + .word 729246436, 270851412 @ high: 0 + .word 1705862106, 27394012 @ high: 0 + .word 2323342376, 2064600 @ high: 0 + .word 2986609769, 115709 @ high: 0 + .word 617624059, 4815 @ high: 0 + @ Starting at value 11, we only store the low 32 bits. 
+ .word 2676689183 @ mid: 148 high: 0 + .word 1717414296 @ mid: 3 high: 0 + .word 247426747 @ mid: 0 high: 0 + .word 3104126 @ mid: 0 high: 0 + .word 28824 @ mid: 0 high: 0 + .word 198 @ mid: 0 high: 0 + .word 1 @ mid: 0 high: 0 + .size fndsa_gaussian0_helper,.-fndsa_gaussian0_helper diff --git a/crypto_sign/fndsa_provisional-512/m4f/sysrng.c b/crypto_sign/fndsa_provisional-512/m4f/sysrng.c new file mode 120000 index 00000000..fa7fb5a7 --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/sysrng.c @@ -0,0 +1 @@ +../../../mupq/crypto_sign/fndsa_provisional-512/ref/sysrng.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-512/m4f/util.c b/crypto_sign/fndsa_provisional-512/m4f/util.c new file mode 120000 index 00000000..6736b902 --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/util.c @@ -0,0 +1 @@ +../../../mupq/crypto_sign/fndsa_provisional-512/ref/util.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-512/m4f/vrfy.c b/crypto_sign/fndsa_provisional-512/m4f/vrfy.c new file mode 120000 index 00000000..da558d38 --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/vrfy.c @@ -0,0 +1 @@ +../../../mupq/crypto_sign/fndsa_provisional-512/ref/vrfy.c \ No newline at end of file diff --git a/mupq b/mupq index 61850b39..27157f09 160000 --- a/mupq +++ b/mupq @@ -1 +1 @@ -Subproject commit 61850b39de4add2616b20a440a7376b6d4e396de +Subproject commit 27157f09c3e450100101ef9f1cd7167131402a3a diff --git a/skiplist.py b/skiplist.py index 4ae65098..5757114f 100644 --- a/skiplist.py +++ b/skiplist.py @@ -59,19 +59,11 @@ {'scheme': 'cross-sha3-r-sdpg-3-small', 'implementation': 'ref', 'estmemory': 776192}, {'scheme': 'cross-sha3-r-sdpg-5-fast', 'implementation': 'ref', 'estmemory': 440320}, {'scheme': 'cross-sha3-r-sdpg-5-small', 'implementation': 'ref', 'estmemory': 1063936}, - {'scheme': 'falcon-1024', 'implementation': 'clean', 'estmemory': 91136}, - {'scheme': 'falcon-1024', 'implementation': 'm4-ct', 'estmemory': 89088}, - 
{'scheme': 'falcon-1024', 'implementation': 'opt-ct', 'estmemory': 89088}, - {'scheme': 'falcon-1024', 'implementation': 'opt-leaktime', 'estmemory': 90112}, - {'scheme': 'falcon-1024-tree', 'implementation': 'opt-ct', 'estmemory': 185344}, - {'scheme': 'falcon-1024-tree', 'implementation': 'opt-leaktime', 'estmemory': 186368}, - {'scheme': 'falcon-512', 'implementation': 'clean', 'estmemory': 48128}, - {'scheme': 'falcon-512', 'implementation': 'm4-ct', 'estmemory': 46080}, - {'scheme': 'falcon-512', 'implementation': 'opt-ct', 'estmemory': 46080}, - {'scheme': 'falcon-512', 'implementation': 'opt-leaktime', 'estmemory': 47104}, - {'scheme': 'falcon-512-tree', 'implementation': 'm4-ct', 'estmemory': 90112}, - {'scheme': 'falcon-512-tree', 'implementation': 'opt-ct', 'estmemory': 90112}, - {'scheme': 'falcon-512-tree', 'implementation': 'opt-leaktime', 'estmemory': 91136}, + # skip outdated Falcon implementations from PQClean (see https://github.com/mupq/pqm4/pull/377) + {'scheme': 'falcon-1024', 'implementation': 'clean', 'estmemory': 999999999999}, + {'scheme': 'falcon-512', 'implementation': 'clean', 'estmemory': 999999999999}, + {'scheme': 'falcon-padded-1024', 'implementation': 'clean', 'estmemory': 999999999999}, + {'scheme': 'falcon-padded-512', 'implementation': 'clean', 'estmemory': 999999999999}, {'scheme': 'haetae2', 'implementation': 'm4f', 'estmemory': 60416}, {'scheme': 'haetae2', 'implementation': 'ref', 'estmemory': 59392}, {'scheme': 'haetae3', 'implementation': 'm4f', 'estmemory': 90112}, @@ -213,8 +205,6 @@ {'scheme': 'sphincs-shake-192s-simple', 'implementation': 'clean', 'estmemory': 22528}, {'scheme': 'sphincs-shake-256f-simple', 'implementation': 'clean', 'estmemory': 59392}, {'scheme': 'sphincs-shake-256s-simple', 'implementation': 'clean', 'estmemory': 38912}, - {'scheme': 'falcon-padded-1024', 'implementation': 'clean', 'estmemory': 91136}, - {'scheme': 'falcon-padded-512', 'implementation': 'clean', 'estmemory': 48128}, {'scheme': 
'ml-dsa-87', 'implementation': 'm4fstack', 'estmemory': 21504}, {'scheme': 'ml-dsa-87', 'implementation': 'm4f', 'estmemory': 129024}, {'scheme': 'ml-dsa-65', 'implementation': 'm4fstack', 'estmemory': 17408}, @@ -224,4 +214,8 @@ {'scheme': 'ml-dsa-87', 'implementation': 'clean', 'estmemory': 136192}, {'scheme': 'ml-dsa-65', 'implementation': 'clean', 'estmemory': 90112}, {'scheme': 'ml-dsa-44', 'implementation': 'clean', 'estmemory': 59392}, + {'scheme': 'fndsa_provisional-1024', 'implementation': 'ref', 'estmemory': 89088}, + {'scheme': 'fndsa_provisional-512', 'implementation': 'ref', 'estmemory': 46080}, + {'scheme': 'fndsa_provisional-1024', 'implementation': 'm4f', 'estmemory': 89088}, + {'scheme': 'fndsa_provisional-512', 'implementation': 'm4f', 'estmemory': 46080}, ]