diff --git a/benchmarks.csv b/benchmarks.csv index 4bc2981a..5e3ba323 100644 --- a/benchmarks.csv +++ b/benchmarks.csv @@ -52,21 +52,10 @@ cross-sha3-r-sdpg-1-fast (10 executions),ref,290136,287742,297758,29963868,29960 cross-sha3-r-sdpg-1-small (10 executions),ref,290135,287741,297757,102853622,102847774,102861948,75137510,75126803,75159685 cross-sha3-r-sdpg-3-fast (10 executions),ref,627948,625525,637639,43573841,43565461,43582933,27513830,27493024,27525746 cross-sha3-r-sdpg-5-fast (10 executions),ref,1146280,1142409,1153794,93557878,93547167,93566329,59948216,59857434,60043852 -falcon-1024 (10 executions),clean,602066436,377135260,1488065363,136241759,136017549,136556585,1678109,1677732,1678566 -falcon-1024 (10 executions),m4-ct,408725773,314885208,712370124,87706019,87549942,87839508,990541,984448,997160 -falcon-1024 (10 executions),opt-ct,448194494,301446952,784390745,87699336,87550679,87857833,992822,983184,998271 -falcon-1024 (10 executions),opt-leaktime,371539477,261831977,576613448,80134413,79844667,80338608,992815,982774,998600 -falcon-1024-tree (10 executions),opt-ct,469168139,341160847,733947155,39197559,39095597,39392055,995190,984826,998305 -falcon-1024-tree (10 executions),opt-leaktime,418213501,284879287,699555143,42181577,41844047,42456098,991791,983935,997742 -falcon-512 (10 executions),clean,229742458,134930383,358460785,62255726,62124149,62424751,834970,834402,835533 -falcon-512 (10 executions),m4-ct,146357328,106015844,250638532,40191597,40123901,40381630,482280,472137,485160 -falcon-512 (10 executions),opt-ct,168942163,106015882,258726842,40136012,40046972,40195851,481102,472809,485947 -falcon-512 (10 executions),opt-leaktime,130638983,94352160,240934147,37196341,36969717,37564986,476152,471514,484487 -falcon-512-tree (10 executions),m4-ct,187840863,121618909,531189026,18199972,18111179,18297541,479819,472890,485685 -falcon-512-tree (10 executions),opt-ct,179501018,121618960,347996956,18222471,18064774,18329860,479635,472057,484767 
-falcon-512-tree (10 executions),opt-leaktime,203618838,106760540,425495750,20110699,19752157,20375122,480119,472263,485743 -falcon-padded-1024 (10 executions),clean,464867653,351942875,908060882,136157961,135988344,136430038,1677719,1677506,1677932 -falcon-padded-512 (10 executions),clean,241548154,164862595,348699388,62231774,62096573,62365088,834766,834480,834957 +fndsa_provisional-1024 (10 executions),m4f,308608613,195536229,763483542,48321135,48158227,48398718,793856,782604,799182 +fndsa_provisional-1024 (10 executions),ref,274928016,217869128,448029028,107512779,107106716,107788566,1461795,1444739,1469817 +fndsa_provisional-512 (10 executions),m4f,67693338,57825106,81542705,22469685,22280159,22594542,396949,390368,406553 +fndsa_provisional-512 (10 executions),ref,85699591,64822516,132207505,49522949,49325465,49631598,731387,714301,738870 haetae2 (100 executions),m4f,6743278,1555292,25393506,21993963,4721290,86765689,918459,918244,918668 haetae2 (100 executions),ref,9363639,1716264,41895014,31631089,6247382,216853925,1104080,1103874,1104329 haetae3 (100 executions),m4f,12925388,2752846,52240529,30891994,7467529,160522018,1760745,1760408,1761081 @@ -205,21 +194,10 @@ cross-sha3-r-sdpg-1-fast,ref,2328,130928,69560,,,,,, cross-sha3-r-sdpg-1-small,ref,2328,466400,245512,,,,,, cross-sha3-r-sdpg-3-fast,ref,4032,205080,108236,,,,,, cross-sha3-r-sdpg-5-fast,ref,6824,398600,213436,,,,,, -falcon-1024,clean,35076,84604,8776,,,,,, -falcon-1024,m4-ct,1156,2508,376,,,,,, -falcon-1024,opt-ct,1204,2508,376,,,,,, -falcon-1024,opt-leaktime,1252,2580,444,,,,,, -falcon-1024-tree,opt-ct,1148,2884,376,,,,,, -falcon-1024-tree,opt-leaktime,1196,2988,376,,,,,, -falcon-512,clean,18180,43548,4680,,,,,, -falcon-512,m4-ct,1148,2428,376,,,,,, -falcon-512,opt-ct,1244,2428,376,,,,,, -falcon-512,opt-leaktime,1148,2492,376,,,,,, -falcon-512-tree,m4-ct,1172,2636,376,,,,,, -falcon-512-tree,opt-ct,1156,2636,376,,,,,, -falcon-512-tree,opt-leaktime,1196,2828,376,,,,,, 
-falcon-padded-1024,clean,34988,84596,8776,,,,,, -falcon-padded-512,clean,18092,43540,4680,,,,,, +fndsa_provisional-1024,m4f,27772,81992,5024,,,,,, +fndsa_provisional-1024,ref,27676,82276,5308,,,,,, +fndsa_provisional-512,m4f,14348,41952,2976,,,,,, +fndsa_provisional-512,ref,14380,42124,3260,,,,,, haetae2,m4f,19756,55568,23296,,,,,, haetae2,ref,26092,54444,29696,,,,,, haetae3,m4f,29596,83420,31784,,,,,, @@ -359,21 +337,10 @@ cross-sha3-r-sdpg-1-fast,ref,71.8,74.8,77.1,,,,,, cross-sha3-r-sdpg-1-small,ref,71.8,74.7,78.4,,,,,, cross-sha3-r-sdpg-3-fast,ref,71.7,68.2,68.7,,,,,, cross-sha3-r-sdpg-5-fast,ref,71.1,66.1,66.8,,,,,, -falcon-1024,clean,8.9,0.3,23.7,,,,,, -falcon-1024,m4-ct,8.6,0.4,32.2,,,,,, -falcon-1024,opt-ct,9.8,0.4,32.2,,,,,, -falcon-1024,opt-leaktime,10.9,0.5,32.2,,,,,, -falcon-1024-tree,opt-ct,9.2,0.9,32.3,,,,,, -falcon-1024-tree,opt-leaktime,10.6,0.9,32.3,,,,,, -falcon-512,clean,7.9,0.4,26.0,,,,,, -falcon-512,m4-ct,13.7,0.5,33.9,,,,,, -falcon-512,opt-ct,14.0,0.5,33.2,,,,,, -falcon-512,opt-leaktime,17.3,0.5,33.6,,,,,, -falcon-512-tree,m4-ct,12.6,1.1,33.7,,,,,, -falcon-512-tree,opt-ct,14.6,1.1,34.2,,,,,, -falcon-512-tree,opt-leaktime,20.5,1.0,34.3,,,,,, -falcon-padded-1024,clean,7.3,0.3,23.7,,,,,, -falcon-padded-512,clean,16.0,0.4,26.0,,,,,, +fndsa_provisional-1024,m4f,0.0,0.0,0.0,,,,,, +fndsa_provisional-1024,ref,0.0,0.0,0.0,,,,,, +fndsa_provisional-512,m4f,0.0,0.0,0.0,,,,,, +fndsa_provisional-512,ref,0.0,0.0,0.0,,,,,, haetae2,m4f,12.4,56.7,54.1,,,,,, haetae2,ref,10.6,42.4,45.1,,,,,, haetae3,m4f,14.6,56.6,57.1,,,,,, @@ -512,21 +479,10 @@ cross-sha3-r-sdpg-1-fast,ref,18605,0,208,18813,,,,, cross-sha3-r-sdpg-1-small,ref,18846,0,208,19054,,,,, cross-sha3-r-sdpg-3-fast,ref,19689,0,208,19897,,,,, cross-sha3-r-sdpg-5-fast,ref,18593,0,208,18801,,,,, -falcon-1024,clean,82703,0,0,82703,,,,, -falcon-1024,m4-ct,81825,0,79872,161697,,,,, -falcon-1024,opt-ct,81825,0,79872,161697,,,,, -falcon-1024,opt-leaktime,75429,0,79872,155301,,,,, 
-falcon-1024-tree,opt-ct,81569,0,55296,136865,,,,, -falcon-1024-tree,opt-leaktime,75173,0,55296,130469,,,,, -falcon-512,clean,82663,0,0,82663,,,,, -falcon-512,m4-ct,81825,0,39936,121761,,,,, -falcon-512,opt-ct,81825,0,39936,121761,,,,, -falcon-512,opt-leaktime,75429,0,39936,115365,,,,, -falcon-512-tree,m4-ct,81569,0,27648,109217,,,,, -falcon-512-tree,opt-ct,81569,0,27648,109217,,,,, -falcon-512-tree,opt-leaktime,75173,0,27648,102821,,,,, -falcon-padded-1024,clean,82643,0,0,82643,,,,, -falcon-padded-512,clean,82599,0,0,82599,,,,, +fndsa_provisional-1024,m4f,103801,0,0,103801,,,,, +fndsa_provisional-1024,ref,103089,0,0,103089,,,,, +fndsa_provisional-512,m4f,103789,0,0,103789,,,,, +fndsa_provisional-512,ref,103077,0,0,103077,,,,, haetae2,m4f,35708,0,0,35708,,,,, haetae2,ref,25568,0,0,25568,,,,, haetae3,m4f,35936,0,0,35936,,,,, diff --git a/benchmarks.md b/benchmarks.md index afe0ae3a..f5439c7d 100644 --- a/benchmarks.md +++ b/benchmarks.md @@ -54,21 +54,10 @@ | cross-sha3-r-sdpg-1-small (10 executions) | ref | AVG: 290,135
MIN: 287,741
MAX: 297,757 | AVG: 102,853,622
MIN: 102,847,774
MAX: 102,861,948 | AVG: 75,137,510
MIN: 75,126,803
MAX: 75,159,685 | | cross-sha3-r-sdpg-3-fast (10 executions) | ref | AVG: 627,948
MIN: 625,525
MAX: 637,639 | AVG: 43,573,841
MIN: 43,565,461
MAX: 43,582,933 | AVG: 27,513,830
MIN: 27,493,024
MAX: 27,525,746 | | cross-sha3-r-sdpg-5-fast (10 executions) | ref | AVG: 1,146,280
MIN: 1,142,409
MAX: 1,153,794 | AVG: 93,557,878
MIN: 93,547,167
MAX: 93,566,329 | AVG: 59,948,216
MIN: 59,857,434
MAX: 60,043,852 | -| falcon-1024 (10 executions) | clean | AVG: 602,066,436
MIN: 377,135,260
MAX: 1,488,065,363 | AVG: 136,241,759
MIN: 136,017,549
MAX: 136,556,585 | AVG: 1,678,109
MIN: 1,677,732
MAX: 1,678,566 | -| falcon-1024 (10 executions) | m4-ct | AVG: 408,725,773
MIN: 314,885,208
MAX: 712,370,124 | AVG: 87,706,019
MIN: 87,549,942
MAX: 87,839,508 | AVG: 990,541
MIN: 984,448
MAX: 997,160 | -| falcon-1024 (10 executions) | opt-ct | AVG: 448,194,494
MIN: 301,446,952
MAX: 784,390,745 | AVG: 87,699,336
MIN: 87,550,679
MAX: 87,857,833 | AVG: 992,822
MIN: 983,184
MAX: 998,271 | -| falcon-1024 (10 executions) | opt-leaktime | AVG: 371,539,477
MIN: 261,831,977
MAX: 576,613,448 | AVG: 80,134,413
MIN: 79,844,667
MAX: 80,338,608 | AVG: 992,815
MIN: 982,774
MAX: 998,600 | -| falcon-1024-tree (10 executions) | opt-ct | AVG: 469,168,139
MIN: 341,160,847
MAX: 733,947,155 | AVG: 39,197,559
MIN: 39,095,597
MAX: 39,392,055 | AVG: 995,190
MIN: 984,826
MAX: 998,305 | -| falcon-1024-tree (10 executions) | opt-leaktime | AVG: 418,213,501
MIN: 284,879,287
MAX: 699,555,143 | AVG: 42,181,577
MIN: 41,844,047
MAX: 42,456,098 | AVG: 991,791
MIN: 983,935
MAX: 997,742 | -| falcon-512 (10 executions) | clean | AVG: 229,742,458
MIN: 134,930,383
MAX: 358,460,785 | AVG: 62,255,726
MIN: 62,124,149
MAX: 62,424,751 | AVG: 834,970
MIN: 834,402
MAX: 835,533 | -| falcon-512 (10 executions) | m4-ct | AVG: 146,357,328
MIN: 106,015,844
MAX: 250,638,532 | AVG: 40,191,597
MIN: 40,123,901
MAX: 40,381,630 | AVG: 482,280
MIN: 472,137
MAX: 485,160 | -| falcon-512 (10 executions) | opt-ct | AVG: 168,942,163
MIN: 106,015,882
MAX: 258,726,842 | AVG: 40,136,012
MIN: 40,046,972
MAX: 40,195,851 | AVG: 481,102
MIN: 472,809
MAX: 485,947 | -| falcon-512 (10 executions) | opt-leaktime | AVG: 130,638,983
MIN: 94,352,160
MAX: 240,934,147 | AVG: 37,196,341
MIN: 36,969,717
MAX: 37,564,986 | AVG: 476,152
MIN: 471,514
MAX: 484,487 | -| falcon-512-tree (10 executions) | m4-ct | AVG: 187,840,863
MIN: 121,618,909
MAX: 531,189,026 | AVG: 18,199,972
MIN: 18,111,179
MAX: 18,297,541 | AVG: 479,819
MIN: 472,890
MAX: 485,685 | -| falcon-512-tree (10 executions) | opt-ct | AVG: 179,501,018
MIN: 121,618,960
MAX: 347,996,956 | AVG: 18,222,471
MIN: 18,064,774
MAX: 18,329,860 | AVG: 479,635
MIN: 472,057
MAX: 484,767 | -| falcon-512-tree (10 executions) | opt-leaktime | AVG: 203,618,838
MIN: 106,760,540
MAX: 425,495,750 | AVG: 20,110,699
MIN: 19,752,157
MAX: 20,375,122 | AVG: 480,119
MIN: 472,263
MAX: 485,743 | -| falcon-padded-1024 (10 executions) | clean | AVG: 464,867,653
MIN: 351,942,875
MAX: 908,060,882 | AVG: 136,157,961
MIN: 135,988,344
MAX: 136,430,038 | AVG: 1,677,719
MIN: 1,677,506
MAX: 1,677,932 | -| falcon-padded-512 (10 executions) | clean | AVG: 241,548,154
MIN: 164,862,595
MAX: 348,699,388 | AVG: 62,231,774
MIN: 62,096,573
MAX: 62,365,088 | AVG: 834,766
MIN: 834,480
MAX: 834,957 | +| fndsa_provisional-1024 (10 executions) | m4f | AVG: 308,608,613
MIN: 195,536,229
MAX: 763,483,542 | AVG: 48,321,135
MIN: 48,158,227
MAX: 48,398,718 | AVG: 793,856
MIN: 782,604
MAX: 799,182 | +| fndsa_provisional-1024 (10 executions) | ref | AVG: 274,928,016
MIN: 217,869,128
MAX: 448,029,028 | AVG: 107,512,779
MIN: 107,106,716
MAX: 107,788,566 | AVG: 1,461,795
MIN: 1,444,739
MAX: 1,469,817 | +| fndsa_provisional-512 (10 executions) | m4f | AVG: 67,693,338
MIN: 57,825,106
MAX: 81,542,705 | AVG: 22,469,685
MIN: 22,280,159
MAX: 22,594,542 | AVG: 396,949
MIN: 390,368
MAX: 406,553 | +| fndsa_provisional-512 (10 executions) | ref | AVG: 85,699,591
MIN: 64,822,516
MAX: 132,207,505 | AVG: 49,522,949
MIN: 49,325,465
MAX: 49,631,598 | AVG: 731,387
MIN: 714,301
MAX: 738,870 | | haetae2 (100 executions) | m4f | AVG: 6,743,278
MIN: 1,555,292
MAX: 25,393,506 | AVG: 21,993,963
MIN: 4,721,290
MAX: 86,765,689 | AVG: 918,459
MIN: 918,244
MAX: 918,668 | | haetae2 (100 executions) | ref | AVG: 9,363,639
MIN: 1,716,264
MAX: 41,895,014 | AVG: 31,631,089
MIN: 6,247,382
MAX: 216,853,925 | AVG: 1,104,080
MIN: 1,103,874
MAX: 1,104,329 | | haetae3 (100 executions) | m4f | AVG: 12,925,388
MIN: 2,752,846
MAX: 52,240,529 | AVG: 30,891,994
MIN: 7,467,529
MAX: 160,522,018 | AVG: 1,760,745
MIN: 1,760,408
MAX: 1,761,081 | @@ -209,21 +198,10 @@ | cross-sha3-r-sdpg-1-small | ref | 2,328 | 466,400 | 245,512 | | cross-sha3-r-sdpg-3-fast | ref | 4,032 | 205,080 | 108,236 | | cross-sha3-r-sdpg-5-fast | ref | 6,824 | 398,600 | 213,436 | -| falcon-1024 | clean | 35,076 | 84,604 | 8,776 | -| falcon-1024 | m4-ct | 1,156 | 2,508 | 376 | -| falcon-1024 | opt-ct | 1,204 | 2,508 | 376 | -| falcon-1024 | opt-leaktime | 1,252 | 2,580 | 444 | -| falcon-1024-tree | opt-ct | 1,148 | 2,884 | 376 | -| falcon-1024-tree | opt-leaktime | 1,196 | 2,988 | 376 | -| falcon-512 | clean | 18,180 | 43,548 | 4,680 | -| falcon-512 | m4-ct | 1,148 | 2,428 | 376 | -| falcon-512 | opt-ct | 1,244 | 2,428 | 376 | -| falcon-512 | opt-leaktime | 1,148 | 2,492 | 376 | -| falcon-512-tree | m4-ct | 1,172 | 2,636 | 376 | -| falcon-512-tree | opt-ct | 1,156 | 2,636 | 376 | -| falcon-512-tree | opt-leaktime | 1,196 | 2,828 | 376 | -| falcon-padded-1024 | clean | 34,988 | 84,596 | 8,776 | -| falcon-padded-512 | clean | 18,092 | 43,540 | 4,680 | +| fndsa_provisional-1024 | m4f | 27,772 | 81,992 | 5,024 | +| fndsa_provisional-1024 | ref | 27,676 | 82,276 | 5,308 | +| fndsa_provisional-512 | m4f | 14,348 | 41,952 | 2,976 | +| fndsa_provisional-512 | ref | 14,380 | 42,124 | 3,260 | | haetae2 | m4f | 19,756 | 55,568 | 23,296 | | haetae2 | ref | 26,092 | 54,444 | 29,696 | | haetae3 | m4f | 29,596 | 83,420 | 31,784 | @@ -364,21 +342,10 @@ | cross-sha3-r-sdpg-1-small | ref | 71.8% | 74.7% | 78.4% | | cross-sha3-r-sdpg-3-fast | ref | 71.7% | 68.2% | 68.7% | | cross-sha3-r-sdpg-5-fast | ref | 71.1% | 66.1% | 66.8% | -| falcon-1024 | clean | 8.9% | 0.3% | 23.7% | -| falcon-1024 | m4-ct | 8.6% | 0.4% | 32.2% | -| falcon-1024 | opt-ct | 9.8% | 0.4% | 32.2% | -| falcon-1024 | opt-leaktime | 10.9% | 0.5% | 32.2% | -| falcon-1024-tree | opt-ct | 9.2% | 0.9% | 32.3% | -| falcon-1024-tree | opt-leaktime | 10.6% | 0.9% | 32.3% | -| falcon-512 | clean | 7.9% | 0.4% | 26.0% | -| falcon-512 | m4-ct | 13.7% | 0.5% | 33.9% | -| 
falcon-512 | opt-ct | 14.0% | 0.5% | 33.2% | -| falcon-512 | opt-leaktime | 17.3% | 0.5% | 33.6% | -| falcon-512-tree | m4-ct | 12.6% | 1.1% | 33.7% | -| falcon-512-tree | opt-ct | 14.6% | 1.1% | 34.2% | -| falcon-512-tree | opt-leaktime | 20.5% | 1.0% | 34.3% | -| falcon-padded-1024 | clean | 7.3% | 0.3% | 23.7% | -| falcon-padded-512 | clean | 16.0% | 0.4% | 26.0% | +| fndsa_provisional-1024 | m4f | 0.0% | 0.0% | 0.0% | +| fndsa_provisional-1024 | ref | 0.0% | 0.0% | 0.0% | +| fndsa_provisional-512 | m4f | 0.0% | 0.0% | 0.0% | +| fndsa_provisional-512 | ref | 0.0% | 0.0% | 0.0% | | haetae2 | m4f | 12.4% | 56.7% | 54.1% | | haetae2 | ref | 10.6% | 42.4% | 45.1% | | haetae3 | m4f | 14.6% | 56.6% | 57.1% | @@ -519,21 +486,10 @@ | cross-sha3-r-sdpg-1-small | ref | 18,846 | 0 | 208 | 19,054 | | cross-sha3-r-sdpg-3-fast | ref | 19,689 | 0 | 208 | 19,897 | | cross-sha3-r-sdpg-5-fast | ref | 18,593 | 0 | 208 | 18,801 | -| falcon-1024 | clean | 82,703 | 0 | 0 | 82,703 | -| falcon-1024 | m4-ct | 81,825 | 0 | 79,872 | 161,697 | -| falcon-1024 | opt-ct | 81,825 | 0 | 79,872 | 161,697 | -| falcon-1024 | opt-leaktime | 75,429 | 0 | 79,872 | 155,301 | -| falcon-1024-tree | opt-ct | 81,569 | 0 | 55,296 | 136,865 | -| falcon-1024-tree | opt-leaktime | 75,173 | 0 | 55,296 | 130,469 | -| falcon-512 | clean | 82,663 | 0 | 0 | 82,663 | -| falcon-512 | m4-ct | 81,825 | 0 | 39,936 | 121,761 | -| falcon-512 | opt-ct | 81,825 | 0 | 39,936 | 121,761 | -| falcon-512 | opt-leaktime | 75,429 | 0 | 39,936 | 115,365 | -| falcon-512-tree | m4-ct | 81,569 | 0 | 27,648 | 109,217 | -| falcon-512-tree | opt-ct | 81,569 | 0 | 27,648 | 109,217 | -| falcon-512-tree | opt-leaktime | 75,173 | 0 | 27,648 | 102,821 | -| falcon-padded-1024 | clean | 82,643 | 0 | 0 | 82,643 | -| falcon-padded-512 | clean | 82,599 | 0 | 0 | 82,599 | +| fndsa_provisional-1024 | m4f | 103,801 | 0 | 0 | 103,801 | +| fndsa_provisional-1024 | ref | 103,089 | 0 | 0 | 103,089 | +| fndsa_provisional-512 | m4f | 103,789 | 0 | 0 | 
103,789 | +| fndsa_provisional-512 | ref | 103,077 | 0 | 0 | 103,077 | | haetae2 | m4f | 35,708 | 0 | 0 | 35,708 | | haetae2 | ref | 25,568 | 0 | 0 | 25,568 | | haetae3 | m4f | 35,936 | 0 | 0 | 35,936 | diff --git a/crypto_sign/falcon-1024/m4-ct/README.txt b/crypto_sign/falcon-1024/m4-ct/README.txt deleted file mode 100644 index 7bedf7f1..00000000 --- a/crypto_sign/falcon-1024/m4-ct/README.txt +++ /dev/null @@ -1,137 +0,0 @@ -Falcon implementation for PQM4 (or even mupq in general). - - -There are multiple variants. Each variant is selected with the choice of -api.h (four choices: api512dyn.h, api512tree.h, api1024dyn.h, -api1024tree.h), and additional compile-time macro that are documented in -config.h and can be set either in config.h, or through command-line -flags passed to the C compiler. - -Choice of api.h: - - api512dyn.h - "Normal" Falcon-512. Private key is reasonably compact. The - Falcon LDL tree is internally recomputed for each signature. - - api512tree.h - Falcon-512 is key expansion. The Falcon LDL tree is computed - as part of the keygen, and returned as private key. This - speeds up signature generation, but also greatly enlarges - the private key size. - - api1024dyn.h - "Normal" Falcon-1024. - - api1024tree.h - Falcon-1024 with key expansion. - -Compile-time options (config.h): - - FALCON_FPEMU - Set to 1 to enable use of the internal constant-time emulation - of floating-point operations. - - FALCON_FPNATIVE - Set to 1 to use the native 'double' type and floating-point - operations. On architectures that lack a FPU, this will use the - compiler-provided floating-point emulation routines, which are - usually not constant-time (and sometimes return values which - do not follow IEEE-754 rounding rules). - - FALCON_ASM_CORTEXM4 - Set to 1 to use the M4 assembly routine for the constant-time - emulation of floating-point operations. These are faster than - the generic routines in C activated by FALCON_FPEMU. 
- -There is some internal autodetection that tries to select the right -values automatically, but it's safer to explicitly select things: - - To use the native 'double' type: - -DFALCON_FPNATIVE=1 - - To use the generic FP emulation code: - -DFALCON_FPEMU=1 -DFALCON_ASM_CORTEXM4=0 - - To use the M4 assembly code for FP emulation: - -DFALCON_FPEMU=1 -DFALCON_ASM_CORTEXM4=1 - -The code relying on the native 'double' type requires an implementation -that follows IEEE-754 rules with a 64-bit type. It works on 64-bit x86 -and PowerPC / POWER systems. On 32-bit x86, it tends to fail because the -80387 FPU is used with more precision; on such a system, use -'-msse2 -mfpmath=sse' to force use of the SSE2 unit (this might be the -default on some systems, e.g. Darwin / macOS). - - -IMPORTANT NOTES -=============== - - * The PQM4 API is implemented in pqm4.c. Since the M4 stack is usually - small (usual default is 4 kB), temporary buffers are statically - allocated. This implies that the crypto_sign_keypair(), crypto_sign() - and crypto_sign_open() functions are not thread-safe or reentrant. - Also, the static allocation is "forever". - - See the comments for the 'tmp' variable in pqm4.c; this gives the - relevant sizes. - - * When using expanded keys, the private key contains 64-bit values - (floating-point, i.e. 'double' or 'uint64_t' depending on the kind - of floating-point emulation that is used). On many systems, this - implies some alignment requirements. I.e. crypto_sign_keypair() and - crypto_sign() then require the 'sk' pointer to be suitably aligned. - On an ARM Cortex M4, 32-bit alignment is required (while the basic - RAM access opcodes tolerate unaligned accesses, the 'ldm' and 'stm' - opcodes need 32-bit aligned pointers). - - * When using the native 'double' type, the code has a dependency on - the sqrt() function. 
On x86, the relevant SSE2 opcode is inlined, - but the library function is still (potentially) invoked in case the - operand is negative, so that proper error management is performed. - This case does not happen in Falcon, but the library function is - still referenced, and explicitly linking with '-lm' may be - necessary. - - * When using the native 'double' type, do _NOT_ enable -ffast-math. - The internal rounding function relies on the usual trick: - when x >= 0, round(x) = (x + 2**52) - 2**52 - - This trick works only as long as each addition is rounded as per - the IEEE-754 rules to the exact precision of the 64-bit type. - When -ffast-math is enabled, the compiler may assume commutativity - and "optimize" that expression into 'round(x) = x', which does not - work at all. - - -TESTS -===== - -In the 'tests/' directory is a generator for known-answer tests, and the -expected file. The code comes from the NIST, but was modified to avoid a -dependency on OpenSSL. When compiling the C source file against the -selected Falcon implementation, an executable is produced, that, when -executed, generates an '*.req' and an '*.rsp' files. The .req file is -redundant (the .rsp file contains all the information, and some more). - -The expected .rsp files are provided as: - KAT512dyn.rsp Falcon-512, no expanded key - KAT512tree.rsp Falcon-512, with expanded key - KAT1024dyn.rsp Falcon-1024, no expanded key - KAT1024tree.rsp Falcon-1024, with expanded key - - -Normally, all computations are exact and the files are exactly -reproducible. However, some discrepancies may occur with the '*tree' -files in the following cases: - - - On big-endian architectures, the bytes in sk[] will be in a - different order. This is a side effect of putting the raw bytes - of the expanded key in sk[] (this could be fixed with some - reencoding pass, but this was not implemented yet). - - - If a non-exact IEEE-754 implementation is used, some of the - low bits of the values may be changed. 
This may happen if the - underlying implementation is not strictly faithful to rounding. - -As long as only the 'sk' lines are changed, then the public keys -and signature values are unimpacted. diff --git a/crypto_sign/falcon-1024/m4-ct/api.h b/crypto_sign/falcon-1024/m4-ct/api.h deleted file mode 100644 index e22e11f1..00000000 --- a/crypto_sign/falcon-1024/m4-ct/api.h +++ /dev/null @@ -1,17 +0,0 @@ -#include - -#define CRYPTO_SECRETKEYBYTES 2305 -#define CRYPTO_PUBLICKEYBYTES 1793 -#define CRYPTO_BYTES 1330 - -#define CRYPTO_ALGNAME "Falcon-1024" - -int crypto_sign_keypair(unsigned char *pk, unsigned char *sk); - -int crypto_sign(unsigned char *sm, size_t *smlen, - const unsigned char *m, size_t mlen, - const unsigned char *sk); - -int crypto_sign_open(unsigned char *m, size_t *mlen, - const unsigned char *sm, size_t smlen, - const unsigned char *pk); diff --git a/crypto_sign/falcon-1024/m4-ct/codec.c b/crypto_sign/falcon-1024/m4-ct/codec.c deleted file mode 100644 index 5bd61424..00000000 --- a/crypto_sign/falcon-1024/m4-ct/codec.c +++ /dev/null @@ -1,559 +0,0 @@ -/* - * Encoding/decoding of keys and signatures. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include "inner.h" - -/* see inner.h */ -size_t -Zf(modq_encode)( - void *out, size_t max_out_len, - const uint16_t *x, unsigned logn) -{ - size_t n, out_len, u; - uint8_t *buf; - uint32_t acc; - int acc_len; - - n = (size_t)1 << logn; - for (u = 0; u < n; u ++) { - if (x[u] >= 12289) { - return 0; - } - } - out_len = ((n * 14) + 7) >> 3; - if (out == NULL) { - return out_len; - } - if (out_len > max_out_len) { - return 0; - } - buf = out; - acc = 0; - acc_len = 0; - for (u = 0; u < n; u ++) { - acc = (acc << 14) | x[u]; - acc_len += 14; - while (acc_len >= 8) { - acc_len -= 8; - *buf ++ = (uint8_t)(acc >> acc_len); - } - } - if (acc_len > 0) { - *buf = (uint8_t)(acc << (8 - acc_len)); - } - return out_len; -} - -/* see inner.h */ -size_t -Zf(modq_decode)( - uint16_t *x, unsigned logn, - const void *in, size_t max_in_len) -{ - size_t n, in_len, u; - const uint8_t *buf; - uint32_t acc; - int acc_len; - - n = (size_t)1 << logn; - in_len = ((n * 14) + 7) >> 3; - if (in_len > max_in_len) { - return 0; - } - buf = in; - acc = 0; - acc_len = 0; - u = 0; - while (u < n) { - acc = (acc << 8) | (*buf ++); - acc_len += 8; - if (acc_len >= 14) { - unsigned w; - - acc_len -= 14; - w = (acc >> acc_len) & 0x3FFF; - if (w >= 12289) { - return 0; - } - x[u ++] = (uint16_t)w; - } - } - if ((acc & (((uint32_t)1 << acc_len) - 1)) != 0) { - return 0; - } - return in_len; -} - -/* see 
inner.h */ -size_t -Zf(trim_i16_encode)( - void *out, size_t max_out_len, - const int16_t *x, unsigned logn, unsigned bits) -{ - size_t n, u, out_len; - int minv, maxv; - uint8_t *buf; - uint32_t acc, mask; - unsigned acc_len; - - n = (size_t)1 << logn; - maxv = (1 << (bits - 1)) - 1; - minv = -maxv; - for (u = 0; u < n; u ++) { - if (x[u] < minv || x[u] > maxv) { - return 0; - } - } - out_len = ((n * bits) + 7) >> 3; - if (out == NULL) { - return out_len; - } - if (out_len > max_out_len) { - return 0; - } - buf = out; - acc = 0; - acc_len = 0; - mask = ((uint32_t)1 << bits) - 1; - for (u = 0; u < n; u ++) { - acc = (acc << bits) | ((uint16_t)x[u] & mask); - acc_len += bits; - while (acc_len >= 8) { - acc_len -= 8; - *buf ++ = (uint8_t)(acc >> acc_len); - } - } - if (acc_len > 0) { - *buf ++ = (uint8_t)(acc << (8 - acc_len)); - } - return out_len; -} - -/* see inner.h */ -size_t -Zf(trim_i16_decode)( - int16_t *x, unsigned logn, unsigned bits, - const void *in, size_t max_in_len) -{ - size_t n, in_len; - const uint8_t *buf; - size_t u; - uint32_t acc, mask1, mask2; - unsigned acc_len; - - n = (size_t)1 << logn; - in_len = ((n * bits) + 7) >> 3; - if (in_len > max_in_len) { - return 0; - } - buf = in; - u = 0; - acc = 0; - acc_len = 0; - mask1 = ((uint32_t)1 << bits) - 1; - mask2 = (uint32_t)1 << (bits - 1); - while (u < n) { - acc = (acc << 8) | *buf ++; - acc_len += 8; - while (acc_len >= bits && u < n) { - uint32_t w; - - acc_len -= bits; - w = (acc >> acc_len) & mask1; - w |= -(w & mask2); - if (w == -mask2) { - /* - * The -2^(bits-1) value is forbidden. - */ - return 0; - } - w |= -(w & mask2); - x[u ++] = (int16_t)*(int32_t *)&w; - } - } - if ((acc & (((uint32_t)1 << acc_len) - 1)) != 0) { - /* - * Extra bits in the last byte must be zero. 
- */ - return 0; - } - return in_len; -} - -/* see inner.h */ -size_t -Zf(trim_i8_encode)( - void *out, size_t max_out_len, - const int8_t *x, unsigned logn, unsigned bits) -{ - size_t n, u, out_len; - int minv, maxv; - uint8_t *buf; - uint32_t acc, mask; - unsigned acc_len; - - n = (size_t)1 << logn; - maxv = (1 << (bits - 1)) - 1; - minv = -maxv; - for (u = 0; u < n; u ++) { - if (x[u] < minv || x[u] > maxv) { - return 0; - } - } - out_len = ((n * bits) + 7) >> 3; - if (out == NULL) { - return out_len; - } - if (out_len > max_out_len) { - return 0; - } - buf = out; - acc = 0; - acc_len = 0; - mask = ((uint32_t)1 << bits) - 1; - for (u = 0; u < n; u ++) { - acc = (acc << bits) | ((uint8_t)x[u] & mask); - acc_len += bits; - while (acc_len >= 8) { - acc_len -= 8; - *buf ++ = (uint8_t)(acc >> acc_len); - } - } - if (acc_len > 0) { - *buf ++ = (uint8_t)(acc << (8 - acc_len)); - } - return out_len; -} - -/* see inner.h */ -size_t -Zf(trim_i8_decode)( - int8_t *x, unsigned logn, unsigned bits, - const void *in, size_t max_in_len) -{ - size_t n, in_len; - const uint8_t *buf; - size_t u; - uint32_t acc, mask1, mask2; - unsigned acc_len; - - n = (size_t)1 << logn; - in_len = ((n * bits) + 7) >> 3; - if (in_len > max_in_len) { - return 0; - } - buf = in; - u = 0; - acc = 0; - acc_len = 0; - mask1 = ((uint32_t)1 << bits) - 1; - mask2 = (uint32_t)1 << (bits - 1); - while (u < n) { - acc = (acc << 8) | *buf ++; - acc_len += 8; - while (acc_len >= bits && u < n) { - uint32_t w; - - acc_len -= bits; - w = (acc >> acc_len) & mask1; - w |= -(w & mask2); - if (w == -mask2) { - /* - * The -2^(bits-1) value is forbidden. - */ - return 0; - } - x[u ++] = (int8_t)*(int32_t *)&w; - } - } - if ((acc & (((uint32_t)1 << acc_len) - 1)) != 0) { - /* - * Extra bits in the last byte must be zero. 
- */ - return 0; - } - return in_len; -} - -/* see inner.h */ -size_t -Zf(comp_encode)( - void *out, size_t max_out_len, - const int16_t *x, unsigned logn) -{ - uint8_t *buf; - size_t n, u, v; - uint32_t acc; - unsigned acc_len; - - n = (size_t)1 << logn; - buf = out; - - /* - * Make sure that all values are within the -2047..+2047 range. - */ - for (u = 0; u < n; u ++) { - if (x[u] < -2047 || x[u] > +2047) { - return 0; - } - } - - acc = 0; - acc_len = 0; - v = 0; - for (u = 0; u < n; u ++) { - int t; - unsigned w; - - /* - * Get sign and absolute value of next integer; push the - * sign bit. - */ - acc <<= 1; - t = x[u]; - if (t < 0) { - t = -t; - acc |= 1; - } - w = (unsigned)t; - - /* - * Push the low 7 bits of the absolute value. - */ - acc <<= 7; - acc |= w & 127u; - w >>= 7; - - /* - * We pushed exactly 8 bits. - */ - acc_len += 8; - - /* - * Push as many zeros as necessary, then a one. Since the - * absolute value is at most 2047, w can only range up to - * 15 at this point, thus we will add at most 16 bits - * here. With the 8 bits above and possibly up to 7 bits - * from previous iterations, we may go up to 31 bits, which - * will fit in the accumulator, which is an uint32_t. - */ - acc <<= (w + 1); - acc |= 1; - acc_len += w + 1; - - /* - * Produce all full bytes. - */ - while (acc_len >= 8) { - acc_len -= 8; - if (buf != NULL) { - if (v >= max_out_len) { - return 0; - } - buf[v] = (uint8_t)(acc >> acc_len); - } - v ++; - } - } - - /* - * Flush remaining bits (if any). 
- */ - if (acc_len > 0) { - if (buf != NULL) { - if (v >= max_out_len) { - return 0; - } - buf[v] = (uint8_t)(acc << (8 - acc_len)); - } - v ++; - } - - return v; -} - -/* see inner.h */ -size_t -Zf(comp_decode)( - int16_t *x, unsigned logn, - const void *in, size_t max_in_len) -{ - const uint8_t *buf; - size_t n, u, v; - uint32_t acc; - unsigned acc_len; - - n = (size_t)1 << logn; - buf = in; - acc = 0; - acc_len = 0; - v = 0; - for (u = 0; u < n; u ++) { - unsigned b, s, m; - - /* - * Get next eight bits: sign and low seven bits of the - * absolute value. - */ - if (v >= max_in_len) { - return 0; - } - acc = (acc << 8) | (uint32_t)buf[v ++]; - b = acc >> acc_len; - s = b & 128; - m = b & 127; - - /* - * Get next bits until a 1 is reached. - */ - for (;;) { - if (acc_len == 0) { - if (v >= max_in_len) { - return 0; - } - acc = (acc << 8) | (uint32_t)buf[v ++]; - acc_len = 8; - } - acc_len --; - if (((acc >> acc_len) & 1) != 0) { - break; - } - m += 128; - if (m > 2047) { - return 0; - } - } - x[u] = (int16_t)(s ? -(int)m : (int)m); - } - return v; -} - -/* - * Key elements and signatures are polynomials with small integer - * coefficients. Here are some statistics gathered over many - * generated key pairs (10000 or more for each degree): - * - * log(n) n max(f,g) std(f,g) max(F,G) std(F,G) - * 1 2 129 56.31 143 60.02 - * 2 4 123 40.93 160 46.52 - * 3 8 97 28.97 159 38.01 - * 4 16 100 21.48 154 32.50 - * 5 32 71 15.41 151 29.36 - * 6 64 59 11.07 138 27.77 - * 7 128 39 7.91 144 27.00 - * 8 256 32 5.63 148 26.61 - * 9 512 22 4.00 137 26.46 - * 10 1024 15 2.84 146 26.41 - * - * We want a compact storage format for private key, and, as part of - * key generation, we are allowed to reject some keys which would - * otherwise be fine (this does not induce any noticeable vulnerability - * as long as we reject only a small proportion of possible keys). 
- * Hence, we enforce at key generation time maximum values for the - * elements of f, g, F and G, so that their encoding can be expressed - * in fixed-width values. Limits have been chosen so that generated - * keys are almost always within bounds, thus not impacting neither - * security or performance. - * - * IMPORTANT: the code assumes that all coefficients of f, g, F and G - * ultimately fit in the -127..+127 range. Thus, none of the elements - * of max_fg_bits[] and max_FG_bits[] shall be greater than 8. - */ - -const uint8_t Zf(max_fg_bits)[] = { - 0, /* unused */ - 8, - 8, - 8, - 8, - 8, - 7, - 7, - 6, - 6, - 5 -}; - -const uint8_t Zf(max_FG_bits)[] = { - 0, /* unused */ - 8, - 8, - 8, - 8, - 8, - 8, - 8, - 8, - 8, - 8 -}; - -/* - * When generating a new key pair, we can always reject keys which - * feature an abnormally large coefficient. This can also be done for - * signatures, albeit with some care: in case the signature process is - * used in a derandomized setup (explicitly seeded with the message and - * private key), we have to follow the specification faithfully, and the - * specification only enforces a limit on the L2 norm of the signature - * vector. The limit on the L2 norm implies that the absolute value of - * a coefficient of the signature cannot be more than the following: - * - * log(n) n max sig coeff (theoretical) - * 1 2 412 - * 2 4 583 - * 3 8 824 - * 4 16 1166 - * 5 32 1649 - * 6 64 2332 - * 7 128 3299 - * 8 256 4665 - * 9 512 6598 - * 10 1024 9331 - * - * However, the largest observed signature coefficients during our - * experiments was 1077 (in absolute value), hence we can assume that, - * with overwhelming probability, signature coefficients will fit - * in -2047..2047, i.e. 12 bits. 
- */ - -const uint8_t Zf(max_sig_bits)[] = { - 0, /* unused */ - 10, - 11, - 11, - 12, - 12, - 12, - 12, - 12, - 12, - 12 -}; diff --git a/crypto_sign/falcon-1024/m4-ct/common.c b/crypto_sign/falcon-1024/m4-ct/common.c deleted file mode 100644 index ef30028b..00000000 --- a/crypto_sign/falcon-1024/m4-ct/common.c +++ /dev/null @@ -1,297 +0,0 @@ -/* - * Support functions for signatures (hash-to-point, norm). - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include "inner.h" - -/* see inner.h */ -void -Zf(hash_to_point_vartime)( - inner_shake256_context *sc, - uint16_t *x, unsigned logn) -{ - /* - * This is the straightforward per-the-spec implementation. 
It - * is not constant-time, thus it might reveal information on the - * plaintext (at least, enough to check the plaintext against a - * list of potential plaintexts) in a scenario where the - * attacker does not have access to the signature value or to - * the public key, but knows the nonce (without knowledge of the - * nonce, the hashed output cannot be matched against potential - * plaintexts). - */ - size_t n; - - n = (size_t)1 << logn; - while (n > 0) { - uint8_t buf[2]; - uint32_t w; - - inner_shake256_extract(sc, (void *)buf, sizeof buf); - w = ((unsigned)buf[0] << 8) | (unsigned)buf[1]; - if (w < 61445) { - while (w >= 12289) { - w -= 12289; - } - *x ++ = (uint16_t)w; - n --; - } - } -} - -/* see inner.h */ -void -Zf(hash_to_point_ct)( - inner_shake256_context *sc, - uint16_t *x, unsigned logn, uint8_t *tmp) -{ - /* - * Each 16-bit sample is a value in 0..65535. The value is - * kept if it falls in 0..61444 (because 61445 = 5*12289) - * and rejected otherwise; thus, each sample has probability - * about 0.93758 of being selected. - * - * We want to oversample enough to be sure that we will - * have enough values with probability at least 1 - 2^(-256). - * Depending on degree N, this leads to the following - * required oversampling: - * - * logn n oversampling - * 1 2 65 - * 2 4 67 - * 3 8 71 - * 4 16 77 - * 5 32 86 - * 6 64 100 - * 7 128 122 - * 8 256 154 - * 9 512 205 - * 10 1024 287 - * - * If logn >= 7, then the provided temporary buffer is large - * enough. Otherwise, we use a stack buffer of 63 entries - * (i.e. 126 bytes) for the values that do not fit in tmp[]. - */ - - static const uint16_t overtab[] = { - 0, /* unused */ - 65, - 67, - 71, - 77, - 86, - 100, - 122, - 154, - 205, - 287 - }; - - unsigned n, n2, u, m, p, over; - uint16_t *tt1, tt2[63]; - - /* - * We first generate m 16-bit value. Values 0..n-1 go to x[]. - * Values n..2*n-1 go to tt1[]. Values 2*n and later go to tt2[]. 
- * We also reduce modulo q the values; rejected values are set - * to 0xFFFF. - */ - n = 1U << logn; - n2 = n << 1; - over = overtab[logn]; - m = n + over; - tt1 = (uint16_t *)tmp; - for (u = 0; u < m; u ++) { - uint8_t buf[2]; - uint32_t w, wr; - - inner_shake256_extract(sc, buf, sizeof buf); - w = ((uint32_t)buf[0] << 8) | (uint32_t)buf[1]; - wr = w - ((uint32_t)24578 & (((w - 24578) >> 31) - 1)); - wr = wr - ((uint32_t)24578 & (((wr - 24578) >> 31) - 1)); - wr = wr - ((uint32_t)12289 & (((wr - 12289) >> 31) - 1)); - wr |= ((w - 61445) >> 31) - 1; - if (u < n) { - x[u] = (uint16_t)wr; - } else if (u < n2) { - tt1[u - n] = (uint16_t)wr; - } else { - tt2[u - n2] = (uint16_t)wr; - } - } - - /* - * Now we must "squeeze out" the invalid values. We do this in - * a logarithmic sequence of passes; each pass computes where a - * value should go, and moves it down by 'p' slots if necessary, - * where 'p' uses an increasing powers-of-two scale. It can be - * shown that in all cases where the loop decides that a value - * has to be moved down by p slots, the destination slot is - * "free" (i.e. contains an invalid value). - */ - for (p = 1; p <= over; p <<= 1) { - unsigned v; - - /* - * In the loop below: - * - * - v contains the index of the final destination of - * the value; it is recomputed dynamically based on - * whether values are valid or not. - * - * - u is the index of the value we consider ("source"); - * its address is s. - * - * - The loop may swap the value with the one at index - * u-p. The address of the swap destination is d. - */ - v = 0; - for (u = 0; u < m; u ++) { - uint16_t *s, *d; - unsigned j, sv, dv, mk; - - if (u < n) { - s = &x[u]; - } else if (u < n2) { - s = &tt1[u - n]; - } else { - s = &tt2[u - n2]; - } - sv = *s; - - /* - * The value in sv should ultimately go to - * address v, i.e. jump back by u-v slots. - */ - j = u - v; - - /* - * We increment v for the next iteration, but - * only if the source value is valid. 
The mask - * 'mk' is -1 if the value is valid, 0 otherwise, - * so we _subtract_ mk. - */ - mk = (sv >> 15) - 1U; - v -= mk; - - /* - * In this loop we consider jumps by p slots; if - * u < p then there is nothing more to do. - */ - if (u < p) { - continue; - } - - /* - * Destination for the swap: value at address u-p. - */ - if ((u - p) < n) { - d = &x[u - p]; - } else if ((u - p) < n2) { - d = &tt1[(u - p) - n]; - } else { - d = &tt2[(u - p) - n2]; - } - dv = *d; - - /* - * The swap should be performed only if the source - * is valid AND the jump j has its 'p' bit set. - */ - mk &= -(((j & p) + 0x1FF) >> 9); - - *s = (uint16_t)(sv ^ (mk & (sv ^ dv))); - *d = (uint16_t)(dv ^ (mk & (sv ^ dv))); - } - } -} - -/* see inner.h */ -int -Zf(is_short)( - const int16_t *s1, const int16_t *s2, unsigned logn) -{ - /* - * We use the l2-norm. Code below uses only 32-bit operations to - * compute the square of the norm with saturation to 2^32-1 if - * the value exceeds 2^31-1. - */ - size_t n, u; - uint32_t s, ng; - - n = (size_t)1 << logn; - s = 0; - ng = 0; - for (u = 0; u < n; u ++) { - int32_t z; - - z = s1[u]; - s += (uint32_t)(z * z); - ng |= s; - z = s2[u]; - s += (uint32_t)(z * z); - ng |= s; - } - s |= -(ng >> 31); - - /* - * Acceptance bound on the l2-norm is: - * 1.2*1.55*sqrt(q)*sqrt(2*N) - * Value 7085 is floor((1.2^2)*(1.55^2)*2*1024). - */ - return s < (((uint32_t)7085 * (uint32_t)12289) >> (10 - logn)); -} - -/* see inner.h */ -int -Zf(is_short_half)( - uint32_t sqn, const int16_t *s2, unsigned logn) -{ - size_t n, u; - uint32_t ng; - - n = (size_t)1 << logn; - ng = -(sqn >> 31); - for (u = 0; u < n; u ++) { - int32_t z; - - z = s2[u]; - sqn += (uint32_t)(z * z); - ng |= sqn; - } - sqn |= -(ng >> 31); - - /* - * Acceptance bound on the l2-norm is: - * 1.2*1.55*sqrt(q)*sqrt(2*N) - * Value 7085 is floor((1.2^2)*(1.55^2)*2*1024). 
- */ - return sqn < (((uint32_t)7085 * (uint32_t)12289) >> (10 - logn)); -} diff --git a/crypto_sign/falcon-1024/m4-ct/config.h b/crypto_sign/falcon-1024/m4-ct/config.h deleted file mode 100644 index cd78727e..00000000 --- a/crypto_sign/falcon-1024/m4-ct/config.h +++ /dev/null @@ -1,205 +0,0 @@ -/* - * Manual configuration file for the Falcon implementation. Here can - * be set some compilation-time options. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#ifndef FALCON_CONFIG_H__ -#define FALCON_CONFIG_H__ - -/* - * Each option is a macro which should be defined to either 1 or 0. 
- * If any of the options below is left undefined, then a default value - * will be used by the code, possibly using compile-time autodetection - * from compiler-defined macros. - * - * Explicitly setting a parameter can be done by uncommenting/modifying - * its definition below, in this file, or equivalently by setting it as - * a compiler flag. - */ - -/* - * Use the native 'double' C type for floating-point computations. Exact - * reproducibility of all tests requires that type to faithfully follow - * IEEE-754 "round-to-nearest" rules. - * - * Native double support will use the CPU hardware and/or - * compiler-provided functions; the latter is typically NOT - * constant-time, while the former MAY be constant-time, or not. On - * recent x86 CPU in 64-bit mode, SSE2 opcodes are used and they provide - * constant-time operations for all the operations used in Falcon, - * except for some special cases of divisions and square roots, but it - * can be shown that theses cases imply only negligible leak of - * information that cannot be leveraged into a full attack. - * - * If neither FALCON_FPNATIVE nor FALCON_FPEMU is defined, then use of - * the native 'double' C type is the default behaviour unless - * FALCON_ASM_CORTEXM4 is defined to 1, in which case the emulated code - * will be used. - * -#define FALCON_FPNATIVE 1 - */ - -/* - * Use emulated floating-point implementation. - * - * Emulation uses only integer operations with uint32_t and uint64_t - * types. This is constant-time, provided that the underlying platform - * offers constant-time opcodes for the following operations: - * - * - Multiplication of two 32-bit unsigned integers into a 64-bit result. - * - Left-shift or right-shift of a 32-bit unsigned integer by a - * potentially secret shift count in the 0..31 range. - * - * Notably, the ARM Cortex M3 does not fulfill the first condition, - * while the Pentium IV does not fulfill the second. 
- * - * If neither FALCON_FPNATIVE nor FALCON_FPEMU is defined, then use of - * the native 'double' C type is the default behaviour unless - * FALCON_ASM_CORTEXM4 is defined to 1, in which case the emulated code - * will be used. - * -#define FALCON_FPEMU 1 - */ - -/* - * Enable use of assembly for ARM Cortex-M4 CPU. By default, such - * support will be used based on some autodection on the compiler - * version and target architecture. Define this variable to 1 to force - * use of the assembly code, or 0 to disable it regardless of the - * autodetection. - * - * When FALCON_ASM_CORTEXM4 is enabled (whether defined explicitly or - * autodetected), emulated floating-point code will be used, unless - * FALCON_FPNATIVE or FALCON_FPEMU is explicitly set to override the - * choice. Emulated code with ARM assembly is constant-time and provides - * better performance than emulated code with plain C. - * - * The assembly code for the M4 can also work on a Cortex-M3. If the - * compiler is instructed to target the M3 (e.g. '-mcpu=cortex-m3' with - * GCC) then FALCON_ASM_CORTEXM4 won't be autodetected, but it can be - * enabled explicitly. Take care, though, that the M3 multiplication - * opcode (multiplication of two 32-bit unsigned integers with a 64-bit - * result) is NOT constant-time. - * -#define FALCON_ASM_CORTEXM4 1 - */ - -#define FALCON_ASM_CORTEXM4 1 - -/* - * Enable use of AVX2 intrinsics. If enabled, then the code will compile - * only when targeting x86 with a compiler that supports AVX2 intrinsics - * (tested with GCC 7.4.0, Clang 6.0.0, and MSVC 2015, both in 32-bit - * and 64-bit modes), and run only on systems that offer the AVX2 - * opcodes. Some operations leverage AVX2 for better performance. - * -#define FALCON_AVX2 1 - */ - -/* - * Enable use of FMA intrinsics. This setting has any effect only if - * FALCON_AVX2 is also enabled. The FMA intrinsics are normally available - * on any x86 CPU that also has AVX2. 
Note that setting this option will - * slightly modify the values of expanded private keys, but will normally - * not change the values of non-expanded private keys, public keys or - * signatures, for a given keygen/sign seed (non-expanded private keys - * and signatures might theoretically change, but only with low probability, - * less than 2^(-40); produced signatures are still safe and interoperable). - * -#define FALCON_FMA 1 - */ - -/* - * Assert that the platform uses little-endian encoding. If enabled, - * then encoding and decoding of aligned multibyte values will be - * slightly faster (especially for hashing and random number - * generation). If not defined explicitly, then autodetection is - * applied. - * -#define FALCON_LE 1 - */ - -/* - * Assert that the platform tolerates accesses to unaligned multibyte - * values. If enabled, then some operations are slightly faster. Note - * that ARM Cortex M4 do _not_ fully tolerate unaligned accesses; for - * such systems, this option should not be enabled. If not defined - * explicitly, then autodetection is applied. - * -#define FALCON_UNALIGNED 1 - */ - -/* - * Use a PRNG based on ChaCha20 and seeded with SHAKE256, instead of - * SHAKE256 directly, for key pair generation purposes. This speeds up - * key pair generation, especially on platforms where SHAKE256 is - * comparatively slow: on the ARM Cortex M4, average key generation time - * is reduced by 19% with this setting; on a recent x86 Skylake, the - * reduction is smaller (less than 8%). - * - * However, this setting changes the private/public key pair obtained - * from a given seed, thus preventing reproducibility of the - * known-answer tests vectors. For compatibility with existing KAT - * vectors (e.g. in PQClean, pqm4 and NIST implementations), this - * setting is not enabled by default. - * -#define FALCON_KG_CHACHA20 1 - */ - -/* - * Use an explicit OS-provided source of randomness for seeding (for the - * Zf(get_seed)() function implementation). 
Three possible sources are - * defined: - * - * - getentropy() system call - * - /dev/urandom special file - * - CryptGenRandom() function call - * - * More than one source may be enabled, in which case they will be tried - * in the order above, until a success is reached. - * - * By default, sources are enabled at compile-time based on these - * conditions: - * - * - getentropy(): target is one of: Linux with Glibc-2.25+, FreeBSD 12+, - * or OpenBSD. - * - /dev/urandom: target is a Unix-like system (including Linux, - * FreeBSD, NetBSD, OpenBSD, DragonFly, macOS, Android, Solaris, AIX). - * - CryptGenRandom(): target is Windows (Win32 or Win64). - * - * On most small embedded systems, none will be enabled and Zf(get_seed)() - * will always return 0. Applications will need to provide their own seeds. - * -#define FALCON_RAND_GETENTROPY 1 -#define FALCON_RAND_URANDOM 1 -#define FALCON_RAND_WIN32 1 - */ - -#endif diff --git a/crypto_sign/falcon-1024/m4-ct/fft.c b/crypto_sign/falcon-1024/m4-ct/fft.c deleted file mode 100644 index b1904b24..00000000 --- a/crypto_sign/falcon-1024/m4-ct/fft.c +++ /dev/null @@ -1,1412 +0,0 @@ -/* - * FFT code. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include "inner.h" - -/* - * Rules for complex number macros: - * -------------------------------- - * - * Operand order is: destination, source1, source2... - * - * Each operand is a real and an imaginary part. - * - * All overlaps are allowed. - */ - -/* - * Addition of two complex numbers (d = a + b). - */ -#define FPC_ADD(d_re, d_im, a_re, a_im, b_re, b_im) do { \ - fpr fpct_re, fpct_im; \ - fpct_re = fpr_add(a_re, b_re); \ - fpct_im = fpr_add(a_im, b_im); \ - (d_re) = fpct_re; \ - (d_im) = fpct_im; \ - } while (0) - -/* - * Subtraction of two complex numbers (d = a - b). - */ -#define FPC_SUB(d_re, d_im, a_re, a_im, b_re, b_im) do { \ - fpr fpct_re, fpct_im; \ - fpct_re = fpr_sub(a_re, b_re); \ - fpct_im = fpr_sub(a_im, b_im); \ - (d_re) = fpct_re; \ - (d_im) = fpct_im; \ - } while (0) - -/* - * Multplication of two complex numbers (d = a * b). 
- */ -#define FPC_MUL(d_re, d_im, a_re, a_im, b_re, b_im) do { \ - fpr fpct_a_re, fpct_a_im; \ - fpr fpct_b_re, fpct_b_im; \ - fpr fpct_d_re, fpct_d_im; \ - fpct_a_re = (a_re); \ - fpct_a_im = (a_im); \ - fpct_b_re = (b_re); \ - fpct_b_im = (b_im); \ - fpct_d_re = fpr_sub( \ - fpr_mul(fpct_a_re, fpct_b_re), \ - fpr_mul(fpct_a_im, fpct_b_im)); \ - fpct_d_im = fpr_add( \ - fpr_mul(fpct_a_re, fpct_b_im), \ - fpr_mul(fpct_a_im, fpct_b_re)); \ - (d_re) = fpct_d_re; \ - (d_im) = fpct_d_im; \ - } while (0) - -/* - * Squaring of a complex number (d = a * a). - */ -#define FPC_SQR(d_re, d_im, a_re, a_im) do { \ - fpr fpct_a_re, fpct_a_im; \ - fpr fpct_d_re, fpct_d_im; \ - fpct_a_re = (a_re); \ - fpct_a_im = (a_im); \ - fpct_d_re = fpr_sub(fpr_sqr(fpct_a_re), fpr_sqr(fpct_a_im)); \ - fpct_d_im = fpr_double(fpr_mul(fpct_a_re, fpct_a_im)); \ - (d_re) = fpct_d_re; \ - (d_im) = fpct_d_im; \ - } while (0) - -/* - * Inversion of a complex number (d = 1 / a). - */ -#define FPC_INV(d_re, d_im, a_re, a_im) do { \ - fpr fpct_a_re, fpct_a_im; \ - fpr fpct_d_re, fpct_d_im; \ - fpr fpct_m; \ - fpct_a_re = (a_re); \ - fpct_a_im = (a_im); \ - fpct_m = fpr_add(fpr_sqr(fpct_a_re), fpr_sqr(fpct_a_im)); \ - fpct_m = fpr_inv(fpct_m); \ - fpct_d_re = fpr_mul(fpct_a_re, fpct_m); \ - fpct_d_im = fpr_mul(fpr_neg(fpct_a_im), fpct_m); \ - (d_re) = fpct_d_re; \ - (d_im) = fpct_d_im; \ - } while (0) - -/* - * Division of complex numbers (d = a / b). 
- */ -#define FPC_DIV(d_re, d_im, a_re, a_im, b_re, b_im) do { \ - fpr fpct_a_re, fpct_a_im; \ - fpr fpct_b_re, fpct_b_im; \ - fpr fpct_d_re, fpct_d_im; \ - fpr fpct_m; \ - fpct_a_re = (a_re); \ - fpct_a_im = (a_im); \ - fpct_b_re = (b_re); \ - fpct_b_im = (b_im); \ - fpct_m = fpr_add(fpr_sqr(fpct_b_re), fpr_sqr(fpct_b_im)); \ - fpct_m = fpr_inv(fpct_m); \ - fpct_b_re = fpr_mul(fpct_b_re, fpct_m); \ - fpct_b_im = fpr_mul(fpr_neg(fpct_b_im), fpct_m); \ - fpct_d_re = fpr_sub( \ - fpr_mul(fpct_a_re, fpct_b_re), \ - fpr_mul(fpct_a_im, fpct_b_im)); \ - fpct_d_im = fpr_add( \ - fpr_mul(fpct_a_re, fpct_b_im), \ - fpr_mul(fpct_a_im, fpct_b_re)); \ - (d_re) = fpct_d_re; \ - (d_im) = fpct_d_im; \ - } while (0) - -/* - * Let w = exp(i*pi/N); w is a primitive 2N-th root of 1. We define the - * values w_j = w^(2j+1) for all j from 0 to N-1: these are the roots - * of X^N+1 in the field of complex numbers. A crucial property is that - * w_{N-1-j} = conj(w_j) = 1/w_j for all j. - * - * FFT representation of a polynomial f (taken modulo X^N+1) is the - * set of values f(w_j). Since f is real, conj(f(w_j)) = f(conj(w_j)), - * thus f(w_{N-1-j}) = conj(f(w_j)). We thus store only half the values, - * for j = 0 to N/2-1; the other half can be recomputed easily when (if) - * needed. A consequence is that FFT representation has the same size - * as normal representation: N/2 complex numbers use N real numbers (each - * complex number is the combination of a real and an imaginary part). - * - * We use a specific ordering which makes computations easier. Let rev() - * be the bit-reversal function over log(N) bits. For j in 0..N/2-1, we - * store the real and imaginary parts of f(w_j) in slots: - * - * Re(f(w_j)) -> slot rev(j)/2 - * Im(f(w_j)) -> slot rev(j)/2+N/2 - * - * (Note that rev(j) is even for j < N/2.) 
- */ - -/* see inner.h */ -TARGET_AVX2 -void -Zf(FFT)(fpr *f, unsigned logn) -{ - /* - * FFT algorithm in bit-reversal order uses the following - * iterative algorithm: - * - * t = N - * for m = 1; m < N; m *= 2: - * ht = t/2 - * for i1 = 0; i1 < m; i1 ++: - * j1 = i1 * t - * s = GM[m + i1] - * for j = j1; j < (j1 + ht); j ++: - * x = f[j] - * y = s * f[j + ht] - * f[j] = x + y - * f[j + ht] = x - y - * t = ht - * - * GM[k] contains w^rev(k) for primitive root w = exp(i*pi/N). - * - * In the description above, f[] is supposed to contain complex - * numbers. In our in-memory representation, the real and - * imaginary parts of f[k] are in array slots k and k+N/2. - * - * We only keep the first half of the complex numbers. We can - * see that after the first iteration, the first and second halves - * of the array of complex numbers have separate lives, so we - * simply ignore the second part. - */ - - unsigned u; - size_t t, n, hn, m; - - /* - * First iteration: compute f[j] + i * f[j+N/2] for all j < N/2 - * (because GM[1] = w^rev(1) = w^(N/2) = i). - * In our chosen representation, this is a no-op: everything is - * already where it should be. - */ - - /* - * Subsequent iterations are truncated to use only the first - * half of values. 
- */ - n = (size_t)1 << logn; - hn = n >> 1; - t = hn; - for (u = 1, m = 2; u < logn; u ++, m <<= 1) { - size_t ht, hm, i1, j1; - - ht = t >> 1; - hm = m >> 1; - for (i1 = 0, j1 = 0; i1 < hm; i1 ++, j1 += t) { - size_t j, j2; - - j2 = j1 + ht; -#if FALCON_AVX2 // yyyAVX2+1 - if (ht >= 4) { - __m256d s_re, s_im; - - s_re = _mm256_set1_pd( - fpr_gm_tab[((m + i1) << 1) + 0].v); - s_im = _mm256_set1_pd( - fpr_gm_tab[((m + i1) << 1) + 1].v); - for (j = j1; j < j2; j += 4) { - __m256d x_re, x_im, y_re, y_im; - __m256d z_re, z_im; - - x_re = _mm256_loadu_pd(&f[j].v); - x_im = _mm256_loadu_pd(&f[j + hn].v); - z_re = _mm256_loadu_pd(&f[j+ht].v); - z_im = _mm256_loadu_pd(&f[j+ht + hn].v); - y_re = FMSUB(z_re, s_re, - _mm256_mul_pd(z_im, s_im)); - y_im = FMADD(z_re, s_im, - _mm256_mul_pd(z_im, s_re)); - _mm256_storeu_pd(&f[j].v, - _mm256_add_pd(x_re, y_re)); - _mm256_storeu_pd(&f[j + hn].v, - _mm256_add_pd(x_im, y_im)); - _mm256_storeu_pd(&f[j + ht].v, - _mm256_sub_pd(x_re, y_re)); - _mm256_storeu_pd(&f[j + ht + hn].v, - _mm256_sub_pd(x_im, y_im)); - } - } else { - fpr s_re, s_im; - - s_re = fpr_gm_tab[((m + i1) << 1) + 0]; - s_im = fpr_gm_tab[((m + i1) << 1) + 1]; - for (j = j1; j < j2; j ++) { - fpr x_re, x_im, y_re, y_im; - - x_re = f[j]; - x_im = f[j + hn]; - y_re = f[j + ht]; - y_im = f[j + ht + hn]; - FPC_MUL(y_re, y_im, - y_re, y_im, s_re, s_im); - FPC_ADD(f[j], f[j + hn], - x_re, x_im, y_re, y_im); - FPC_SUB(f[j + ht], f[j + ht + hn], - x_re, x_im, y_re, y_im); - } - } -#else // yyyAVX2+0 - fpr s_re, s_im; - - s_re = fpr_gm_tab[((m + i1) << 1) + 0]; - s_im = fpr_gm_tab[((m + i1) << 1) + 1]; - for (j = j1; j < j2; j ++) { - fpr x_re, x_im, y_re, y_im; - - x_re = f[j]; - x_im = f[j + hn]; - y_re = f[j + ht]; - y_im = f[j + ht + hn]; - FPC_MUL(y_re, y_im, y_re, y_im, s_re, s_im); - FPC_ADD(f[j], f[j + hn], - x_re, x_im, y_re, y_im); - FPC_SUB(f[j + ht], f[j + ht + hn], - x_re, x_im, y_re, y_im); - } -#endif // yyyAVX2- - } - t = ht; - } -} - -/* see inner.h */ 
-TARGET_AVX2 -void -Zf(iFFT)(fpr *f, unsigned logn) -{ - /* - * Inverse FFT algorithm in bit-reversal order uses the following - * iterative algorithm: - * - * t = 1 - * for m = N; m > 1; m /= 2: - * hm = m/2 - * dt = t*2 - * for i1 = 0; i1 < hm; i1 ++: - * j1 = i1 * dt - * s = iGM[hm + i1] - * for j = j1; j < (j1 + t); j ++: - * x = f[j] - * y = f[j + t] - * f[j] = x + y - * f[j + t] = s * (x - y) - * t = dt - * for i1 = 0; i1 < N; i1 ++: - * f[i1] = f[i1] / N - * - * iGM[k] contains (1/w)^rev(k) for primitive root w = exp(i*pi/N) - * (actually, iGM[k] = 1/GM[k] = conj(GM[k])). - * - * In the main loop (not counting the final division loop), in - * all iterations except the last, the first and second half of f[] - * (as an array of complex numbers) are separate. In our chosen - * representation, we do not keep the second half. - * - * The last iteration recombines the recomputed half with the - * implicit half, and should yield only real numbers since the - * target polynomial is real; moreover, s = i at that step. - * Thus, when considering x and y: - * y = conj(x) since the final f[j] must be real - * Therefore, f[j] is filled with 2*Re(x), and f[j + t] is - * filled with 2*Im(x). - * But we already have Re(x) and Im(x) in array slots j and j+t - * in our chosen representation. That last iteration is thus a - * simple doubling of the values in all the array. - * - * We make the last iteration a no-op by tweaking the final - * division into a division by N/2, not N. 
- */ - size_t u, n, hn, t, m; - - n = (size_t)1 << logn; - t = 1; - m = n; - hn = n >> 1; - for (u = logn; u > 1; u --) { - size_t hm, dt, i1, j1; - - hm = m >> 1; - dt = t << 1; - for (i1 = 0, j1 = 0; j1 < hn; i1 ++, j1 += dt) { - size_t j, j2; - - j2 = j1 + t; -#if FALCON_AVX2 // yyyAVX2+1 - if (t >= 4) { - __m256d s_re, s_im; - - s_re = _mm256_set1_pd( - fpr_gm_tab[((hm + i1) << 1) + 0].v); - s_im = _mm256_set1_pd( - fpr_gm_tab[((hm + i1) << 1) + 1].v); - for (j = j1; j < j2; j += 4) { - __m256d x_re, x_im, y_re, y_im; - __m256d z_re, z_im; - - x_re = _mm256_loadu_pd(&f[j].v); - x_im = _mm256_loadu_pd(&f[j + hn].v); - y_re = _mm256_loadu_pd(&f[j+t].v); - y_im = _mm256_loadu_pd(&f[j+t + hn].v); - _mm256_storeu_pd(&f[j].v, - _mm256_add_pd(x_re, y_re)); - _mm256_storeu_pd(&f[j + hn].v, - _mm256_add_pd(x_im, y_im)); - x_re = _mm256_sub_pd(y_re, x_re); - x_im = _mm256_sub_pd(x_im, y_im); - z_re = FMSUB(x_im, s_im, - _mm256_mul_pd(x_re, s_re)); - z_im = FMADD(x_re, s_im, - _mm256_mul_pd(x_im, s_re)); - _mm256_storeu_pd(&f[j+t].v, z_re); - _mm256_storeu_pd(&f[j+t + hn].v, z_im); - } - } else { - fpr s_re, s_im; - - s_re = fpr_gm_tab[((hm + i1) << 1)+0]; - s_im = fpr_neg(fpr_gm_tab[((hm + i1) << 1)+1]); - for (j = j1; j < j2; j ++) { - fpr x_re, x_im, y_re, y_im; - - x_re = f[j]; - x_im = f[j + hn]; - y_re = f[j + t]; - y_im = f[j + t + hn]; - FPC_ADD(f[j], f[j + hn], - x_re, x_im, y_re, y_im); - FPC_SUB(x_re, x_im, - x_re, x_im, y_re, y_im); - FPC_MUL(f[j + t], f[j + t + hn], - x_re, x_im, s_re, s_im); - } - } -#else // yyyAVX2+0 - fpr s_re, s_im; - - s_re = fpr_gm_tab[((hm + i1) << 1) + 0]; - s_im = fpr_neg(fpr_gm_tab[((hm + i1) << 1) + 1]); - for (j = j1; j < j2; j ++) { - fpr x_re, x_im, y_re, y_im; - - x_re = f[j]; - x_im = f[j + hn]; - y_re = f[j + t]; - y_im = f[j + t + hn]; - FPC_ADD(f[j], f[j + hn], - x_re, x_im, y_re, y_im); - FPC_SUB(x_re, x_im, x_re, x_im, y_re, y_im); - FPC_MUL(f[j + t], f[j + t + hn], - x_re, x_im, s_re, s_im); - } -#endif // yyyAVX2- - } 
- t = dt; - m = hm; - } - - /* - * Last iteration is a no-op, provided that we divide by N/2 - * instead of N. We need to make a special case for logn = 0. - */ - if (logn > 0) { - fpr ni; - - ni = fpr_p2_tab[logn]; - for (u = 0; u < n; u ++) { - f[u] = fpr_mul(f[u], ni); - } - } -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_add)( - fpr *restrict a, const fpr *restrict b, unsigned logn) -{ - size_t n, u; - - n = (size_t)1 << logn; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 4) { - for (u = 0; u < n; u += 4) { - _mm256_storeu_pd(&a[u].v, - _mm256_add_pd( - _mm256_loadu_pd(&a[u].v), - _mm256_loadu_pd(&b[u].v))); - } - } else { - for (u = 0; u < n; u ++) { - a[u] = fpr_add(a[u], b[u]); - } - } -#else // yyyAVX2+0 - for (u = 0; u < n; u ++) { - a[u] = fpr_add(a[u], b[u]); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_sub)( - fpr *restrict a, const fpr *restrict b, unsigned logn) -{ - size_t n, u; - - n = (size_t)1 << logn; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 4) { - for (u = 0; u < n; u += 4) { - _mm256_storeu_pd(&a[u].v, - _mm256_sub_pd( - _mm256_loadu_pd(&a[u].v), - _mm256_loadu_pd(&b[u].v))); - } - } else { - for (u = 0; u < n; u ++) { - a[u] = fpr_sub(a[u], b[u]); - } - } -#else // yyyAVX2+0 - for (u = 0; u < n; u ++) { - a[u] = fpr_sub(a[u], b[u]); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_neg)(fpr *a, unsigned logn) -{ - size_t n, u; - - n = (size_t)1 << logn; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 4) { - __m256d s; - - s = _mm256_set1_pd(-0.0); - for (u = 0; u < n; u += 4) { - _mm256_storeu_pd(&a[u].v, - _mm256_xor_pd(_mm256_loadu_pd(&a[u].v), s)); - } - } else { - for (u = 0; u < n; u ++) { - a[u] = fpr_neg(a[u]); - } - } -#else // yyyAVX2+0 - for (u = 0; u < n; u ++) { - a[u] = fpr_neg(a[u]); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_adj_fft)(fpr *a, unsigned logn) -{ - size_t n, u; - - n = (size_t)1 << logn; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { 
- __m256d s; - - s = _mm256_set1_pd(-0.0); - for (u = (n >> 1); u < n; u += 4) { - _mm256_storeu_pd(&a[u].v, - _mm256_xor_pd(_mm256_loadu_pd(&a[u].v), s)); - } - } else { - for (u = (n >> 1); u < n; u ++) { - a[u] = fpr_neg(a[u]); - } - } -#else // yyyAVX2+0 - for (u = (n >> 1); u < n; u ++) { - a[u] = fpr_neg(a[u]); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_mul_fft)( - fpr *restrict a, const fpr *restrict b, unsigned logn) -{ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - for (u = 0; u < hn; u += 4) { - __m256d a_re, a_im, b_re, b_im, c_re, c_im; - - a_re = _mm256_loadu_pd(&a[u].v); - a_im = _mm256_loadu_pd(&a[u + hn].v); - b_re = _mm256_loadu_pd(&b[u].v); - b_im = _mm256_loadu_pd(&b[u + hn].v); - c_re = FMSUB( - a_re, b_re, _mm256_mul_pd(a_im, b_im)); - c_im = FMADD( - a_re, b_im, _mm256_mul_pd(a_im, b_re)); - _mm256_storeu_pd(&a[u].v, c_re); - _mm256_storeu_pd(&a[u + hn].v, c_im); - } - } else { - for (u = 0; u < hn; u ++) { - fpr a_re, a_im, b_re, b_im; - - a_re = a[u]; - a_im = a[u + hn]; - b_re = b[u]; - b_im = b[u + hn]; - FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im); - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - fpr a_re, a_im, b_re, b_im; - - a_re = a[u]; - a_im = a[u + hn]; - b_re = b[u]; - b_im = b[u + hn]; - FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_muladj_fft)( - fpr *restrict a, const fpr *restrict b, unsigned logn) -{ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - for (u = 0; u < hn; u += 4) { - __m256d a_re, a_im, b_re, b_im, c_re, c_im; - - a_re = _mm256_loadu_pd(&a[u].v); - a_im = _mm256_loadu_pd(&a[u + hn].v); - b_re = _mm256_loadu_pd(&b[u].v); - b_im = _mm256_loadu_pd(&b[u + hn].v); - c_re = FMADD( - a_re, b_re, _mm256_mul_pd(a_im, b_im)); - c_im = FMSUB( - a_im, b_re, _mm256_mul_pd(a_re, 
b_im)); - _mm256_storeu_pd(&a[u].v, c_re); - _mm256_storeu_pd(&a[u + hn].v, c_im); - } - } else { - for (u = 0; u < hn; u ++) { - fpr a_re, a_im, b_re, b_im; - - a_re = a[u]; - a_im = a[u + hn]; - b_re = b[u]; - b_im = fpr_neg(b[u + hn]); - FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im); - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - fpr a_re, a_im, b_re, b_im; - - a_re = a[u]; - a_im = a[u + hn]; - b_re = b[u]; - b_im = fpr_neg(b[u + hn]); - FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_mulselfadj_fft)(fpr *a, unsigned logn) -{ - /* - * Since each coefficient is multiplied with its own conjugate, - * the result contains only real values. - */ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - __m256d zero; - - zero = _mm256_setzero_pd(); - for (u = 0; u < hn; u += 4) { - __m256d a_re, a_im; - - a_re = _mm256_loadu_pd(&a[u].v); - a_im = _mm256_loadu_pd(&a[u + hn].v); - _mm256_storeu_pd(&a[u].v, - FMADD(a_re, a_re, - _mm256_mul_pd(a_im, a_im))); - _mm256_storeu_pd(&a[u + hn].v, zero); - } - } else { - for (u = 0; u < hn; u ++) { - fpr a_re, a_im; - - a_re = a[u]; - a_im = a[u + hn]; - a[u] = fpr_add(fpr_sqr(a_re), fpr_sqr(a_im)); - a[u + hn] = fpr_zero; - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - fpr a_re, a_im; - - a_re = a[u]; - a_im = a[u + hn]; - a[u] = fpr_add(fpr_sqr(a_re), fpr_sqr(a_im)); - a[u + hn] = fpr_zero; - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_mulconst)(fpr *a, fpr x, unsigned logn) -{ - size_t n, u; - - n = (size_t)1 << logn; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 4) { - __m256d x4; - - x4 = _mm256_set1_pd(x.v); - for (u = 0; u < n; u += 4) { - _mm256_storeu_pd(&a[u].v, - _mm256_mul_pd(x4, _mm256_loadu_pd(&a[u].v))); - } - } else { - for (u = 0; u < n; u ++) { - a[u] = fpr_mul(a[u], x); - } - } -#else // yyyAVX2+0 - for (u = 0; u < n; u ++) { - 
a[u] = fpr_mul(a[u], x); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_div_fft)( - fpr *restrict a, const fpr *restrict b, unsigned logn) -{ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - __m256d one; - - one = _mm256_set1_pd(1.0); - for (u = 0; u < hn; u += 4) { - __m256d a_re, a_im, b_re, b_im, c_re, c_im, t; - - a_re = _mm256_loadu_pd(&a[u].v); - a_im = _mm256_loadu_pd(&a[u + hn].v); - b_re = _mm256_loadu_pd(&b[u].v); - b_im = _mm256_loadu_pd(&b[u + hn].v); - t = _mm256_div_pd(one, - FMADD(b_re, b_re, - _mm256_mul_pd(b_im, b_im))); - b_re = _mm256_mul_pd(b_re, t); - b_im = _mm256_mul_pd(b_im, t); - c_re = FMADD( - a_re, b_re, _mm256_mul_pd(a_im, b_im)); - c_im = FMSUB( - a_im, b_re, _mm256_mul_pd(a_re, b_im)); - _mm256_storeu_pd(&a[u].v, c_re); - _mm256_storeu_pd(&a[u + hn].v, c_im); - } - } else { - for (u = 0; u < hn; u ++) { - fpr a_re, a_im, b_re, b_im; - - a_re = a[u]; - a_im = a[u + hn]; - b_re = b[u]; - b_im = b[u + hn]; - FPC_DIV(a[u], a[u + hn], a_re, a_im, b_re, b_im); - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - fpr a_re, a_im, b_re, b_im; - - a_re = a[u]; - a_im = a[u + hn]; - b_re = b[u]; - b_im = b[u + hn]; - FPC_DIV(a[u], a[u + hn], a_re, a_im, b_re, b_im); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_invnorm2_fft)(fpr *restrict d, - const fpr *restrict a, const fpr *restrict b, unsigned logn) -{ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - __m256d one; - - one = _mm256_set1_pd(1.0); - for (u = 0; u < hn; u += 4) { - __m256d a_re, a_im, b_re, b_im, dv; - - a_re = _mm256_loadu_pd(&a[u].v); - a_im = _mm256_loadu_pd(&a[u + hn].v); - b_re = _mm256_loadu_pd(&b[u].v); - b_im = _mm256_loadu_pd(&b[u + hn].v); - dv = _mm256_div_pd(one, - _mm256_add_pd( - FMADD(a_re, a_re, - _mm256_mul_pd(a_im, a_im)), - FMADD(b_re, b_re, - _mm256_mul_pd(b_im, b_im)))); - 
_mm256_storeu_pd(&d[u].v, dv); - } - } else { - for (u = 0; u < hn; u ++) { - fpr a_re, a_im; - fpr b_re, b_im; - - a_re = a[u]; - a_im = a[u + hn]; - b_re = b[u]; - b_im = b[u + hn]; - d[u] = fpr_inv(fpr_add( - fpr_add(fpr_sqr(a_re), fpr_sqr(a_im)), - fpr_add(fpr_sqr(b_re), fpr_sqr(b_im)))); - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - fpr a_re, a_im; - fpr b_re, b_im; - - a_re = a[u]; - a_im = a[u + hn]; - b_re = b[u]; - b_im = b[u + hn]; - d[u] = fpr_inv(fpr_add( - fpr_add(fpr_sqr(a_re), fpr_sqr(a_im)), - fpr_add(fpr_sqr(b_re), fpr_sqr(b_im)))); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_add_muladj_fft)(fpr *restrict d, - const fpr *restrict F, const fpr *restrict G, - const fpr *restrict f, const fpr *restrict g, unsigned logn) -{ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - for (u = 0; u < hn; u += 4) { - __m256d F_re, F_im, G_re, G_im; - __m256d f_re, f_im, g_re, g_im; - __m256d a_re, a_im, b_re, b_im; - - F_re = _mm256_loadu_pd(&F[u].v); - F_im = _mm256_loadu_pd(&F[u + hn].v); - G_re = _mm256_loadu_pd(&G[u].v); - G_im = _mm256_loadu_pd(&G[u + hn].v); - f_re = _mm256_loadu_pd(&f[u].v); - f_im = _mm256_loadu_pd(&f[u + hn].v); - g_re = _mm256_loadu_pd(&g[u].v); - g_im = _mm256_loadu_pd(&g[u + hn].v); - - a_re = FMADD(F_re, f_re, - _mm256_mul_pd(F_im, f_im)); - a_im = FMSUB(F_im, f_re, - _mm256_mul_pd(F_re, f_im)); - b_re = FMADD(G_re, g_re, - _mm256_mul_pd(G_im, g_im)); - b_im = FMSUB(G_im, g_re, - _mm256_mul_pd(G_re, g_im)); - _mm256_storeu_pd(&d[u].v, - _mm256_add_pd(a_re, b_re)); - _mm256_storeu_pd(&d[u + hn].v, - _mm256_add_pd(a_im, b_im)); - } - } else { - for (u = 0; u < hn; u ++) { - fpr F_re, F_im, G_re, G_im; - fpr f_re, f_im, g_re, g_im; - fpr a_re, a_im, b_re, b_im; - - F_re = F[u]; - F_im = F[u + hn]; - G_re = G[u]; - G_im = G[u + hn]; - f_re = f[u]; - f_im = f[u + hn]; - g_re = g[u]; - g_im = g[u + hn]; - - FPC_MUL(a_re, a_im, F_re, F_im, 
f_re, fpr_neg(f_im)); - FPC_MUL(b_re, b_im, G_re, G_im, g_re, fpr_neg(g_im)); - d[u] = fpr_add(a_re, b_re); - d[u + hn] = fpr_add(a_im, b_im); - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - fpr F_re, F_im, G_re, G_im; - fpr f_re, f_im, g_re, g_im; - fpr a_re, a_im, b_re, b_im; - - F_re = F[u]; - F_im = F[u + hn]; - G_re = G[u]; - G_im = G[u + hn]; - f_re = f[u]; - f_im = f[u + hn]; - g_re = g[u]; - g_im = g[u + hn]; - - FPC_MUL(a_re, a_im, F_re, F_im, f_re, fpr_neg(f_im)); - FPC_MUL(b_re, b_im, G_re, G_im, g_re, fpr_neg(g_im)); - d[u] = fpr_add(a_re, b_re); - d[u + hn] = fpr_add(a_im, b_im); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_mul_autoadj_fft)( - fpr *restrict a, const fpr *restrict b, unsigned logn) -{ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - for (u = 0; u < hn; u += 4) { - __m256d a_re, a_im, bv; - - a_re = _mm256_loadu_pd(&a[u].v); - a_im = _mm256_loadu_pd(&a[u + hn].v); - bv = _mm256_loadu_pd(&b[u].v); - _mm256_storeu_pd(&a[u].v, - _mm256_mul_pd(a_re, bv)); - _mm256_storeu_pd(&a[u + hn].v, - _mm256_mul_pd(a_im, bv)); - } - } else { - for (u = 0; u < hn; u ++) { - a[u] = fpr_mul(a[u], b[u]); - a[u + hn] = fpr_mul(a[u + hn], b[u]); - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - a[u] = fpr_mul(a[u], b[u]); - a[u + hn] = fpr_mul(a[u + hn], b[u]); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_div_autoadj_fft)( - fpr *restrict a, const fpr *restrict b, unsigned logn) -{ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - __m256d one; - - one = _mm256_set1_pd(1.0); - for (u = 0; u < hn; u += 4) { - __m256d ib, a_re, a_im; - - ib = _mm256_div_pd(one, _mm256_loadu_pd(&b[u].v)); - a_re = _mm256_loadu_pd(&a[u].v); - a_im = _mm256_loadu_pd(&a[u + hn].v); - _mm256_storeu_pd(&a[u].v, _mm256_mul_pd(a_re, ib)); - _mm256_storeu_pd(&a[u + hn].v, 
_mm256_mul_pd(a_im, ib)); - } - } else { - for (u = 0; u < hn; u ++) { - fpr ib; - - ib = fpr_inv(b[u]); - a[u] = fpr_mul(a[u], ib); - a[u + hn] = fpr_mul(a[u + hn], ib); - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - fpr ib; - - ib = fpr_inv(b[u]); - a[u] = fpr_mul(a[u], ib); - a[u + hn] = fpr_mul(a[u + hn], ib); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_LDL_fft)( - const fpr *restrict g00, - fpr *restrict g01, fpr *restrict g11, unsigned logn) -{ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - __m256d one; - - one = _mm256_set1_pd(1.0); - for (u = 0; u < hn; u += 4) { - __m256d g00_re, g00_im, g01_re, g01_im, g11_re, g11_im; - __m256d t, mu_re, mu_im, xi_re, xi_im; - - g00_re = _mm256_loadu_pd(&g00[u].v); - g00_im = _mm256_loadu_pd(&g00[u + hn].v); - g01_re = _mm256_loadu_pd(&g01[u].v); - g01_im = _mm256_loadu_pd(&g01[u + hn].v); - g11_re = _mm256_loadu_pd(&g11[u].v); - g11_im = _mm256_loadu_pd(&g11[u + hn].v); - - t = _mm256_div_pd(one, - FMADD(g00_re, g00_re, - _mm256_mul_pd(g00_im, g00_im))); - g00_re = _mm256_mul_pd(g00_re, t); - g00_im = _mm256_mul_pd(g00_im, t); - mu_re = FMADD(g01_re, g00_re, - _mm256_mul_pd(g01_im, g00_im)); - mu_im = FMSUB(g01_re, g00_im, - _mm256_mul_pd(g01_im, g00_re)); - xi_re = FMSUB(mu_re, g01_re, - _mm256_mul_pd(mu_im, g01_im)); - xi_im = FMADD(mu_im, g01_re, - _mm256_mul_pd(mu_re, g01_im)); - _mm256_storeu_pd(&g11[u].v, - _mm256_sub_pd(g11_re, xi_re)); - _mm256_storeu_pd(&g11[u + hn].v, - _mm256_add_pd(g11_im, xi_im)); - _mm256_storeu_pd(&g01[u].v, mu_re); - _mm256_storeu_pd(&g01[u + hn].v, mu_im); - } - } else { - for (u = 0; u < hn; u ++) { - fpr g00_re, g00_im, g01_re, g01_im, g11_re, g11_im; - fpr mu_re, mu_im; - - g00_re = g00[u]; - g00_im = g00[u + hn]; - g01_re = g01[u]; - g01_im = g01[u + hn]; - g11_re = g11[u]; - g11_im = g11[u + hn]; - FPC_DIV(mu_re, mu_im, g01_re, g01_im, g00_re, g00_im); - FPC_MUL(g01_re, g01_im, 
- mu_re, mu_im, g01_re, fpr_neg(g01_im)); - FPC_SUB(g11[u], g11[u + hn], - g11_re, g11_im, g01_re, g01_im); - g01[u] = mu_re; - g01[u + hn] = fpr_neg(mu_im); - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - fpr g00_re, g00_im, g01_re, g01_im, g11_re, g11_im; - fpr mu_re, mu_im; - - g00_re = g00[u]; - g00_im = g00[u + hn]; - g01_re = g01[u]; - g01_im = g01[u + hn]; - g11_re = g11[u]; - g11_im = g11[u + hn]; - FPC_DIV(mu_re, mu_im, g01_re, g01_im, g00_re, g00_im); - FPC_MUL(g01_re, g01_im, mu_re, mu_im, g01_re, fpr_neg(g01_im)); - FPC_SUB(g11[u], g11[u + hn], g11_re, g11_im, g01_re, g01_im); - g01[u] = mu_re; - g01[u + hn] = fpr_neg(mu_im); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_LDLmv_fft)( - fpr *restrict d11, fpr *restrict l10, - const fpr *restrict g00, const fpr *restrict g01, - const fpr *restrict g11, unsigned logn) -{ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - __m256d one; - - one = _mm256_set1_pd(1.0); - for (u = 0; u < hn; u += 4) { - __m256d g00_re, g00_im, g01_re, g01_im, g11_re, g11_im; - __m256d t, mu_re, mu_im, xi_re, xi_im; - - g00_re = _mm256_loadu_pd(&g00[u].v); - g00_im = _mm256_loadu_pd(&g00[u + hn].v); - g01_re = _mm256_loadu_pd(&g01[u].v); - g01_im = _mm256_loadu_pd(&g01[u + hn].v); - g11_re = _mm256_loadu_pd(&g11[u].v); - g11_im = _mm256_loadu_pd(&g11[u + hn].v); - - t = _mm256_div_pd(one, - FMADD(g00_re, g00_re, - _mm256_mul_pd(g00_im, g00_im))); - g00_re = _mm256_mul_pd(g00_re, t); - g00_im = _mm256_mul_pd(g00_im, t); - mu_re = FMADD(g01_re, g00_re, - _mm256_mul_pd(g01_im, g00_im)); - mu_im = FMSUB(g01_re, g00_im, - _mm256_mul_pd(g01_im, g00_re)); - xi_re = FMSUB(mu_re, g01_re, - _mm256_mul_pd(mu_im, g01_im)); - xi_im = FMADD(mu_im, g01_re, - _mm256_mul_pd(mu_re, g01_im)); - _mm256_storeu_pd(&d11[u].v, - _mm256_sub_pd(g11_re, xi_re)); - _mm256_storeu_pd(&d11[u + hn].v, - _mm256_add_pd(g11_im, xi_im)); - _mm256_storeu_pd(&l10[u].v, 
mu_re); - _mm256_storeu_pd(&l10[u + hn].v, mu_im); - } - } else { - for (u = 0; u < hn; u ++) { - fpr g00_re, g00_im, g01_re, g01_im, g11_re, g11_im; - fpr mu_re, mu_im; - - g00_re = g00[u]; - g00_im = g00[u + hn]; - g01_re = g01[u]; - g01_im = g01[u + hn]; - g11_re = g11[u]; - g11_im = g11[u + hn]; - FPC_DIV(mu_re, mu_im, g01_re, g01_im, g00_re, g00_im); - FPC_MUL(g01_re, g01_im, - mu_re, mu_im, g01_re, fpr_neg(g01_im)); - FPC_SUB(d11[u], d11[u + hn], - g11_re, g11_im, g01_re, g01_im); - l10[u] = mu_re; - l10[u + hn] = fpr_neg(mu_im); - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - fpr g00_re, g00_im, g01_re, g01_im, g11_re, g11_im; - fpr mu_re, mu_im; - - g00_re = g00[u]; - g00_im = g00[u + hn]; - g01_re = g01[u]; - g01_im = g01[u + hn]; - g11_re = g11[u]; - g11_im = g11[u + hn]; - FPC_DIV(mu_re, mu_im, g01_re, g01_im, g00_re, g00_im); - FPC_MUL(g01_re, g01_im, mu_re, mu_im, g01_re, fpr_neg(g01_im)); - FPC_SUB(d11[u], d11[u + hn], g11_re, g11_im, g01_re, g01_im); - l10[u] = mu_re; - l10[u + hn] = fpr_neg(mu_im); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_split_fft)( - fpr *restrict f0, fpr *restrict f1, - const fpr *restrict f, unsigned logn) -{ - /* - * The FFT representation we use is in bit-reversed order - * (element i contains f(w^(rev(i))), where rev() is the - * bit-reversal function over the ring degree. This changes - * indexes with regards to the Falcon specification. 
- */ - size_t n, hn, qn, u; - - n = (size_t)1 << logn; - hn = n >> 1; - qn = hn >> 1; - -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - __m256d half, sv; - - half = _mm256_set1_pd(0.5); - sv = _mm256_set_pd(-0.0, 0.0, -0.0, 0.0); - for (u = 0; u < qn; u += 2) { - __m256d ab_re, ab_im, ff0, ff1, ff2, ff3, gmt; - - ab_re = _mm256_loadu_pd(&f[(u << 1)].v); - ab_im = _mm256_loadu_pd(&f[(u << 1) + hn].v); - ff0 = _mm256_mul_pd(_mm256_hadd_pd(ab_re, ab_im), half); - ff0 = _mm256_permute4x64_pd(ff0, 0xD8); - _mm_storeu_pd(&f0[u].v, - _mm256_extractf128_pd(ff0, 0)); - _mm_storeu_pd(&f0[u + qn].v, - _mm256_extractf128_pd(ff0, 1)); - - ff1 = _mm256_mul_pd(_mm256_hsub_pd(ab_re, ab_im), half); - gmt = _mm256_loadu_pd(&fpr_gm_tab[(u + hn) << 1].v); - ff2 = _mm256_shuffle_pd(ff1, ff1, 0x5); - ff3 = _mm256_hadd_pd( - _mm256_mul_pd(ff1, gmt), - _mm256_xor_pd(_mm256_mul_pd(ff2, gmt), sv)); - ff3 = _mm256_permute4x64_pd(ff3, 0xD8); - _mm_storeu_pd(&f1[u].v, - _mm256_extractf128_pd(ff3, 0)); - _mm_storeu_pd(&f1[u + qn].v, - _mm256_extractf128_pd(ff3, 1)); - } - } else { - f0[0] = f[0]; - f1[0] = f[hn]; - - for (u = 0; u < qn; u ++) { - fpr a_re, a_im, b_re, b_im; - fpr t_re, t_im; - - a_re = f[(u << 1) + 0]; - a_im = f[(u << 1) + 0 + hn]; - b_re = f[(u << 1) + 1]; - b_im = f[(u << 1) + 1 + hn]; - - FPC_ADD(t_re, t_im, a_re, a_im, b_re, b_im); - f0[u] = fpr_half(t_re); - f0[u + qn] = fpr_half(t_im); - - FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im); - FPC_MUL(t_re, t_im, t_re, t_im, - fpr_gm_tab[((u + hn) << 1) + 0], - fpr_neg(fpr_gm_tab[((u + hn) << 1) + 1])); - f1[u] = fpr_half(t_re); - f1[u + qn] = fpr_half(t_im); - } - } -#else // yyyAVX2+0 - /* - * We process complex values by pairs. For logn = 1, there is only - * one complex value (the other one is the implicit conjugate), - * so we add the two lines below because the loop will be - * skipped. 
- */ - f0[0] = f[0]; - f1[0] = f[hn]; - - for (u = 0; u < qn; u ++) { - fpr a_re, a_im, b_re, b_im; - fpr t_re, t_im; - - a_re = f[(u << 1) + 0]; - a_im = f[(u << 1) + 0 + hn]; - b_re = f[(u << 1) + 1]; - b_im = f[(u << 1) + 1 + hn]; - - FPC_ADD(t_re, t_im, a_re, a_im, b_re, b_im); - f0[u] = fpr_half(t_re); - f0[u + qn] = fpr_half(t_im); - - FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im); - FPC_MUL(t_re, t_im, t_re, t_im, - fpr_gm_tab[((u + hn) << 1) + 0], - fpr_neg(fpr_gm_tab[((u + hn) << 1) + 1])); - f1[u] = fpr_half(t_re); - f1[u + qn] = fpr_half(t_im); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_merge_fft)( - fpr *restrict f, - const fpr *restrict f0, const fpr *restrict f1, unsigned logn) -{ - size_t n, hn, qn, u; - - n = (size_t)1 << logn; - hn = n >> 1; - qn = hn >> 1; - -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 16) { - for (u = 0; u < qn; u += 4) { - __m256d a_re, a_im, b_re, b_im, c_re, c_im; - __m256d gm1, gm2, g_re, g_im; - __m256d t_re, t_im, u_re, u_im; - __m256d tu1_re, tu2_re, tu1_im, tu2_im; - - a_re = _mm256_loadu_pd(&f0[u].v); - a_im = _mm256_loadu_pd(&f0[u + qn].v); - c_re = _mm256_loadu_pd(&f1[u].v); - c_im = _mm256_loadu_pd(&f1[u + qn].v); - - gm1 = _mm256_loadu_pd(&fpr_gm_tab[(u + hn) << 1].v); - gm2 = _mm256_loadu_pd(&fpr_gm_tab[(u + 2 + hn) << 1].v); - g_re = _mm256_unpacklo_pd(gm1, gm2); - g_im = _mm256_unpackhi_pd(gm1, gm2); - g_re = _mm256_permute4x64_pd(g_re, 0xD8); - g_im = _mm256_permute4x64_pd(g_im, 0xD8); - - b_re = FMSUB( - c_re, g_re, _mm256_mul_pd(c_im, g_im)); - b_im = FMADD( - c_re, g_im, _mm256_mul_pd(c_im, g_re)); - - t_re = _mm256_add_pd(a_re, b_re); - t_im = _mm256_add_pd(a_im, b_im); - u_re = _mm256_sub_pd(a_re, b_re); - u_im = _mm256_sub_pd(a_im, b_im); - - tu1_re = _mm256_unpacklo_pd(t_re, u_re); - tu2_re = _mm256_unpackhi_pd(t_re, u_re); - tu1_im = _mm256_unpacklo_pd(t_im, u_im); - tu2_im = _mm256_unpackhi_pd(t_im, u_im); - _mm256_storeu_pd(&f[(u << 1)].v, - _mm256_permute2f128_pd(tu1_re, 
tu2_re, 0x20)); - _mm256_storeu_pd(&f[(u << 1) + 4].v, - _mm256_permute2f128_pd(tu1_re, tu2_re, 0x31)); - _mm256_storeu_pd(&f[(u << 1) + hn].v, - _mm256_permute2f128_pd(tu1_im, tu2_im, 0x20)); - _mm256_storeu_pd(&f[(u << 1) + 4 + hn].v, - _mm256_permute2f128_pd(tu1_im, tu2_im, 0x31)); - } - } else { - f[0] = f0[0]; - f[hn] = f1[0]; - - for (u = 0; u < qn; u ++) { - fpr a_re, a_im, b_re, b_im; - fpr t_re, t_im; - - a_re = f0[u]; - a_im = f0[u + qn]; - FPC_MUL(b_re, b_im, f1[u], f1[u + qn], - fpr_gm_tab[((u + hn) << 1) + 0], - fpr_gm_tab[((u + hn) << 1) + 1]); - FPC_ADD(t_re, t_im, a_re, a_im, b_re, b_im); - f[(u << 1) + 0] = t_re; - f[(u << 1) + 0 + hn] = t_im; - FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im); - f[(u << 1) + 1] = t_re; - f[(u << 1) + 1 + hn] = t_im; - } - } -#else // yyyAVX2+0 - /* - * An extra copy to handle the special case logn = 1. - */ - f[0] = f0[0]; - f[hn] = f1[0]; - - for (u = 0; u < qn; u ++) { - fpr a_re, a_im, b_re, b_im; - fpr t_re, t_im; - - a_re = f0[u]; - a_im = f0[u + qn]; - FPC_MUL(b_re, b_im, f1[u], f1[u + qn], - fpr_gm_tab[((u + hn) << 1) + 0], - fpr_gm_tab[((u + hn) << 1) + 1]); - FPC_ADD(t_re, t_im, a_re, a_im, b_re, b_im); - f[(u << 1) + 0] = t_re; - f[(u << 1) + 0 + hn] = t_im; - FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im); - f[(u << 1) + 1] = t_re; - f[(u << 1) + 1 + hn] = t_im; - } -#endif // yyyAVX2- -} diff --git a/crypto_sign/falcon-1024/m4-ct/fpr.c b/crypto_sign/falcon-1024/m4-ct/fpr.c deleted file mode 100644 index eb23a44b..00000000 --- a/crypto_sign/falcon-1024/m4-ct/fpr.c +++ /dev/null @@ -1,3460 +0,0 @@ -/* - * Floating-point operations. - * - * This file implements the non-inline functions declared in - * fpr.h, as well as the constants for FFT / iFFT. 
- * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include "inner.h" - -#if FALCON_FPEMU // yyyFPEMU+1 - -/* - * Normalize a provided unsigned integer to the 2^63..2^64-1 range by - * left-shifting it if necessary. The exponent e is adjusted accordingly - * (i.e. if the value was left-shifted by n bits, then n is subtracted - * from e). If source m is 0, then it remains 0, but e is altered. - * Both m and e must be simple variables (no expressions allowed). 
- */ -#define FPR_NORM64(m, e) do { \ - uint32_t nt; \ - \ - (e) -= 63; \ - \ - nt = (uint32_t)((m) >> 32); \ - nt = (nt | -nt) >> 31; \ - (m) ^= ((m) ^ ((m) << 32)) & ((uint64_t)nt - 1); \ - (e) += (int)(nt << 5); \ - \ - nt = (uint32_t)((m) >> 48); \ - nt = (nt | -nt) >> 31; \ - (m) ^= ((m) ^ ((m) << 16)) & ((uint64_t)nt - 1); \ - (e) += (int)(nt << 4); \ - \ - nt = (uint32_t)((m) >> 56); \ - nt = (nt | -nt) >> 31; \ - (m) ^= ((m) ^ ((m) << 8)) & ((uint64_t)nt - 1); \ - (e) += (int)(nt << 3); \ - \ - nt = (uint32_t)((m) >> 60); \ - nt = (nt | -nt) >> 31; \ - (m) ^= ((m) ^ ((m) << 4)) & ((uint64_t)nt - 1); \ - (e) += (int)(nt << 2); \ - \ - nt = (uint32_t)((m) >> 62); \ - nt = (nt | -nt) >> 31; \ - (m) ^= ((m) ^ ((m) << 2)) & ((uint64_t)nt - 1); \ - (e) += (int)(nt << 1); \ - \ - nt = (uint32_t)((m) >> 63); \ - (m) ^= ((m) ^ ((m) << 1)) & ((uint64_t)nt - 1); \ - (e) += (int)(nt); \ - } while (0) - -#if FALCON_ASM_CORTEXM4 // yyyASM_CORTEXM4+1 - -__attribute__((naked)) -fpr -fpr_scaled(int64_t i __attribute__((unused)), int sc __attribute__((unused))) -{ - __asm__ ( - "push { r4, r5, r6, lr }\n\t" - "\n\t" - "@ Input i is in r0:r1, and sc in r2.\n\t" - "@ Extract the sign bit, and compute the absolute value.\n\t" - "@ -> sign bit in r3, with value 0 or -1\n\t" - "asrs r3, r1, #31\n\t" - "eors r0, r3\n\t" - "eors r1, r3\n\t" - "subs r0, r3\n\t" - "sbcs r1, r3\n\t" - "\n\t" - "@ Scale exponent to account for the encoding; if the source is\n\t" - "@ zero or if the scaled exponent is negative, it is set to 32.\n\t" - "addw r2, r2, #1022\n\t" - "orrs r4, r0, r1\n\t" - "bics r4, r4, r2, asr #31\n\t" - "rsbs r5, r4, #0\n\t" - "orrs r4, r5\n\t" - "ands r2, r2, r4, asr #31\n\t" - "adds r2, #32\n\t" - "\n\t" - "@ Normalize value to a full 64-bit width, by shifting it left.\n\t" - "@ The shift count is subtracted from the exponent (in r2).\n\t" - "@ If the mantissa is 0, the exponent is set to 0.\n\t" - "\n\t" - "@ If top word is 0, replace with low word; otherwise, add 32 
to\n\t" - "@ the exponent.\n\t" - "rsbs r4, r1, #0\n\t" - "orrs r4, r1\n\t" - "eors r5, r0, r1\n\t" - "bics r5, r5, r4, asr #31\n\t" - "eors r1, r5\n\t" - "ands r0, r0, r4, asr #31\n\t" - "lsrs r4, r4, #31\n\t" - "adds r2, r2, r4, lsl #5\n\t" - "\n\t" - "@ Count leading zeros of r1 to finish the shift.\n\t" - "clz r4, r1\n\t" - "subs r2, r4\n\t" - "rsbs r5, r4, #32\n\t" - "lsls r1, r4\n\t" - "lsrs r5, r0, r5\n\t" - "lsls r0, r4\n\t" - "orrs r1, r5\n\t" - "\n\t" - "@ Clear the top bit; we know it's a 1 (unless the whole mantissa\n\t" - "@ was zero, but then it's still OK to clear it)\n\t" - "bfc r1, #31, #1\n\t" - "\n\t" - "@ Now shift right the value by 11 bits; this puts the value in\n\t" - "@ the 2^52..2^53-1 range. We also keep a copy of the pre-shift\n\t" - "@ low bits in r5.\n\t" - "movs r5, r0\n\t" - "lsrs r0, #11\n\t" - "orrs r0, r0, r1, lsl #21\n\t" - "lsrs r1, #11\n\t" - "\n\t" - "@ Also plug the exponent at the right place. This must be done\n\t" - "@ now so that, in case the rounding creates a carry, that carry\n\t" - "@ adds to the exponent, which would be exactly what we want at\n\t" - "@ that point.\n\t" - "orrs r1, r1, r2, lsl #20\n\t" - "\n\t" - "@ Rounding: we must add 1 to the mantissa in the following cases:\n\t" - "@ - bits 11 to 9 of r5 are '011', '110' or '111'\n\t" - "@ - bits 11 to 9 of r5 are '010' and one of the\n\t" - "@ bits 0 to 8 is non-zero\n\t" - "ubfx r6, r5, #0, #9\n\t" - "addw r6, r6, #511\n\t" - "orrs r5, r6\n\t" - "\n\t" - "ubfx r5, r5, #9, #3\n\t" - "movs r6, #0xC8\n\t" - "lsrs r6, r5\n\t" - "ands r6, #1\n\t" - "adds r0, r6\n\t" - "adcs r1, #0\n\t" - "\n\t" - "@ Put back the sign.\n\t" - "orrs r1, r1, r3, lsl #31\n\t" - "\n\t" - "pop { r4, r5, r6, pc}\n\t" - ); -} - -#else // yyyASM_CORTEXM4+0 - -fpr -fpr_scaled(int64_t i, int sc) -{ - /* - * To convert from int to float, we have to do the following: - * 1. Get the absolute value of the input, and its sign - * 2. Shift right or left the value as appropriate - * 3. 
Pack the result - * - * We can assume that the source integer is not -2^63. - */ - int s, e; - uint32_t t; - uint64_t m; - - /* - * Extract sign bit. - * We have: -i = 1 + ~i - */ - s = (int)((uint64_t)i >> 63); - i ^= -(int64_t)s; - i += s; - - /* - * For now we suppose that i != 0. - * Otherwise, we set m to i and left-shift it as much as needed - * to get a 1 in the top bit. We can do that in a logarithmic - * number of conditional shifts. - */ - m = (uint64_t)i; - e = 9 + sc; - FPR_NORM64(m, e); - - /* - * Now m is in the 2^63..2^64-1 range. We must divide it by 512; - * if one of the dropped bits is a 1, this should go into the - * "sticky bit". - */ - m |= ((uint32_t)m & 0x1FF) + 0x1FF; - m >>= 9; - - /* - * Corrective action: if i = 0 then all of the above was - * incorrect, and we clamp e and m down to zero. - */ - t = (uint32_t)((uint64_t)(i | -i) >> 63); - m &= -(uint64_t)t; - e &= -(int)t; - - /* - * Assemble back everything. The FPR() function will handle cases - * where e is too low. - */ - return FPR(s, e, m); -} - -#endif // yyyASM_CORTEXM4- - -#if FALCON_ASM_CORTEXM4 // yyyASM_CORTEXM4+1 - -// yyyPQCLEAN+0 -#if 0 -/* Debug code -- To get a printout of registers from a specific point - in ARM Cortex M4 assembly code, uncomment this code and add a - "bl DEBUG" call where wished for. 
*/ - -void -print_regs(uint32_t *rr, uint32_t flags) -{ - int i; - extern int printf(const char *fmt, ...); - - printf("\nRegs:\n"); - for (i = 0; i < 7; i ++) { - int j; - - j = i + 7; - printf(" %2d = %08X %2d = %08X\n", i, rr[i], j, rr[j]); - } - printf(" flags = %08X ", flags); - if ((flags >> 31) & 1) { - printf("N"); - } - if ((flags >> 30) & 1) { - printf("Z"); - } - if ((flags >> 29) & 1) { - printf("C"); - } - if ((flags >> 28) & 1) { - printf("V"); - } - if ((flags >> 27) & 1) { - printf("Q"); - } - printf("\n"); -} - -__attribute__((naked)) -void -DEBUG(void) -{ - __asm__ ( - "push { r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, lr }\n\t" - "mov r0, sp\n\t" - "mrs r1, apsr\n\t" - "bl print_regs\n\t" - "pop { r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, pc }\n\t" - ); -} -#endif -// yyyPQCLEAN- - -__attribute__((naked)) -fpr -fpr_add(fpr x __attribute__((unused)), fpr y __attribute__((unused))) -{ - __asm__ ( - "push { r4, r5, r6, r7, r8, r10, r11, lr }\n\t" - "\n\t" - "@ Make sure that the first operand (x) has the larger absolute\n\t" - "@ value. 
This guarantees that the exponent of y is less than\n\t" - "@ or equal to the exponent of x, and, if they are equal, then\n\t" - "@ the mantissa of y will not be greater than the mantissa of x.\n\t" - "@ However, if absolute values are equal and the sign of x is 1,\n\t" - "@ then we want to also swap the values.\n\t" - "ubfx r4, r1, #0, #31 @ top word without sign bit\n\t" - "ubfx r5, r3, #0, #31 @ top word without sign bit\n\t" - "subs r7, r0, r2 @ difference in r7:r4\n\t" - "sbcs r4, r5\n\t" - "orrs r7, r4\n\t" - "rsbs r5, r7, #0\n\t" - "orrs r7, r5 @ bit 31 of r7 is 0 iff difference is zero\n\t" - "bics r6, r1, r7\n\t" - "orrs r6, r4 @ bit 31 of r6 is 1 iff the swap must be done\n\t" - "\n\t" - "@ Conditional swap\n\t" - "eors r4, r0, r2\n\t" - "eors r5, r1, r3\n\t" - "ands r4, r4, r6, asr #31\n\t" - "ands r5, r5, r6, asr #31\n\t" - "eors r0, r4\n\t" - "eors r1, r5\n\t" - "eors r2, r4\n\t" - "eors r3, r5\n\t" - "\n\t" - "@ Extract mantissa of x into r0:r1, exponent in r4, sign in r5\n\t" - "ubfx r4, r1, #20, #11 @ Exponent in r4 (without sign)\n\t" - "addw r5, r4, #2047 @ Get a carry to test r4 for zero\n\t" - "lsrs r5, #11 @ r5 is the mantissa implicit high bit\n\t" - "bfc r1, #20, #11 @ Clear exponent bits (not the sign)\n\t" - "orrs r1, r1, r5, lsl #20 @ Set mantissa high bit\n\t" - "asrs r5, r1, #31 @ Get sign bit (sign-extended)\n\t" - "bfc r1, #31, #1 @ Clear the sign bit\n\t" - "\n\t" - "@ Extract mantissa of y into r2:r3, exponent in r6, sign in r7\n\t" - "ubfx r6, r3, #20, #11 @ Exponent in r6 (without sign)\n\t" - "addw r7, r6, #2047 @ Get a carry to test r6 for zero\n\t" - "lsrs r7, #11 @ r7 is the mantissa implicit high bit\n\t" - "bfc r3, #20, #11 @ Clear exponent bits (not the sign)\n\t" - "orrs r3, r3, r7, lsl #20 @ Set mantissa high bit\n\t" - "asrs r7, r3, #31 @ Get sign bit (sign-extended)\n\t" - "bfc r3, #31, #1 @ Clear the sign bit\n\t" - "\n\t" - "@ Scale mantissas up by three bits.\n\t" - "lsls r1, #3\n\t" - "orrs r1, r1, r0, lsr #29\n\t" - 
"lsls r0, #3\n\t" - "lsls r3, #3\n\t" - "orrs r3, r3, r2, lsr #29\n\t" - "lsls r2, #3\n\t" - "\n\t" - "@ x: exponent=r4, sign=r5, mantissa=r0:r1 (scaled up 3 bits)\n\t" - "@ y: exponent=r6, sign=r7, mantissa=r2:r3 (scaled up 3 bits)\n\t" - "\n\t" - "@ At that point, the exponent of x (in r4) is larger than that\n\t" - "@ of y (in r6). The difference is the amount of shifting that\n\t" - "@ should be done on y. If that amount is larger than 59 then\n\t" - "@ we clamp y to 0. We won't need y's exponent beyond that point,\n\t" - "@ so we store that shift count in r6.\n\t" - "subs r6, r4, r6\n\t" - "subs r8, r6, #60\n\t" - "ands r2, r2, r8, asr #31\n\t" - "ands r3, r3, r8, asr #31\n\t" - "\n\t" - "@ Shift right r2:r3 by r6 bits. The shift count is in the 0..59\n\t" - "@ range. r11 will be non-zero if and only if some non-zero bits\n\t" - "@ were dropped.\n\t" - "subs r8, r6, #32\n\t" - "bics r11, r2, r8, asr #31\n\t" - "ands r2, r2, r8, asr #31\n\t" - "bics r10, r3, r8, asr #31\n\t" - "orrs r2, r2, r10\n\t" - "ands r3, r3, r8, asr #31\n\t" - "ands r6, r6, #31\n\t" - "rsbs r8, r6, #32\n\t" - "lsls r10, r2, r8\n\t" - "orrs r11, r11, r10\n\t" - "lsrs r2, r2, r6\n\t" - "lsls r10, r3, r8\n\t" - "orrs r2, r2, r10\n\t" - "lsrs r3, r3, r6\n\t" - "\n\t" - "@ If r11 is non-zero then some non-zero bit was dropped and the\n\t" - "@ low bit of r2 must be forced to 1 ('sticky bit').\n\t" - "rsbs r6, r11, #0\n\t" - "orrs r6, r6, r11\n\t" - "orrs r2, r2, r6, lsr #31\n\t" - "\n\t" - "@ x: exponent=r4, sign=r5, mantissa=r0:r1 (scaled up 3 bits)\n\t" - "@ y: sign=r7, value=r2:r3 (scaled to same exponent as x)\n\t" - "\n\t" - "@ If x and y don't have the same sign, then we should negate r2:r3\n\t" - "@ (i.e. subtract the mantissa instead of adding it). Signs of x\n\t" - "@ and y are in r5 and r7, as full-width words. 
We won't need r7\n\t" - "@ afterwards.\n\t" - "eors r7, r5 @ r7 = -1 if y must be negated, 0 otherwise\n\t" - "eors r2, r7\n\t" - "eors r3, r7\n\t" - "subs r2, r7\n\t" - "sbcs r3, r7\n\t" - "\n\t" - "@ r2:r3 has been shifted, we can add to r0:r1.\n\t" - "adds r0, r2\n\t" - "adcs r1, r3\n\t" - "\n\t" - "@ result: exponent=r4, sign=r5, mantissa=r0:r1 (scaled up 3 bits)\n\t" - "\n\t" - "@ Normalize the result with some left-shifting to full 64-bit\n\t" - "@ width. Shift count goes to r2, and exponent (r4) is adjusted.\n\t" - "clz r2, r0\n\t" - "clz r3, r1\n\t" - "sbfx r6, r3, #5, #1\n\t" - "ands r2, r6\n\t" - "adds r2, r2, r3\n\t" - "subs r4, r4, r2\n\t" - "\n\t" - "@ Shift r0:r1 to the left by r2 bits.\n\t" - "subs r7, r2, #32\n\t" - "lsls r7, r0, r7\n\t" - "lsls r1, r1, r2\n\t" - "rsbs r6, r2, #32\n\t" - "orrs r1, r1, r7\n\t" - "lsrs r6, r0, r6\n\t" - "orrs r1, r1, r6\n\t" - "lsls r0, r0, r2\n\t" - "\n\t" - "@ The exponent of x was in r4. The left-shift operation has\n\t" - "@ subtracted some value from it, 8 in case the result has the\n\t" - "@ same exponent as x. However, the high bit of the mantissa will\n\t" - "@ add 1 to the exponent, so we only add back 7 (the exponent is\n\t" - "@ added in because rounding might have produced a carry, which\n\t" - "@ should then spill into the exponent).\n\t" - "adds r4, #7\n\t" - "\n\t" - "@ If the mantissa new mantissa is non-zero, then its bit 63 is\n\t" - "@ non-zero (thanks to the normalizing shift). Otherwise, that bit\n\t" - "@ is zero, and we should then set the exponent to zero as well.\n\t" - "ands r4, r4, r1, asr #31\n\t" - "\n\t" - "@ Shrink back the value to a 52-bit mantissa. 
This requires\n\t" - "@ right-shifting by 11 bits; we keep a copy of the pre-shift\n\t" - "@ low word in r3.\n\t" - "movs r3, r0\n\t" - "lsrs r0, #11\n\t" - "orrs r0, r0, r1, lsl #21\n\t" - "lsrs r1, #11\n\t" - "\n\t" - "@ Apply rounding.\n\t" - "ubfx r6, r3, #0, #9\n\t" - "addw r6, r6, #511\n\t" - "orrs r3, r6\n\t" - "ubfx r3, r3, #9, #3\n\t" - "movs r6, #0xC8\n\t" - "lsrs r6, r3\n\t" - "ands r6, #1\n\t" - "adds r0, r6\n\t" - "adcs r1, #0\n\t" - "\n\t" - "@Plug in the exponent with an addition.\n\t" - "adds r1, r1, r4, lsl #20\n\t" - "\n\t" - "@ If the new exponent is negative or zero, then it underflowed\n\t" - "@ and we must clear the whole mantissa and exponent.\n\t" - "rsbs r4, r4, #0\n\t" - "ands r0, r0, r4, asr #31\n\t" - "ands r1, r1, r4, asr #31\n\t" - "\n\t" - "@ Put back the sign. This is the sign of x: thanks to the\n\t" - "@ conditional swap at the start, this is always correct.\n\t" - "bfi r1, r5, #31, #1\n\t" - "\n\t" - "pop { r4, r5, r6, r7, r8, r10, r11, pc }\n\t" - ); -} - -#else // yyyASM_CORTEXM4+0 - -fpr -fpr_add(fpr x, fpr y) -{ - uint64_t m, xu, yu, za; - uint32_t cs; - int ex, ey, sx, sy, cc; - - /* - * Make sure that the first operand (x) has the larger absolute - * value. This guarantees that the exponent of y is less than - * or equal to the exponent of x, and, if they are equal, then - * the mantissa of y will not be greater than the mantissa of x. - * - * After this swap, the result will have the sign x, except in - * the following edge case: abs(x) = abs(y), and x and y have - * opposite sign bits; in that case, the result shall be +0 - * even if the sign bit of x is 1. To handle this case properly, - * we do the swap is abs(x) = abs(y) AND the sign of x is 1. - */ - m = ((uint64_t)1 << 63) - 1; - za = (x & m) - (y & m); - cs = (uint32_t)(za >> 63) - | ((1U - (uint32_t)(-za >> 63)) & (uint32_t)(x >> 63)); - m = (x ^ y) & -(uint64_t)cs; - x ^= m; - y ^= m; - - /* - * Extract sign bits, exponents and mantissas. 
The mantissas are - * scaled up to 2^55..2^56-1, and the exponent is unbiased. If - * an operand is zero, its mantissa is set to 0 at this step, and - * its exponent will be -1078. - */ - ex = (int)(x >> 52); - sx = ex >> 11; - ex &= 0x7FF; - m = (uint64_t)(uint32_t)((ex + 0x7FF) >> 11) << 52; - xu = ((x & (((uint64_t)1 << 52) - 1)) | m) << 3; - ex -= 1078; - ey = (int)(y >> 52); - sy = ey >> 11; - ey &= 0x7FF; - m = (uint64_t)(uint32_t)((ey + 0x7FF) >> 11) << 52; - yu = ((y & (((uint64_t)1 << 52) - 1)) | m) << 3; - ey -= 1078; - - /* - * x has the larger exponent; hence, we only need to right-shift y. - * If the shift count is larger than 59 bits then we clamp the - * value to zero. - */ - cc = ex - ey; - yu &= -(uint64_t)((uint32_t)(cc - 60) >> 31); - cc &= 63; - - /* - * The lowest bit of yu is "sticky". - */ - m = fpr_ulsh(1, cc) - 1; - yu |= (yu & m) + m; - yu = fpr_ursh(yu, cc); - - /* - * If the operands have the same sign, then we add the mantissas; - * otherwise, we subtract the mantissas. - */ - xu += yu - ((yu << 1) & -(uint64_t)(sx ^ sy)); - - /* - * The result may be smaller, or slightly larger. We normalize - * it to the 2^63..2^64-1 range (if xu is zero, then it stays - * at zero). - */ - FPR_NORM64(xu, ex); - - /* - * Scale down the value to 2^54..s^55-1, handling the last bit - * as sticky. - */ - xu |= ((uint32_t)xu & 0x1FF) + 0x1FF; - xu >>= 9; - ex += 9; - - /* - * In general, the result has the sign of x. However, if the - * result is exactly zero, then the following situations may - * be encountered: - * x > 0, y = -x -> result should be +0 - * x < 0, y = -x -> result should be +0 - * x = +0, y = +0 -> result should be +0 - * x = -0, y = +0 -> result should be +0 - * x = +0, y = -0 -> result should be +0 - * x = -0, y = -0 -> result should be -0 - * - * But at the conditional swap step at the start of the - * function, we ensured that if abs(x) = abs(y) and the - * sign of x was 1, then x and y were swapped. 
Thus, the - * two following cases cannot actually happen: - * x < 0, y = -x - * x = -0, y = +0 - * In all other cases, the sign bit of x is conserved, which - * is what the FPR() function does. The FPR() function also - * properly clamps values to zero when the exponent is too - * low, but does not alter the sign in that case. - */ - return FPR(sx, ex, xu); -} - -#endif // yyyASM_CORTEXM4- - -#if FALCON_ASM_CORTEXM4 // yyyASM_CORTEXM4+1 - -__attribute__((naked)) -fpr -fpr_mul(fpr x __attribute__((unused)), fpr y __attribute__((unused))) -{ - __asm__ ( - "push { r4, r5, r6, r7, r8, r10, r11, lr }\n\t" - "\n\t" - "@ Extract mantissas: x.m = r4:r5, y.m = r6:r7\n\t" - "@ r4 and r6 contain only 25 bits each.\n\t" - "bics r4, r0, #0xFE000000\n\t" - "lsls r5, r1, #7\n\t" - "orrs r5, r5, r0, lsr #25\n\t" - "orrs r5, r5, #0x08000000\n\t" - "bics r5, r5, #0xF0000000\n\t" - "bics r6, r2, #0xFE000000\n\t" - "lsls r7, r3, #7\n\t" - "orrs r7, r7, r2, lsr #25\n\t" - "orrs r7, r7, #0x08000000\n\t" - "bics r7, r7, #0xF0000000\n\t" - "\n\t" - "@ Perform product. Values are in the 2^52..2^53-1 range, so\n\t" - "@ the product is at most 106-bit long. Of the low 50 bits,\n\t" - "@ we only want to know if they are all zeros or not. 
Here,\n\t" - "@ we get the top 56 bits in r10:r11, and r8 will be non-zero\n\t" - "@ if and only if at least one of the low 50 bits is non-zero.\n\t" - "umull r8, r10, r4, r6 @ x0*y0\n\t" - "lsls r10, #7\n\t" - "orrs r10, r10, r8, lsr #25\n\t" - "eors r11, r11\n\t" - "umlal r10, r11, r4, r7 @ x0*y1\n\t" - "umlal r10, r11, r5, r6 @ x1*y0\n\t" - "orrs r8, r8, r10, lsl #7\n\t" - "lsrs r10, #25\n\t" - "orrs r10, r10, r11, lsl #7\n\t" - "eors r11, r11\n\t" - "umlal r10, r11, r5, r7 @ x1*y1\n\t" - "\n\t" - "@ Now r0, r2, r4, r5, r6 and r7 are free.\n\t" - "@ If any of the low 50 bits was non-zero, then we force the\n\t" - "@ low bit of r10 to 1.\n\t" - "rsbs r4, r8, #0\n\t" - "orrs r8, r8, r4\n\t" - "orrs r10, r10, r8, lsr #31\n\t" - "\n\t" - "@ r8 is free.\n\t" - "@ r10:r11 contains the product in the 2^54..2^56-1 range. We\n\t" - "@ normalize it to 2^54..2^55-1 (into r6:r7) with a conditional\n\t" - "@ shift (low bit is sticky). r5 contains -1 if the shift was done,\n\t" - "@ 0 otherwise.\n\t" - "ands r6, r10, #1\n\t" - "lsrs r5, r11, #23\n\t" - "rsbs r5, r5, #0\n\t" - "orrs r6, r6, r10, lsr #1\n\t" - "orrs r6, r6, r11, lsl #31\n\t" - "lsrs r7, r11, #1\n\t" - "eors r10, r10, r6\n\t" - "eors r11, r11, r7\n\t" - "bics r10, r10, r5\n\t" - "bics r11, r11, r5\n\t" - "eors r6, r6, r10\n\t" - "eors r7, r7, r11\n\t" - "\n\t" - "@ Compute aggregate exponent: ex + ey - 1023 + w\n\t" - "@ (where w = 1 if the conditional shift was done, 0 otherwise)\n\t" - "@ But we subtract 1 because the injection of the mantissa high\n\t" - "@ bit will increment the exponent by 1.\n\t" - "lsls r0, r1, #1\n\t" - "lsls r2, r3, #1\n\t" - "lsrs r0, #21\n\t" - "addw r4, r0, #0x7FF @ save ex + 2047 in r4\n\t" - "lsrs r2, #21\n\t" - "addw r8, r2, #0x7FF @ save ey + 2047 in r8\n\t" - "adds r2, r0\n\t" - "subw r2, r2, #1024\n\t" - "subs r2, r5\n\t" - "\n\t" - "@ r5 is free.\n\t" - "@ Also, if either of the source exponents is 0, or the result\n\t" - "@ exponent is 0 or negative, then the result is zero 
and the\n\t" - "@ mantissa and the exponent shall be clamped to zero. Since\n\t" - "@ r2 contains the result exponent minus 1, we test on r2\n\t" - "@ being strictly negative.\n\t" - "ands r4, r8 @ if bit 11 = 0 then one of the exponents was 0\n\t" - "mvns r5, r2\n\t" - "ands r5, r5, r4, lsl #20\n\t" - "ands r2, r2, r5, asr #31\n\t" - "ands r6, r6, r5, asr #31\n\t" - "ands r7, r7, r5, asr #31\n\t" - "\n\t" - "@ Sign is the XOR of the sign of the operands. This is true in\n\t" - "@ all cases, including very small results (exponent underflow)\n\t" - "@ and zeros.\n\t" - "eors r1, r3\n\t" - "bfc r1, #0, #31\n\t" - "\n\t" - "@ Plug in the exponent.\n\t" - "bfi r1, r2, #20, #11\n\t" - "\n\t" - "@ r2 and r3 are free.\n\t" - "@ Shift back to the normal 53-bit mantissa, with rounding.\n\t" - "@ Mantissa goes into r0:r1. For r1, we must use an addition\n\t" - "@ because the rounding may have triggered a carry, that should\n\t" - "@ be added to the exponent.\n\t" - "movs r4, r6\n\t" - "lsrs r0, r6, #2\n\t" - "orrs r0, r0, r7, lsl #30\n\t" - "adds r1, r1, r7, lsr #2\n\t" - "ands r4, #0x7\n\t" - "movs r3, #0xC8\n\t" - "lsrs r3, r4\n\t" - "ands r3, #1\n\t" - "adds r0, r3\n\t" - "adcs r1, #0\n\t" - "\n\t" - "pop { r4, r5, r6, r7, r8, r10, r11, pc }\n\t" - ); -} - -#else // yyyASM_CORTEXM4+0 - -fpr -fpr_mul(fpr x, fpr y) -{ - uint64_t xu, yu, w, zu, zv; - uint32_t x0, x1, y0, y1, z0, z1, z2; - int ex, ey, d, e, s; - - /* - * Extract absolute values as scaled unsigned integers. We - * don't extract exponents yet. - */ - xu = (x & (((uint64_t)1 << 52) - 1)) | ((uint64_t)1 << 52); - yu = (y & (((uint64_t)1 << 52) - 1)) | ((uint64_t)1 << 52); - - /* - * We have two 53-bit integers to multiply; we need to split - * each into a lower half and a upper half. Moreover, we - * prefer to have lower halves to be of 25 bits each, for - * reasons explained later on. 
- */ - x0 = (uint32_t)xu & 0x01FFFFFF; - x1 = (uint32_t)(xu >> 25); - y0 = (uint32_t)yu & 0x01FFFFFF; - y1 = (uint32_t)(yu >> 25); - w = (uint64_t)x0 * (uint64_t)y0; - z0 = (uint32_t)w & 0x01FFFFFF; - z1 = (uint32_t)(w >> 25); - w = (uint64_t)x0 * (uint64_t)y1; - z1 += (uint32_t)w & 0x01FFFFFF; - z2 = (uint32_t)(w >> 25); - w = (uint64_t)x1 * (uint64_t)y0; - z1 += (uint32_t)w & 0x01FFFFFF; - z2 += (uint32_t)(w >> 25); - zu = (uint64_t)x1 * (uint64_t)y1; - z2 += (z1 >> 25); - z1 &= 0x01FFFFFF; - zu += z2; - - /* - * Since xu and yu are both in the 2^52..2^53-1 range, the - * product is in the 2^104..2^106-1 range. We first reassemble - * it and round it into the 2^54..2^56-1 range; the bottom bit - * is made "sticky". Since the low limbs z0 and z1 are 25 bits - * each, we just take the upper part (zu), and consider z0 and - * z1 only for purposes of stickiness. - * (This is the reason why we chose 25-bit limbs above.) - */ - zu |= ((z0 | z1) + 0x01FFFFFF) >> 25; - - /* - * We normalize zu to the 2^54..s^55-1 range: it could be one - * bit too large at this point. This is done with a conditional - * right-shift that takes into account the sticky bit. - */ - zv = (zu >> 1) | (zu & 1); - w = zu >> 55; - zu ^= (zu ^ zv) & -w; - - /* - * Get the aggregate scaling factor: - * - * - Each exponent is biased by 1023. - * - * - Integral mantissas are scaled by 2^52, hence an - * extra 52 bias for each exponent. - * - * - However, we right-shifted z by 50 bits, and then - * by 0 or 1 extra bit (depending on the value of w). - * - * In total, we must add the exponents, then subtract - * 2 * (1023 + 52), then add 50 + w. - */ - ex = (int)((x >> 52) & 0x7FF); - ey = (int)((y >> 52) & 0x7FF); - e = ex + ey - 2100 + (int)w; - - /* - * Sign bit is the XOR of the operand sign bits. - */ - s = (int)((x ^ y) >> 63); - - /* - * Corrective actions for zeros: if either of the operands is - * zero, then the computations above were wrong. Test for zero - * is whether ex or ey is zero. 
We just have to set the mantissa - * (zu) to zero, the FPR() function will normalize e. - */ - d = ((ex + 0x7FF) & (ey + 0x7FF)) >> 11; - zu &= -(uint64_t)d; - - /* - * FPR() packs the result and applies proper rounding. - */ - return FPR(s, e, zu); -} - -#endif // yyyASM_CORTEXM4- - -#if FALCON_ASM_CORTEXM4 // yyyASM_CORTEXM4+1 - -__attribute__((naked)) -fpr -fpr_div(fpr x __attribute__((unused)), fpr y __attribute__((unused))) -{ - __asm__ ( - "push { r4, r5, r6, r7, r8, r10, r11, lr }\n\t" - - "@ Extract mantissas of x and y, in r0:r4 and r2:r5, respectively.\n\t" - "@ We don't touch r1 and r3 as they contain the exponents and\n\t" - "@ signs, which we'll need later on.\n\t" - "ubfx r4, r1, #0, #20\n\t" - "ubfx r5, r3, #0, #20\n\t" - "orrs r4, r4, #0x00100000\n\t" - "orrs r5, r5, #0x00100000\n\t" - "\n\t" - "@ Perform bit-by-bit division. We want a 56-bit result in r8:r10\n\t" - "@ (low bit is 0). Bits come from the carry flag and are\n\t" - "@ injected with rrx, i.e. in position 31; we thus get bits in\n\t" - "@ the reverse order. 
Bits accumulate in r8; after the first 24\n\t" - "@ bits, we move the quotient bits to r10.\n\t" - "eors r8, r8\n\t" - "\n\t" - -#define DIVSTEP \ - "subs r6, r0, r2\n\t" \ - "sbcs r7, r4, r5\n\t" \ - "rrx r8, r8\n\t" \ - "ands r6, r2, r8, asr #31\n\t" \ - "ands r7, r5, r8, asr #31\n\t" \ - "subs r0, r6\n\t" \ - "sbcs r4, r7\n\t" \ - "adds r0, r0, r0\n\t" \ - "adcs r4, r4, r4\n\t" - -#define DIVSTEP4 DIVSTEP DIVSTEP DIVSTEP DIVSTEP -#define DIVSTEP8 DIVSTEP4 DIVSTEP4 - - DIVSTEP8 - DIVSTEP8 - DIVSTEP8 - - "\n\t" - "@ We have the first 24 bits of the quotient, move them to r10.\n\t" - "rbit r10, r8\n\t" - "\n\t" - - DIVSTEP8 - DIVSTEP8 - DIVSTEP8 - DIVSTEP4 DIVSTEP DIVSTEP DIVSTEP - -#undef DIVSTEP -#undef DIVSTEP4 -#undef DIVSTEP8 - - "\n\t" - "@ Lowest bit will be set if remainder is non-zero at this point\n\t" - "@ (this is the 'sticky' bit).\n\t" - "subs r0, #1\n\t" - "sbcs r4, #0\n\t" - "rrx r8, r8\n\t" - "\n\t" - "@ We now have the next (low) 32 bits of the quotient.\n\t" - "rbit r8, r8\n\t" - "\n\t" - "@ Since both operands had their top bit set, we know that the\n\t" - "@ result at this point is in 2^54..2^56-1. We scale it down\n\t" - "@ to 2^54..2^55-1 with a conditional shift. We also write the\n\t" - "@ result in r4:r5. 
If the shift is done, r6 will contain -1.\n\t" - "ands r4, r8, #1\n\t" - "lsrs r6, r10, #23\n\t" - "rsbs r6, r6, #0\n\t" - "orrs r4, r4, r8, lsr #1\n\t" - "orrs r4, r4, r10, lsl #31\n\t" - "lsrs r5, r10, #1\n\t" - "eors r8, r8, r4\n\t" - "eors r10, r10, r5\n\t" - "bics r8, r8, r6\n\t" - "bics r10, r10, r6\n\t" - "eors r4, r4, r8\n\t" - "eors r5, r5, r10\n\t" - "\n\t" - "@ Compute aggregate exponent: ex - ey + 1022 + w\n\t" - "@ (where w = 1 if the conditional shift was done, 0 otherwise)\n\t" - "@ But we subtract 1 because the injection of the mantissa high\n\t" - "@ bit will increment the exponent by 1.\n\t" - "lsls r0, r1, #1\n\t" - "lsls r2, r3, #1\n\t" - "lsrs r0, r0, #21\n\t" - "addw r7, r0, #0x7FF @ save ex + 2047 in r7\n\t" - "subs r0, r0, r2, lsr #21\n\t" - "addw r0, r0, #1021\n\t" - "subs r0, r6\n\t" - "\n\t" - "@ If the x operand was zero, then the computation was wrong and\n\t" - "@ the result is zero. Also, if the result exponent is zero or\n\t" - "@ negative, then the mantissa shall be clamped to zero. Since r0\n\t" - "@ contains the result exponent minus 1, we test on r0 being\n\t" - "@ strictly negative.\n\t" - "mvns r2, r0\n\t" - "ands r2, r2, r7, lsl #20\n\t" - "ands r0, r0, r2, asr #31\n\t" - "ands r4, r4, r2, asr #31\n\t" - "ands r5, r5, r2, asr #31\n\t" - "\n\t" - "@ Sign is the XOR of the sign of the operands. This is true in\n\t" - "@ all cases, including very small results (exponent underflow)\n\t" - "@ and zeros.\n\t" - "eors r1, r3\n\t" - "bfc r1, #0, #31\n\t" - "\n\t" - "@ Plug in the exponent.\n\t" - "bfi r1, r0, #20, #11\n\t" - "\n\t" - "@ Shift back to the normal 53-bit mantissa, with rounding.\n\t" - "@ Mantissa goes into r0:r1. 
For r1, we must use an addition\n\t" - "@ because the rounding may have triggered a carry, that should\n\t" - "@ be added to the exponent.\n\t" - "movs r6, r4\n\t" - "lsrs r0, r4, #2\n\t" - "orrs r0, r0, r5, lsl #30\n\t" - "adds r1, r1, r5, lsr #2\n\t" - "ands r6, #0x7\n\t" - "movs r3, #0xC8\n\t" - "lsrs r3, r6\n\t" - "ands r3, #1\n\t" - "adds r0, r3\n\t" - "adcs r1, #0\n\t" - "\n\t" - "pop { r4, r5, r6, r7, r8, r10, r11, pc }\n\t" - ); -} - -#else // yyyASM_CORTEXM4+0 - -fpr -fpr_div(fpr x, fpr y) -{ - uint64_t xu, yu, q, q2, w; - int i, ex, ey, e, d, s; - - /* - * Extract mantissas of x and y (unsigned). - */ - xu = (x & (((uint64_t)1 << 52) - 1)) | ((uint64_t)1 << 52); - yu = (y & (((uint64_t)1 << 52) - 1)) | ((uint64_t)1 << 52); - - /* - * Perform bit-by-bit division of xu by yu. We run it for 55 bits. - */ - q = 0; - for (i = 0; i < 55; i ++) { - /* - * If yu is less than or equal xu, then subtract it and - * push a 1 in the quotient; otherwise, leave xu unchanged - * and push a 0. - */ - uint64_t b; - - b = ((xu - yu) >> 63) - 1; - xu -= b & yu; - q |= b & 1; - xu <<= 1; - q <<= 1; - } - - /* - * We got 55 bits in the quotient, followed by an extra zero. We - * want that 56th bit to be "sticky": it should be a 1 if and - * only if the remainder (xu) is non-zero. - */ - q |= (xu | -xu) >> 63; - - /* - * Quotient is at most 2^56-1. Its top bit may be zero, but in - * that case the next-to-top bit will be a one, since the - * initial xu and yu were both in the 2^52..2^53-1 range. - * We perform a conditional shift to normalize q to the - * 2^54..2^55-1 range (with the bottom bit being sticky). - */ - q2 = (q >> 1) | (q & 1); - w = q >> 55; - q ^= (q ^ q2) & -w; - - /* - * Extract exponents to compute the scaling factor: - * - * - Each exponent is biased and we scaled them up by - * 52 bits; but these biases will cancel out. - * - * - The division loop produced a 55-bit shifted result, - * so we must scale it down by 55 bits. 
- * - * - If w = 1, we right-shifted the integer by 1 bit, - * hence we must add 1 to the scaling. - */ - ex = (int)((x >> 52) & 0x7FF); - ey = (int)((y >> 52) & 0x7FF); - e = ex - ey - 55 + (int)w; - - /* - * Sign is the XOR of the signs of the operands. - */ - s = (int)((x ^ y) >> 63); - - /* - * Corrective actions for zeros: if x = 0, then the computation - * is wrong, and we must clamp e and q to 0. We do not care - * about the case y = 0 (as per assumptions in this module, - * the caller does not perform divisions by zero). - */ - d = (ex + 0x7FF) >> 11; - s &= d; - e &= -d; - q &= -(uint64_t)d; - - /* - * FPR() packs the result and applies proper rounding. - */ - return FPR(s, e, q); -} - -#endif // yyyASM_CORTEXM4- - -#if FALCON_ASM_CORTEXM4 // yyyASM_CORTEXM4+1 - -__attribute__((naked)) -fpr -fpr_sqrt(fpr x __attribute__((unused))) -{ - __asm__ ( - "push { r4, r5, r6, r7, r8, r10, r11, lr }\n\t" - "\n\t" - "@ Extract mantissa (r0:r1) and exponent (r2). We assume that the\n\t" - "@ sign is positive. 
If the source is zero, then the mantissa is\n\t" - "@ set to 0.\n\t" - "lsrs r2, r1, #20\n\t" - "bfc r1, #20, #12\n\t" - "addw r3, r2, #0x7FF\n\t" - "subw r2, r2, #1023\n\t" - "lsrs r3, r3, #11\n\t" - "orrs r1, r1, r3, lsl #20\n\t" - "\n\t" - "@ If the exponent is odd, then multiply mantissa by 2 and subtract\n\t" - "@ 1 from the exponent.\n\t" - "ands r3, r2, #1\n\t" - "subs r2, r2, r3\n\t" - "rsbs r3, r3, #0\n\t" - "ands r4, r1, r3\n\t" - "ands r3, r0\n\t" - "adds r0, r3\n\t" - "adcs r1, r4\n\t" - "\n\t" - "@ Left-shift the mantissa by 9 bits to put it in the\n\t" - "@ 2^61..2^63-1 range (unless it is exactly 0).\n\t" - "lsls r1, r1, #9\n\t" - "orrs r1, r1, r0, lsr #23\n\t" - "lsls r0, r0, #9\n\t" - "\n\t" - "@ Compute the square root bit-by-bit.\n\t" - "@ There are 54 iterations; first 30 can work on top word only.\n\t" - "@ q = r3 (bit-reversed)\n\t" - "@ s = r5\n\t" - "eors r3, r3\n\t" - "eors r5, r5\n\t" - -#define SQRT_STEP_HI(bit) \ - "orrs r6, r5, #(1 << (" #bit "))\n\t" \ - "subs r7, r1, r6\n\t" \ - "rrx r3, r3\n\t" \ - "ands r6, r6, r3, asr #31\n\t" \ - "subs r1, r1, r6\n\t" \ - "lsrs r6, r3, #31\n\t" \ - "orrs r5, r5, r6, lsl #((" #bit ") + 1)\n\t" \ - "adds r0, r0\n\t" \ - "adcs r1, r1\n\t" - -#define SQRT_STEP_HIx5(b) \ - SQRT_STEP_HI((b)+4) \ - SQRT_STEP_HI((b)+3) \ - SQRT_STEP_HI((b)+2) \ - SQRT_STEP_HI((b)+1) \ - SQRT_STEP_HI(b) - - SQRT_STEP_HIx5(25) - SQRT_STEP_HIx5(20) - SQRT_STEP_HIx5(15) - SQRT_STEP_HIx5(10) - SQRT_STEP_HIx5(5) - SQRT_STEP_HIx5(0) - -#undef SQRT_STEP_HI -#undef SQRT_STEP_HIx5 - - "@ Top 30 bits of the result must be reversed: they were\n\t" - "@ accumulated with rrx (hence from the top bit).\n\t" - "rbit r3, r3\n\t" - "\n\t" - "@ For the next 24 iterations, we must use two-word operations.\n\t" - "@ bits of q now accumulate in r4\n\t" - "@ s is in r6:r5\n\t" - "eors r4, r4\n\t" - "eors r6, r6\n\t" - "\n\t" - "@ First iteration is special because the potential bit goes into\n\t" - "@ r5, not r6.\n\t" - "orrs r7, r6, #(1 << 
31)\n\t" - "subs r8, r0, r7\n\t" - "sbcs r10, r1, r5\n\t" - "rrx r4, r4\n\t" - "ands r7, r7, r4, asr #31\n\t" - "ands r8, r5, r4, asr #31\n\t" - "subs r0, r0, r7\n\t" - "sbcs r1, r1, r8\n\t" - "lsrs r7, r4, #31\n\t" - "orrs r5, r5, r4, lsr #31\n\t" - "adds r0, r0\n\t" - "adcs r1, r1\n\t" - -#define SQRT_STEP_LO(bit) \ - "orrs r7, r6, #(1 << (" #bit "))\n\t" \ - "subs r8, r0, r7\n\t" \ - "sbcs r10, r1, r5\n\t" \ - "rrx r4, r4\n\t" \ - "ands r7, r7, r4, asr #31\n\t" \ - "ands r8, r5, r4, asr #31\n\t" \ - "subs r0, r0, r7\n\t" \ - "sbcs r1, r1, r8\n\t" \ - "lsrs r7, r4, #31\n\t" \ - "orrs r6, r6, r7, lsl #((" #bit ") + 1)\n\t" \ - "adds r0, r0\n\t" \ - "adcs r1, r1\n\t" - -#define SQRT_STEP_LOx4(b) \ - SQRT_STEP_LO((b)+3) \ - SQRT_STEP_LO((b)+2) \ - SQRT_STEP_LO((b)+1) \ - SQRT_STEP_LO(b) - - SQRT_STEP_LO(30) - SQRT_STEP_LO(29) - SQRT_STEP_LO(28) - SQRT_STEP_LOx4(24) - SQRT_STEP_LOx4(20) - SQRT_STEP_LOx4(16) - SQRT_STEP_LOx4(12) - SQRT_STEP_LOx4(8) - -#undef SQRT_STEP_LO -#undef SQRT_STEP_LOx4 - - "@ Put low 24 bits in the right order.\n\t" - "rbit r4, r4\n\t" - "\n\t" - "@ We have a 54-bit result; compute the 55-th bit as the 'sticky'\n\t" - "@ bit: it is non-zero if and only if r0:r1 is non-zero. We put the\n\t" - "@ three low bits (including the sticky bit) in r5.\n\t" - "orrs r0, r1\n\t" - "rsbs r1, r0, #0\n\t" - "orrs r0, r1\n\t" - "lsls r5, r4, #1\n\t" - "orrs r5, r5, r0, lsr #31\n\t" - "ands r5, #0x7\n\t" - "\n\t" - "@ Compute the rounding: r6 is set to 0 or 1, and will be added\n\t" - "@ to the mantissa.\n\t" - "movs r6, #0xC8\n\t" - "lsrs r6, r5\n\t" - "ands r6, #1\n\t" - "\n\t" - "@ Put the mantissa (53 bits, in the 2^52..2^53-1 range) in r0:r1\n\t" - "@ (rounding not applied yet).\n\t" - "lsrs r0, r4, #1\n\t" - "orrs r0, r0, r3, lsl #23\n\t" - "lsrs r1, r3, #9\n\t" - "\n\t" - "@ Compute new exponent. This is half the old one (then reencoded\n\t" - "@ by adding 1023). Exception: if the mantissa is zero, then the\n\t" - "@ encoded exponent is set to 0. 
At that point, if the mantissa\n\t" - "@ is non-zero, then its high bit (bit 52, i.e. bit 20 of r1) is\n\t" - "@ non-zero. Note that the exponent cannot go out of range.\n\t" - "lsrs r2, r2, #1\n\t" - "addw r2, r2, #1023\n\t" - "lsrs r5, r1, #20\n\t" - "rsbs r5, r5, #0\n\t" - "ands r2, r5\n\t" - "\n\t" - "@ Place exponent. This overwrites the high bit of the mantissa.\n\t" - "bfi r1, r2, #20, #11\n\t" - "\n\t" - "@ Apply rounding. This may create a carry that will spill into\n\t" - "@ the exponent, which is exactly what should be done in that case\n\t" - "@ (i.e. increment the exponent).\n\t" - "adds r0, r0, r6\n\t" - "adcs r1, r1, #0\n\t" - "\n\t" - "pop { r4, r5, r6, r7, r8, r10, r11, pc }\n\t" - ); -} - -#else // yyyASM_CORTEXM4+0 - -fpr -fpr_sqrt(fpr x) -{ - uint64_t xu, q, s, r; - int ex, e; - - /* - * Extract the mantissa and the exponent. We don't care about - * the sign: by assumption, the operand is nonnegative. - * We want the "true" exponent corresponding to a mantissa - * in the 1..2 range. - */ - xu = (x & (((uint64_t)1 << 52) - 1)) | ((uint64_t)1 << 52); - ex = (int)((x >> 52) & 0x7FF); - e = ex - 1023; - - /* - * If the exponent is odd, double the mantissa and decrement - * the exponent. The exponent is then halved to account for - * the square root. - */ - xu += xu & -(uint64_t)(e & 1); - e >>= 1; - - /* - * Double the mantissa. - */ - xu <<= 1; - - /* - * We now have a mantissa in the 2^53..2^55-1 range. It - * represents a value between 1 (inclusive) and 4 (exclusive) - * in fixed point notation (with 53 fractional bits). We - * compute the square root bit by bit. - */ - q = 0; - s = 0; - r = (uint64_t)1 << 53; - for (int i = 0; i < 54; i ++) { - uint64_t t, b; - - t = s + r; - b = ((xu - t) >> 63) - 1; - s += (r << 1) & b; - xu -= t & b; - q += r & b; - xu <<= 1; - r >>= 1; - } - - /* - * Now, q is a rounded-low 54-bit value, with a leading 1, - * 52 fractional digits, and an additional guard bit. 
We add - * an extra sticky bit to account for what remains of the operand. - */ - q <<= 1; - q |= (xu | -xu) >> 63; - - /* - * Result q is in the 2^54..2^55-1 range; we bias the exponent - * by 54 bits (the value e at that point contains the "true" - * exponent, but q is now considered an integer, i.e. scaled - * up. - */ - e -= 54; - - /* - * Corrective action for an operand of value zero. - */ - q &= -(uint64_t)((ex + 0x7FF) >> 11); - - /* - * Apply rounding and back result. - */ - return FPR(0, e, q); -} - -#endif // yyyASM_CORTEXM4- - -uint64_t -fpr_expm_p63(fpr x, fpr ccs) -{ - /* - * Polynomial approximation of exp(-x) is taken from FACCT: - * https://eprint.iacr.org/2018/1234 - * Specifically, values are extracted from the implementation - * referenced from the FACCT article, and available at: - * https://github.com/raykzhao/gaussian - * Here, the coefficients have been scaled up by 2^63 and - * converted to integers. - * - * Tests over more than 24 billions of random inputs in the - * 0..log(2) range have never shown a deviation larger than - * 2^(-50) from the true mathematical value. - */ - static const uint64_t C[] = { - 0x00000004741183A3u, - 0x00000036548CFC06u, - 0x0000024FDCBF140Au, - 0x0000171D939DE045u, - 0x0000D00CF58F6F84u, - 0x000680681CF796E3u, - 0x002D82D8305B0FEAu, - 0x011111110E066FD0u, - 0x0555555555070F00u, - 0x155555555581FF00u, - 0x400000000002B400u, - 0x7FFFFFFFFFFF4800u, - 0x8000000000000000u - }; - - uint64_t z, y; - unsigned u; - uint32_t z0, z1, y0, y1; - uint64_t a, b; - - y = C[0]; - z = (uint64_t)fpr_trunc(fpr_mul(x, fpr_ptwo63)) << 1; - for (u = 1; u < (sizeof C) / sizeof(C[0]); u ++) { - /* - * Compute product z * y over 128 bits, but keep only - * the top 64 bits. 
- * - * TODO: On some architectures/compilers we could use - * some intrinsics (__umulh() on MSVC) or other compiler - * extensions (unsigned __int128 on GCC / Clang) for - * improved speed; however, most 64-bit architectures - * also have appropriate IEEE754 floating-point support, - * which is better. - */ - uint64_t c; - - z0 = (uint32_t)z; - z1 = (uint32_t)(z >> 32); - y0 = (uint32_t)y; - y1 = (uint32_t)(y >> 32); - a = ((uint64_t)z0 * (uint64_t)y1) - + (((uint64_t)z0 * (uint64_t)y0) >> 32); - b = ((uint64_t)z1 * (uint64_t)y0); - c = (a >> 32) + (b >> 32); - c += (((uint64_t)(uint32_t)a + (uint64_t)(uint32_t)b) >> 32); - c += (uint64_t)z1 * (uint64_t)y1; - y = C[u] - c; - } - - /* - * The scaling factor must be applied at the end. Since y is now - * in fixed-point notation, we have to convert the factor to the - * same format, and do an extra integer multiplication. - */ - z = (uint64_t)fpr_trunc(fpr_mul(ccs, fpr_ptwo63)) << 1; - z0 = (uint32_t)z; - z1 = (uint32_t)(z >> 32); - y0 = (uint32_t)y; - y1 = (uint32_t)(y >> 32); - a = ((uint64_t)z0 * (uint64_t)y1) - + (((uint64_t)z0 * (uint64_t)y0) >> 32); - b = ((uint64_t)z1 * (uint64_t)y0); - y = (a >> 32) + (b >> 32); - y += (((uint64_t)(uint32_t)a + (uint64_t)(uint32_t)b) >> 32); - y += (uint64_t)z1 * (uint64_t)y1; - - return y; -} - -const fpr fpr_gm_tab[] = { - 0, 0, - 9223372036854775808U, 4607182418800017408U, - 4604544271217802189U, 4604544271217802189U, - 13827916308072577997U, 4604544271217802189U, - 4606496786581982534U, 4600565431771507043U, - 13823937468626282851U, 4606496786581982534U, - 4600565431771507043U, 4606496786581982534U, - 13829868823436758342U, 4600565431771507043U, - 4607009347991985328U, 4596196889902818827U, - 13819568926757594635U, 4607009347991985328U, - 4603179351334086856U, 4605664432017547683U, - 13829036468872323491U, 4603179351334086856U, - 4605664432017547683U, 4603179351334086856U, - 13826551388188862664U, 4605664432017547683U, - 4596196889902818827U, 4607009347991985328U, - 
13830381384846761136U, 4596196889902818827U, - 4607139046673687846U, 4591727299969791020U, - 13815099336824566828U, 4607139046673687846U, - 4603889326261607894U, 4605137878724712257U, - 13828509915579488065U, 4603889326261607894U, - 4606118860100255153U, 4602163548591158843U, - 13825535585445934651U, 4606118860100255153U, - 4598900923775164166U, 4606794571824115162U, - 13830166608678890970U, 4598900923775164166U, - 4606794571824115162U, 4598900923775164166U, - 13822272960629939974U, 4606794571824115162U, - 4602163548591158843U, 4606118860100255153U, - 13829490896955030961U, 4602163548591158843U, - 4605137878724712257U, 4603889326261607894U, - 13827261363116383702U, 4605137878724712257U, - 4591727299969791020U, 4607139046673687846U, - 13830511083528463654U, 4591727299969791020U, - 4607171569234046334U, 4587232218149935124U, - 13810604255004710932U, 4607171569234046334U, - 4604224084862889120U, 4604849113969373103U, - 13828221150824148911U, 4604224084862889120U, - 4606317631232591731U, 4601373767755717824U, - 13824745804610493632U, 4606317631232591731U, - 4599740487990714333U, 4606655894547498725U, - 13830027931402274533U, 4599740487990714333U, - 4606912484326125783U, 4597922303871901467U, - 13821294340726677275U, 4606912484326125783U, - 4602805845399633902U, 4605900952042040894U, - 13829272988896816702U, 4602805845399633902U, - 4605409869824231233U, 4603540801876750389U, - 13826912838731526197U, 4605409869824231233U, - 4594454542771183930U, 4607084929468638487U, - 13830456966323414295U, 4594454542771183930U, - 4607084929468638487U, 4594454542771183930U, - 13817826579625959738U, 4607084929468638487U, - 4603540801876750389U, 4605409869824231233U, - 13828781906679007041U, 4603540801876750389U, - 4605900952042040894U, 4602805845399633902U, - 13826177882254409710U, 4605900952042040894U, - 4597922303871901467U, 4606912484326125783U, - 13830284521180901591U, 4597922303871901467U, - 4606655894547498725U, 4599740487990714333U, - 13823112524845490141U, 4606655894547498725U, - 
4601373767755717824U, 4606317631232591731U, - 13829689668087367539U, 4601373767755717824U, - 4604849113969373103U, 4604224084862889120U, - 13827596121717664928U, 4604849113969373103U, - 4587232218149935124U, 4607171569234046334U, - 13830543606088822142U, 4587232218149935124U, - 4607179706000002317U, 4582730748936808062U, - 13806102785791583870U, 4607179706000002317U, - 4604386048625945823U, 4604698657331085206U, - 13828070694185861014U, 4604386048625945823U, - 4606409688975526202U, 4600971798440897930U, - 13824343835295673738U, 4606409688975526202U, - 4600154912527631775U, 4606578871587619388U, - 13829950908442395196U, 4600154912527631775U, - 4606963563043808649U, 4597061974398750563U, - 13820434011253526371U, 4606963563043808649U, - 4602994049708411683U, 4605784983948558848U, - 13829157020803334656U, 4602994049708411683U, - 4605539368864982914U, 4603361638657888991U, - 13826733675512664799U, 4605539368864982914U, - 4595327571478659014U, 4607049811591515049U, - 13830421848446290857U, 4595327571478659014U, - 4607114680469659603U, 4593485039402578702U, - 13816857076257354510U, 4607114680469659603U, - 4603716733069447353U, 4605276012900672507U, - 13828648049755448315U, 4603716733069447353U, - 4606012266443150634U, 4602550884377336506U, - 13825922921232112314U, 4606012266443150634U, - 4598476289818621559U, 4606856142606846307U, - 13830228179461622115U, 4598476289818621559U, - 4606727809065869586U, 4599322407794599425U, - 13822694444649375233U, 4606727809065869586U, - 4601771097584682078U, 4606220668805321205U, - 13829592705660097013U, 4601771097584682078U, - 4604995550503212910U, 4604058477489546729U, - 13827430514344322537U, 4604995550503212910U, - 4589965306122607094U, 4607158013403433018U, - 13830530050258208826U, 4589965306122607094U, - 4607158013403433018U, 4589965306122607094U, - 13813337342977382902U, 4607158013403433018U, - 4604058477489546729U, 4604995550503212910U, - 13828367587357988718U, 4604058477489546729U, - 4606220668805321205U, 4601771097584682078U, - 
13825143134439457886U, 4606220668805321205U, - 4599322407794599425U, 4606727809065869586U, - 13830099845920645394U, 4599322407794599425U, - 4606856142606846307U, 4598476289818621559U, - 13821848326673397367U, 4606856142606846307U, - 4602550884377336506U, 4606012266443150634U, - 13829384303297926442U, 4602550884377336506U, - 4605276012900672507U, 4603716733069447353U, - 13827088769924223161U, 4605276012900672507U, - 4593485039402578702U, 4607114680469659603U, - 13830486717324435411U, 4593485039402578702U, - 4607049811591515049U, 4595327571478659014U, - 13818699608333434822U, 4607049811591515049U, - 4603361638657888991U, 4605539368864982914U, - 13828911405719758722U, 4603361638657888991U, - 4605784983948558848U, 4602994049708411683U, - 13826366086563187491U, 4605784983948558848U, - 4597061974398750563U, 4606963563043808649U, - 13830335599898584457U, 4597061974398750563U, - 4606578871587619388U, 4600154912527631775U, - 13823526949382407583U, 4606578871587619388U, - 4600971798440897930U, 4606409688975526202U, - 13829781725830302010U, 4600971798440897930U, - 4604698657331085206U, 4604386048625945823U, - 13827758085480721631U, 4604698657331085206U, - 4582730748936808062U, 4607179706000002317U, - 13830551742854778125U, 4582730748936808062U, - 4607181740574479067U, 4578227681973159812U, - 13801599718827935620U, 4607181740574479067U, - 4604465633578481725U, 4604621949701367983U, - 13827993986556143791U, 4604465633578481725U, - 4606453861145241227U, 4600769149537129431U, - 13824141186391905239U, 4606453861145241227U, - 4600360675823176935U, 4606538458821337243U, - 13829910495676113051U, 4600360675823176935U, - 4606987119037722413U, 4596629994023683153U, - 13820002030878458961U, 4606987119037722413U, - 4603087070374583113U, 4605725276488455441U, - 13829097313343231249U, 4603087070374583113U, - 4605602459698789090U, 4603270878689749849U, - 13826642915544525657U, 4605602459698789090U, - 4595762727260045105U, 4607030246558998647U, - 13830402283413774455U, 4595762727260045105U, - 
4607127537664763515U, 4592606767730311893U, - 13815978804585087701U, 4607127537664763515U, - 4603803453461190356U, 4605207475328619533U, - 13828579512183395341U, 4603803453461190356U, - 4606066157444814153U, 4602357870542944470U, - 13825729907397720278U, 4606066157444814153U, - 4598688984595225406U, 4606826008603986804U, - 13830198045458762612U, 4598688984595225406U, - 4606761837001494797U, 4599112075441176914U, - 13822484112295952722U, 4606761837001494797U, - 4601967947786150793U, 4606170366472647579U, - 13829542403327423387U, 4601967947786150793U, - 4605067233569943231U, 4603974338538572089U, - 13827346375393347897U, 4605067233569943231U, - 4590846768565625881U, 4607149205763218185U, - 13830521242617993993U, 4590846768565625881U, - 4607165468267934125U, 4588998070480937184U, - 13812370107335712992U, 4607165468267934125U, - 4604141730443515286U, 4604922840319727473U, - 13828294877174503281U, 4604141730443515286U, - 4606269759522929756U, 4601573027631668967U, - 13824945064486444775U, 4606269759522929756U, - 4599531889160152938U, 4606692493141721470U, - 13830064529996497278U, 4599531889160152938U, - 4606884969294623682U, 4598262871476403630U, - 13821634908331179438U, 4606884969294623682U, - 4602710690099904183U, 4605957195211051218U, - 13829329232065827026U, 4602710690099904183U, - 4605343481119364930U, 4603629178146150899U, - 13827001215000926707U, 4605343481119364930U, - 4594016801320007031U, 4607100477024622401U, - 13830472513879398209U, 4594016801320007031U, - 4607068040143112603U, 4594891488091520602U, - 13818263524946296410U, 4607068040143112603U, - 4603451617570386922U, 4605475169017376660U, - 13828847205872152468U, 4603451617570386922U, - 4605843545406134034U, 4602900303344142735U, - 13826272340198918543U, 4605843545406134034U, - 4597492765973365521U, 4606938683557690074U, - 13830310720412465882U, 4597492765973365521U, - 4606618018794815019U, 4599948172872067014U, - 13823320209726842822U, 4606618018794815019U, - 4601173347964633034U, 4606364276725003740U, - 
13829736313579779548U, 4601173347964633034U, - 4604774382555066977U, 4604305528345395596U, - 13827677565200171404U, 4604774382555066977U, - 4585465300892538317U, 4607176315382986589U, - 13830548352237762397U, 4585465300892538317U, - 4607176315382986589U, 4585465300892538317U, - 13808837337747314125U, 4607176315382986589U, - 4604305528345395596U, 4604774382555066977U, - 13828146419409842785U, 4604305528345395596U, - 4606364276725003740U, 4601173347964633034U, - 13824545384819408842U, 4606364276725003740U, - 4599948172872067014U, 4606618018794815019U, - 13829990055649590827U, 4599948172872067014U, - 4606938683557690074U, 4597492765973365521U, - 13820864802828141329U, 4606938683557690074U, - 4602900303344142735U, 4605843545406134034U, - 13829215582260909842U, 4602900303344142735U, - 4605475169017376660U, 4603451617570386922U, - 13826823654425162730U, 4605475169017376660U, - 4594891488091520602U, 4607068040143112603U, - 13830440076997888411U, 4594891488091520602U, - 4607100477024622401U, 4594016801320007031U, - 13817388838174782839U, 4607100477024622401U, - 4603629178146150899U, 4605343481119364930U, - 13828715517974140738U, 4603629178146150899U, - 4605957195211051218U, 4602710690099904183U, - 13826082726954679991U, 4605957195211051218U, - 4598262871476403630U, 4606884969294623682U, - 13830257006149399490U, 4598262871476403630U, - 4606692493141721470U, 4599531889160152938U, - 13822903926014928746U, 4606692493141721470U, - 4601573027631668967U, 4606269759522929756U, - 13829641796377705564U, 4601573027631668967U, - 4604922840319727473U, 4604141730443515286U, - 13827513767298291094U, 4604922840319727473U, - 4588998070480937184U, 4607165468267934125U, - 13830537505122709933U, 4588998070480937184U, - 4607149205763218185U, 4590846768565625881U, - 13814218805420401689U, 4607149205763218185U, - 4603974338538572089U, 4605067233569943231U, - 13828439270424719039U, 4603974338538572089U, - 4606170366472647579U, 4601967947786150793U, - 13825339984640926601U, 4606170366472647579U, - 
4599112075441176914U, 4606761837001494797U, - 13830133873856270605U, 4599112075441176914U, - 4606826008603986804U, 4598688984595225406U, - 13822061021450001214U, 4606826008603986804U, - 4602357870542944470U, 4606066157444814153U, - 13829438194299589961U, 4602357870542944470U, - 4605207475328619533U, 4603803453461190356U, - 13827175490315966164U, 4605207475328619533U, - 4592606767730311893U, 4607127537664763515U, - 13830499574519539323U, 4592606767730311893U, - 4607030246558998647U, 4595762727260045105U, - 13819134764114820913U, 4607030246558998647U, - 4603270878689749849U, 4605602459698789090U, - 13828974496553564898U, 4603270878689749849U, - 4605725276488455441U, 4603087070374583113U, - 13826459107229358921U, 4605725276488455441U, - 4596629994023683153U, 4606987119037722413U, - 13830359155892498221U, 4596629994023683153U, - 4606538458821337243U, 4600360675823176935U, - 13823732712677952743U, 4606538458821337243U, - 4600769149537129431U, 4606453861145241227U, - 13829825898000017035U, 4600769149537129431U, - 4604621949701367983U, 4604465633578481725U, - 13827837670433257533U, 4604621949701367983U, - 4578227681973159812U, 4607181740574479067U, - 13830553777429254875U, 4578227681973159812U, - 4607182249242036882U, 4573724215515480177U, - 13797096252370255985U, 4607182249242036882U, - 4604505071555817232U, 4604583231088591477U, - 13827955267943367285U, 4604505071555817232U, - 4606475480113671417U, 4600667422348321968U, - 13824039459203097776U, 4606475480113671417U, - 4600463181646572228U, 4606517779747998088U, - 13829889816602773896U, 4600463181646572228U, - 4606998399608725124U, 4596413578358834022U, - 13819785615213609830U, 4606998399608725124U, - 4603133304188877240U, 4605694995810664660U, - 13829067032665440468U, 4603133304188877240U, - 4605633586259814045U, 4603225210076562971U, - 13826597246931338779U, 4605633586259814045U, - 4595979936813835462U, 4607019963775302583U, - 13830392000630078391U, 4595979936813835462U, - 4607133460805585796U, 4592167175087283203U, - 
13815539211942059011U, 4607133460805585796U, - 4603846496621587377U, 4605172808754305228U, - 13828544845609081036U, 4603846496621587377U, - 4606092657816072624U, 4602260871257280788U, - 13825632908112056596U, 4606092657816072624U, - 4598795050632330097U, 4606810452769876110U, - 13830182489624651918U, 4598795050632330097U, - 4606778366364612594U, 4599006600037663623U, - 13822378636892439431U, 4606778366364612594U, - 4602065906208722008U, 4606144763310860551U, - 13829516800165636359U, 4602065906208722008U, - 4605102686554936490U, 4603931940768740167U, - 13827303977623515975U, 4605102686554936490U, - 4591287158938884897U, 4607144295058764886U, - 13830516331913540694U, 4591287158938884897U, - 4607168688050493276U, 4588115294056142819U, - 13811487330910918627U, 4607168688050493276U, - 4604183020748362039U, 4604886103475043762U, - 13828258140329819570U, 4604183020748362039U, - 4606293848208650998U, 4601473544562720001U, - 13824845581417495809U, 4606293848208650998U, - 4599636300858866724U, 4606674353838411301U, - 13830046390693187109U, 4599636300858866724U, - 4606898891031025132U, 4598136582470364665U, - 13821508619325140473U, 4606898891031025132U, - 4602758354025980442U, 4605929219593405673U, - 13829301256448181481U, 4602758354025980442U, - 4605376811039722786U, 4603585091850767959U, - 13826957128705543767U, 4605376811039722786U, - 4594235767444503503U, 4607092871118901179U, - 13830464907973676987U, 4594235767444503503U, - 4607076652372832968U, 4594673119063280916U, - 13818045155918056724U, 4607076652372832968U, - 4603496309891590679U, 4605442656228245717U, - 13828814693083021525U, 4603496309891590679U, - 4605872393621214213U, 4602853162432841185U, - 13826225199287616993U, 4605872393621214213U, - 4597707695679609371U, 4606925748668145757U, - 13830297785522921565U, 4597707695679609371U, - 4606637115963965612U, 4599844446633109139U, - 13823216483487884947U, 4606637115963965612U, - 4601273700967202825U, 4606341107699334546U, - 13829713144554110354U, 4601273700967202825U, - 
4604811873195349477U, 4604264921241055824U, - 13827636958095831632U, 4604811873195349477U, - 4586348876009622851U, 4607174111710118367U, - 13830546148564894175U, 4586348876009622851U, - 4607178180169683960U, 4584498631466405633U, - 13807870668321181441U, 4607178180169683960U, - 4604345904647073908U, 4604736643460027021U, - 13828108680314802829U, 4604345904647073908U, - 4606387137437298591U, 4601072712526242277U, - 13824444749381018085U, 4606387137437298591U, - 4600051662802353687U, 4606598603759044570U, - 13829970640613820378U, 4600051662802353687U, - 4606951288507767453U, 4597277522845151878U, - 13820649559699927686U, 4606951288507767453U, - 4602947266358709886U, 4605814408482919348U, - 13829186445337695156U, 4602947266358709886U, - 4605507406967535927U, 4603406726595779752U, - 13826778763450555560U, 4605507406967535927U, - 4595109641634432498U, 4607059093103722971U, - 13830431129958498779U, 4595109641634432498U, - 4607107746899444102U, 4593797652641645341U, - 13817169689496421149U, 4607107746899444102U, - 4603673059103075106U, 4605309881318010327U, - 13828681918172786135U, 4603673059103075106U, - 4605984877841711338U, 4602646891659203088U, - 13826018928513978896U, 4605984877841711338U, - 4598369669086960528U, 4606870719641066940U, - 13830242756495842748U, 4598369669086960528U, - 4606710311774494716U, 4599427256825614420U, - 13822799293680390228U, 4606710311774494716U, - 4601672213217083403U, 4606245366082353408U, - 13829617402937129216U, 4601672213217083403U, - 4604959323120302796U, 4604100215502905499U, - 13827472252357681307U, 4604959323120302796U, - 4589524267239410099U, 4607161910007591876U, - 13830533946862367684U, 4589524267239410099U, - 4607153778602162496U, 4590406145430462614U, - 13813778182285238422U, 4607153778602162496U, - 4604016517974851588U, 4605031521104517324U, - 13828403557959293132U, 4604016517974851588U, - 4606195668621671667U, 4601869677011524443U, - 13825241713866300251U, 4606195668621671667U, - 4599217346014614711U, 4606744984357082948U, - 
13830117021211858756U, 4599217346014614711U, - 4606841238740778884U, 4598582729657176439U, - 13821954766511952247U, 4606841238740778884U, - 4602454542796181607U, 4606039359984203741U, - 13829411396838979549U, 4602454542796181607U, - 4605241877142478242U, 4603760198400967492U, - 13827132235255743300U, 4605241877142478242U, - 4593046061348462537U, 4607121277474223905U, - 13830493314328999713U, 4593046061348462537U, - 4607040195955932526U, 4595545269419264690U, - 13818917306274040498U, 4607040195955932526U, - 4603316355454250015U, 4605571053506370248U, - 13828943090361146056U, 4603316355454250015U, - 4605755272910869620U, 4603040651631881451U, - 13826412688486657259U, 4605755272910869620U, - 4596846128749438754U, 4606975506703684317U, - 13830347543558460125U, 4596846128749438754U, - 4606558823023444576U, 4600257918160607478U, - 13823629955015383286U, 4606558823023444576U, - 4600870609507958271U, 4606431930490633905U, - 13829803967345409713U, 4600870609507958271U, - 4604660425598397818U, 4604425958770613225U, - 13827797995625389033U, 4604660425598397818U, - 4580962600092897021U, 4607180892816495009U, - 13830552929671270817U, 4580962600092897021U, - 4607180892816495009U, 4580962600092897021U, - 13804334636947672829U, 4607180892816495009U, - 4604425958770613225U, 4604660425598397818U, - 13828032462453173626U, 4604425958770613225U, - 4606431930490633905U, 4600870609507958271U, - 13824242646362734079U, 4606431930490633905U, - 4600257918160607478U, 4606558823023444576U, - 13829930859878220384U, 4600257918160607478U, - 4606975506703684317U, 4596846128749438754U, - 13820218165604214562U, 4606975506703684317U, - 4603040651631881451U, 4605755272910869620U, - 13829127309765645428U, 4603040651631881451U, - 4605571053506370248U, 4603316355454250015U, - 13826688392309025823U, 4605571053506370248U, - 4595545269419264690U, 4607040195955932526U, - 13830412232810708334U, 4595545269419264690U, - 4607121277474223905U, 4593046061348462537U, - 13816418098203238345U, 4607121277474223905U, - 
4603760198400967492U, 4605241877142478242U, - 13828613913997254050U, 4603760198400967492U, - 4606039359984203741U, 4602454542796181607U, - 13825826579650957415U, 4606039359984203741U, - 4598582729657176439U, 4606841238740778884U, - 13830213275595554692U, 4598582729657176439U, - 4606744984357082948U, 4599217346014614711U, - 13822589382869390519U, 4606744984357082948U, - 4601869677011524443U, 4606195668621671667U, - 13829567705476447475U, 4601869677011524443U, - 4605031521104517324U, 4604016517974851588U, - 13827388554829627396U, 4605031521104517324U, - 4590406145430462614U, 4607153778602162496U, - 13830525815456938304U, 4590406145430462614U, - 4607161910007591876U, 4589524267239410099U, - 13812896304094185907U, 4607161910007591876U, - 4604100215502905499U, 4604959323120302796U, - 13828331359975078604U, 4604100215502905499U, - 4606245366082353408U, 4601672213217083403U, - 13825044250071859211U, 4606245366082353408U, - 4599427256825614420U, 4606710311774494716U, - 13830082348629270524U, 4599427256825614420U, - 4606870719641066940U, 4598369669086960528U, - 13821741705941736336U, 4606870719641066940U, - 4602646891659203088U, 4605984877841711338U, - 13829356914696487146U, 4602646891659203088U, - 4605309881318010327U, 4603673059103075106U, - 13827045095957850914U, 4605309881318010327U, - 4593797652641645341U, 4607107746899444102U, - 13830479783754219910U, 4593797652641645341U, - 4607059093103722971U, 4595109641634432498U, - 13818481678489208306U, 4607059093103722971U, - 4603406726595779752U, 4605507406967535927U, - 13828879443822311735U, 4603406726595779752U, - 4605814408482919348U, 4602947266358709886U, - 13826319303213485694U, 4605814408482919348U, - 4597277522845151878U, 4606951288507767453U, - 13830323325362543261U, 4597277522845151878U, - 4606598603759044570U, 4600051662802353687U, - 13823423699657129495U, 4606598603759044570U, - 4601072712526242277U, 4606387137437298591U, - 13829759174292074399U, 4601072712526242277U, - 4604736643460027021U, 4604345904647073908U, - 
13827717941501849716U, 4604736643460027021U, - 4584498631466405633U, 4607178180169683960U, - 13830550217024459768U, 4584498631466405633U, - 4607174111710118367U, 4586348876009622851U, - 13809720912864398659U, 4607174111710118367U, - 4604264921241055824U, 4604811873195349477U, - 13828183910050125285U, 4604264921241055824U, - 4606341107699334546U, 4601273700967202825U, - 13824645737821978633U, 4606341107699334546U, - 4599844446633109139U, 4606637115963965612U, - 13830009152818741420U, 4599844446633109139U, - 4606925748668145757U, 4597707695679609371U, - 13821079732534385179U, 4606925748668145757U, - 4602853162432841185U, 4605872393621214213U, - 13829244430475990021U, 4602853162432841185U, - 4605442656228245717U, 4603496309891590679U, - 13826868346746366487U, 4605442656228245717U, - 4594673119063280916U, 4607076652372832968U, - 13830448689227608776U, 4594673119063280916U, - 4607092871118901179U, 4594235767444503503U, - 13817607804299279311U, 4607092871118901179U, - 4603585091850767959U, 4605376811039722786U, - 13828748847894498594U, 4603585091850767959U, - 4605929219593405673U, 4602758354025980442U, - 13826130390880756250U, 4605929219593405673U, - 4598136582470364665U, 4606898891031025132U, - 13830270927885800940U, 4598136582470364665U, - 4606674353838411301U, 4599636300858866724U, - 13823008337713642532U, 4606674353838411301U, - 4601473544562720001U, 4606293848208650998U, - 13829665885063426806U, 4601473544562720001U, - 4604886103475043762U, 4604183020748362039U, - 13827555057603137847U, 4604886103475043762U, - 4588115294056142819U, 4607168688050493276U, - 13830540724905269084U, 4588115294056142819U, - 4607144295058764886U, 4591287158938884897U, - 13814659195793660705U, 4607144295058764886U, - 4603931940768740167U, 4605102686554936490U, - 13828474723409712298U, 4603931940768740167U, - 4606144763310860551U, 4602065906208722008U, - 13825437943063497816U, 4606144763310860551U, - 4599006600037663623U, 4606778366364612594U, - 13830150403219388402U, 4599006600037663623U, - 
4606810452769876110U, 4598795050632330097U, - 13822167087487105905U, 4606810452769876110U, - 4602260871257280788U, 4606092657816072624U, - 13829464694670848432U, 4602260871257280788U, - 4605172808754305228U, 4603846496621587377U, - 13827218533476363185U, 4605172808754305228U, - 4592167175087283203U, 4607133460805585796U, - 13830505497660361604U, 4592167175087283203U, - 4607019963775302583U, 4595979936813835462U, - 13819351973668611270U, 4607019963775302583U, - 4603225210076562971U, 4605633586259814045U, - 13829005623114589853U, 4603225210076562971U, - 4605694995810664660U, 4603133304188877240U, - 13826505341043653048U, 4605694995810664660U, - 4596413578358834022U, 4606998399608725124U, - 13830370436463500932U, 4596413578358834022U, - 4606517779747998088U, 4600463181646572228U, - 13823835218501348036U, 4606517779747998088U, - 4600667422348321968U, 4606475480113671417U, - 13829847516968447225U, 4600667422348321968U, - 4604583231088591477U, 4604505071555817232U, - 13827877108410593040U, 4604583231088591477U, - 4573724215515480177U, 4607182249242036882U, - 13830554286096812690U, 4573724215515480177U, - 4607182376410422530U, 4569220649180767418U, - 13792592686035543226U, 4607182376410422530U, - 4604524701268679793U, 4604563781218984604U, - 13827935818073760412U, 4604524701268679793U, - 4606486172460753999U, 4600616459743653188U, - 13823988496598428996U, 4606486172460753999U, - 4600514338912178239U, 4606507322377452870U, - 13829879359232228678U, 4600514338912178239U, - 4607003915349878877U, 4596305267720071930U, - 13819677304574847738U, 4607003915349878877U, - 4603156351203636159U, 4605679749231851918U, - 13829051786086627726U, 4603156351203636159U, - 4605649044311923410U, 4603202304363743346U, - 13826574341218519154U, 4605649044311923410U, - 4596088445927168004U, 4607014697483910382U, - 13830386734338686190U, 4596088445927168004U, - 4607136295912168606U, 4591947271803021404U, - 13815319308657797212U, 4607136295912168606U, - 4603867938232615808U, 4605155376589456981U, - 
13828527413444232789U, 4603867938232615808U, - 4606105796280968177U, 4602212250118051877U, - 13825584286972827685U, 4606105796280968177U, - 4598848011564831930U, 4606802552898869248U, - 13830174589753645056U, 4598848011564831930U, - 4606786509620734768U, 4598953786765296928U, - 13822325823620072736U, 4606786509620734768U, - 4602114767134999006U, 4606131849150971908U, - 13829503886005747716U, 4602114767134999006U, - 4605120315324767624U, 4603910660507251362U, - 13827282697362027170U, 4605120315324767624U, - 4591507261658050721U, 4607141713064252300U, - 13830513749919028108U, 4591507261658050721U, - 4607170170974224083U, 4587673791460508439U, - 13811045828315284247U, 4607170170974224083U, - 4604203581176243359U, 4604867640218014515U, - 13828239677072790323U, 4604203581176243359U, - 4606305777984577632U, 4601423692641949331U, - 13824795729496725139U, 4606305777984577632U, - 4599688422741010356U, 4606665164148251002U, - 13830037201003026810U, 4599688422741010356U, - 4606905728766014348U, 4598029484874872834U, - 13821401521729648642U, 4606905728766014348U, - 4602782121393764535U, 4605915122243179241U, - 13829287159097955049U, 4602782121393764535U, - 4605393374401988274U, 4603562972219549215U, - 13826935009074325023U, 4605393374401988274U, - 4594345179472540681U, 4607088942243446236U, - 13830460979098222044U, 4594345179472540681U, - 4607080832832247697U, 4594563856311064231U, - 13817935893165840039U, 4607080832832247697U, - 4603518581031047189U, 4605426297151190466U, - 13828798334005966274U, 4603518581031047189U, - 4605886709123365959U, 4602829525820289164U, - 13826201562675064972U, 4605886709123365959U, - 4597815040470278984U, 4606919157647773535U, - 13830291194502549343U, 4597815040470278984U, - 4606646545123403481U, 4599792496117920694U, - 13823164532972696502U, 4606646545123403481U, - 4601323770373937522U, 4606329407841126011U, - 13829701444695901819U, 4601323770373937522U, - 4604830524903495634U, 4604244531615310815U, - 13827616568470086623U, 4604830524903495634U, - 
4586790578280679046U, 4607172882816799076U, - 13830544919671574884U, 4586790578280679046U, - 4607178985458280057U, 4583614727651146525U, - 13806986764505922333U, 4607178985458280057U, - 4604366005771528720U, 4604717681185626434U, - 13828089718040402242U, 4604366005771528720U, - 4606398451906509788U, 4601022290077223616U, - 13824394326931999424U, 4606398451906509788U, - 4600103317933788342U, 4606588777269136769U, - 13829960814123912577U, 4600103317933788342U, - 4606957467106717424U, 4597169786279785693U, - 13820541823134561501U, 4606957467106717424U, - 4602970680601913687U, 4605799732098147061U, - 13829171768952922869U, 4602970680601913687U, - 4605523422498301790U, 4603384207141321914U, - 13826756243996097722U, 4605523422498301790U, - 4595218635031890910U, 4607054494135176056U, - 13830426530989951864U, 4595218635031890910U, - 4607111255739239816U, 4593688012422887515U, - 13817060049277663323U, 4607111255739239816U, - 4603694922063032361U, 4605292980606880364U, - 13828665017461656172U, 4603694922063032361U, - 4605998608960791335U, 4602598930031891166U, - 13825970966886666974U, 4605998608960791335U, - 4598423001813699022U, 4606863472012527185U, - 13830235508867302993U, 4598423001813699022U, - 4606719100629313491U, 4599374859150636784U, - 13822746896005412592U, 4606719100629313491U, - 4601721693286060937U, 4606233055365547081U, - 13829605092220322889U, 4601721693286060937U, - 4604977468824438271U, 4604079374282302598U, - 13827451411137078406U, 4604977468824438271U, - 4589744810590291021U, 4607160003989618959U, - 13830532040844394767U, 4589744810590291021U, - 4607155938267770208U, 4590185751760970393U, - 13813557788615746201U, 4607155938267770208U, - 4604037525321326463U, 4605013567986435066U, - 13828385604841210874U, 4604037525321326463U, - 4606208206518262803U, 4601820425647934753U, - 13825192462502710561U, 4606208206518262803U, - 4599269903251194481U, 4606736437002195879U, - 13830108473856971687U, 4599269903251194481U, - 4606848731493011465U, 4598529532600161144U, - 
13821901569454936952U, 4606848731493011465U, - 4602502755147763107U, 4606025850160239809U, - 13829397887015015617U, 4602502755147763107U, - 4605258978359093269U, 4603738491917026584U, - 13827110528771802392U, 4605258978359093269U, - 4593265590854265407U, 4607118021058468598U, - 13830490057913244406U, 4593265590854265407U, - 4607045045516813836U, 4595436449949385485U, - 13818808486804161293U, 4607045045516813836U, - 4603339021357904144U, 4605555245917486022U, - 13828927282772261830U, 4603339021357904144U, - 4605770164172969910U, 4603017373458244943U, - 13826389410313020751U, 4605770164172969910U, - 4596954088216812973U, 4606969576261663845U, - 13830341613116439653U, 4596954088216812973U, - 4606568886807728474U, 4600206446098256018U, - 13823578482953031826U, 4606568886807728474U, - 4600921238092511730U, 4606420848538580260U, - 13829792885393356068U, 4600921238092511730U, - 4604679572075463103U, 4604406033021674239U, - 13827778069876450047U, 4604679572075463103U, - 4581846703643734566U, 4607180341788068727U, - 13830552378642844535U, 4581846703643734566U, - 4607181359080094673U, 4579996072175835083U, - 13803368109030610891U, 4607181359080094673U, - 4604445825685214043U, 4604641218080103285U, - 13828013254934879093U, 4604445825685214043U, - 4606442934727379583U, 4600819913163773071U, - 13824191950018548879U, 4606442934727379583U, - 4600309328230211502U, 4606548680329491866U, - 13829920717184267674U, 4600309328230211502U, - 4606981354314050484U, 4596738097012783531U, - 13820110133867559339U, 4606981354314050484U, - 4603063884010218172U, 4605740310302420207U, - 13829112347157196015U, 4603063884010218172U, - 4605586791482848547U, 4603293641160266722U, - 13826665678015042530U, 4605586791482848547U, - 4595654028864046335U, 4607035262954517034U, - 13830407299809292842U, 4595654028864046335U, - 4607124449686274900U, 4592826452951465409U, - 13816198489806241217U, 4607124449686274900U, - 4603781852316960384U, 4605224709411790590U, - 13828596746266566398U, 4603781852316960384U, - 
4606052795787882823U, 4602406247776385022U, - 13825778284631160830U, 4606052795787882823U, - 4598635880488956483U, 4606833664420673202U, - 13830205701275449010U, 4598635880488956483U, - 4606753451050079834U, 4599164736579548843U, - 13822536773434324651U, 4606753451050079834U, - 4601918851211878557U, 4606183055233559255U, - 13829555092088335063U, 4601918851211878557U, - 4605049409688478101U, 4603995455647851249U, - 13827367492502627057U, 4605049409688478101U, - 4590626485056654602U, 4607151534426937478U, - 13830523571281713286U, 4590626485056654602U, - 4607163731439411601U, 4589303678145802340U, - 13812675715000578148U, 4607163731439411601U, - 4604121000955189926U, 4604941113561600762U, - 13828313150416376570U, 4604121000955189926U, - 4606257600839867033U, 4601622657843474729U, - 13824994694698250537U, 4606257600839867033U, - 4599479600326345459U, 4606701442584137310U, - 13830073479438913118U, 4599479600326345459U, - 4606877885424248132U, 4598316292140394014U, - 13821688328995169822U, 4606877885424248132U, - 4602686793990243041U, 4605971073215153165U, - 13829343110069928973U, 4602686793990243041U, - 4605326714874986465U, 4603651144395358093U, - 13827023181250133901U, 4605326714874986465U, - 4593907249284540294U, 4607104153983298999U, - 13830476190838074807U, 4593907249284540294U, - 4607063608453868552U, 4595000592312171144U, - 13818372629166946952U, 4607063608453868552U, - 4603429196809300824U, 4605491322423429598U, - 13828863359278205406U, 4603429196809300824U, - 4605829012964735987U, 4602923807199184054U, - 13826295844053959862U, 4605829012964735987U, - 4597385183080791534U, 4606945027305114062U, - 13830317064159889870U, 4597385183080791534U, - 4606608350964852124U, 4599999947619525579U, - 13823371984474301387U, 4606608350964852124U, - 4601123065313358619U, 4606375745674388705U, - 13829747782529164513U, 4601123065313358619U, - 4604755543975806820U, 4604325745441780828U, - 13827697782296556636U, 4604755543975806820U, - 4585023436363055487U, 4607177290141793710U, - 
13830549326996569518U, 4585023436363055487U, - 4607175255902437396U, 4585907115494236537U, - 13809279152349012345U, 4607175255902437396U, - 4604285253548209224U, 4604793159020491611U, - 13828165195875267419U, 4604285253548209224U, - 4606352730697093817U, 4601223560006786057U, - 13824595596861561865U, 4606352730697093817U, - 4599896339047301634U, 4606627607157935956U, - 13829999644012711764U, 4599896339047301634U, - 4606932257325205256U, 4597600270510262682U, - 13820972307365038490U, 4606932257325205256U, - 4602876755014813164U, 4605858005670328613U, - 13829230042525104421U, 4602876755014813164U, - 4605458946901419122U, 4603473988668005304U, - 13826846025522781112U, 4605458946901419122U, - 4594782329999411347U, 4607072388129742377U, - 13830444424984518185U, 4594782329999411347U, - 4607096716058023245U, 4594126307716900071U, - 13817498344571675879U, 4607096716058023245U, - 4603607160562208225U, 4605360179893335444U, - 13828732216748111252U, 4603607160562208225U, - 4605943243960030558U, 4602734543519989142U, - 13826106580374764950U, 4605943243960030558U, - 4598209407597805010U, 4606891971185517504U, - 13830264008040293312U, 4598209407597805010U, - 4606683463531482757U, 4599584122834874440U, - 13822956159689650248U, 4606683463531482757U, - 4601523323048804569U, 4606281842017099424U, - 13829653878871875232U, 4601523323048804569U, - 4604904503566677638U, 4604162403772767740U, - 13827534440627543548U, 4604904503566677638U, - 4588556721781247689U, 4607167120476811757U, - 13830539157331587565U, 4588556721781247689U, - 4607146792632922887U, 4591066993883984169U, - 13814439030738759977U, 4607146792632922887U, - 4603953166845776383U, 4605084992581147553U, - 13828457029435923361U, 4603953166845776383U, - 4606157602458368090U, 4602016966272225497U, - 13825389003127001305U, 4606157602458368090U, - 4599059363095165615U, 4606770142132396069U, - 13830142178987171877U, 4599059363095165615U, - 4606818271362779153U, 4598742041476147134U, - 13822114078330922942U, 4606818271362779153U, - 
4602309411551204896U, 4606079444829232727U, - 13829451481684008535U, 4602309411551204896U, - 4605190175055178825U, 4603825001630339212U, - 13827197038485115020U, 4605190175055178825U, - 4592387007752762956U, 4607130541380624519U, - 13830502578235400327U, 4592387007752762956U, - 4607025146816593591U, 4595871363584150300U, - 13819243400438926108U, 4607025146816593591U, - 4603248068256948438U, 4605618058006716661U, - 13828990094861492469U, 4603248068256948438U, - 4605710171610479304U, 4603110210506737381U, - 13826482247361513189U, 4605710171610479304U, - 4596521820799644122U, 4606992800820440327U, - 13830364837675216135U, 4596521820799644122U, - 4606528158595189433U, 4600411960456200676U, - 13823783997310976484U, 4606528158595189433U, - 4600718319105833937U, 4606464709641375231U, - 13829836746496151039U, 4600718319105833937U, - 4604602620643553229U, 4604485382263976838U, - 13827857419118752646U, 4604602620643553229U, - 4576459225186735875U, 4607182037296057423U, - 13830554074150833231U, 4576459225186735875U, - 4607182037296057423U, 4576459225186735875U, - 13799831262041511683U, 4607182037296057423U, - 4604485382263976838U, 4604602620643553229U, - 13827974657498329037U, 4604485382263976838U, - 4606464709641375231U, 4600718319105833937U, - 13824090355960609745U, 4606464709641375231U, - 4600411960456200676U, 4606528158595189433U, - 13829900195449965241U, 4600411960456200676U, - 4606992800820440327U, 4596521820799644122U, - 13819893857654419930U, 4606992800820440327U, - 4603110210506737381U, 4605710171610479304U, - 13829082208465255112U, 4603110210506737381U, - 4605618058006716661U, 4603248068256948438U, - 13826620105111724246U, 4605618058006716661U, - 4595871363584150300U, 4607025146816593591U, - 13830397183671369399U, 4595871363584150300U, - 4607130541380624519U, 4592387007752762956U, - 13815759044607538764U, 4607130541380624519U, - 4603825001630339212U, 4605190175055178825U, - 13828562211909954633U, 4603825001630339212U, - 4606079444829232727U, 4602309411551204896U, - 
13825681448405980704U, 4606079444829232727U, - 4598742041476147134U, 4606818271362779153U, - 13830190308217554961U, 4598742041476147134U, - 4606770142132396069U, 4599059363095165615U, - 13822431399949941423U, 4606770142132396069U, - 4602016966272225497U, 4606157602458368090U, - 13829529639313143898U, 4602016966272225497U, - 4605084992581147553U, 4603953166845776383U, - 13827325203700552191U, 4605084992581147553U, - 4591066993883984169U, 4607146792632922887U, - 13830518829487698695U, 4591066993883984169U, - 4607167120476811757U, 4588556721781247689U, - 13811928758636023497U, 4607167120476811757U, - 4604162403772767740U, 4604904503566677638U, - 13828276540421453446U, 4604162403772767740U, - 4606281842017099424U, 4601523323048804569U, - 13824895359903580377U, 4606281842017099424U, - 4599584122834874440U, 4606683463531482757U, - 13830055500386258565U, 4599584122834874440U, - 4606891971185517504U, 4598209407597805010U, - 13821581444452580818U, 4606891971185517504U, - 4602734543519989142U, 4605943243960030558U, - 13829315280814806366U, 4602734543519989142U, - 4605360179893335444U, 4603607160562208225U, - 13826979197416984033U, 4605360179893335444U, - 4594126307716900071U, 4607096716058023245U, - 13830468752912799053U, 4594126307716900071U, - 4607072388129742377U, 4594782329999411347U, - 13818154366854187155U, 4607072388129742377U, - 4603473988668005304U, 4605458946901419122U, - 13828830983756194930U, 4603473988668005304U, - 4605858005670328613U, 4602876755014813164U, - 13826248791869588972U, 4605858005670328613U, - 4597600270510262682U, 4606932257325205256U, - 13830304294179981064U, 4597600270510262682U, - 4606627607157935956U, 4599896339047301634U, - 13823268375902077442U, 4606627607157935956U, - 4601223560006786057U, 4606352730697093817U, - 13829724767551869625U, 4601223560006786057U, - 4604793159020491611U, 4604285253548209224U, - 13827657290402985032U, 4604793159020491611U, - 4585907115494236537U, 4607175255902437396U, - 13830547292757213204U, 4585907115494236537U, - 
4607177290141793710U, 4585023436363055487U, - 13808395473217831295U, 4607177290141793710U, - 4604325745441780828U, 4604755543975806820U, - 13828127580830582628U, 4604325745441780828U, - 4606375745674388705U, 4601123065313358619U, - 13824495102168134427U, 4606375745674388705U, - 4599999947619525579U, 4606608350964852124U, - 13829980387819627932U, 4599999947619525579U, - 4606945027305114062U, 4597385183080791534U, - 13820757219935567342U, 4606945027305114062U, - 4602923807199184054U, 4605829012964735987U, - 13829201049819511795U, 4602923807199184054U, - 4605491322423429598U, 4603429196809300824U, - 13826801233664076632U, 4605491322423429598U, - 4595000592312171144U, 4607063608453868552U, - 13830435645308644360U, 4595000592312171144U, - 4607104153983298999U, 4593907249284540294U, - 13817279286139316102U, 4607104153983298999U, - 4603651144395358093U, 4605326714874986465U, - 13828698751729762273U, 4603651144395358093U, - 4605971073215153165U, 4602686793990243041U, - 13826058830845018849U, 4605971073215153165U, - 4598316292140394014U, 4606877885424248132U, - 13830249922279023940U, 4598316292140394014U, - 4606701442584137310U, 4599479600326345459U, - 13822851637181121267U, 4606701442584137310U, - 4601622657843474729U, 4606257600839867033U, - 13829629637694642841U, 4601622657843474729U, - 4604941113561600762U, 4604121000955189926U, - 13827493037809965734U, 4604941113561600762U, - 4589303678145802340U, 4607163731439411601U, - 13830535768294187409U, 4589303678145802340U, - 4607151534426937478U, 4590626485056654602U, - 13813998521911430410U, 4607151534426937478U, - 4603995455647851249U, 4605049409688478101U, - 13828421446543253909U, 4603995455647851249U, - 4606183055233559255U, 4601918851211878557U, - 13825290888066654365U, 4606183055233559255U, - 4599164736579548843U, 4606753451050079834U, - 13830125487904855642U, 4599164736579548843U, - 4606833664420673202U, 4598635880488956483U, - 13822007917343732291U, 4606833664420673202U, - 4602406247776385022U, 4606052795787882823U, - 
13829424832642658631U, 4602406247776385022U, - 4605224709411790590U, 4603781852316960384U, - 13827153889171736192U, 4605224709411790590U, - 4592826452951465409U, 4607124449686274900U, - 13830496486541050708U, 4592826452951465409U, - 4607035262954517034U, 4595654028864046335U, - 13819026065718822143U, 4607035262954517034U, - 4603293641160266722U, 4605586791482848547U, - 13828958828337624355U, 4603293641160266722U, - 4605740310302420207U, 4603063884010218172U, - 13826435920864993980U, 4605740310302420207U, - 4596738097012783531U, 4606981354314050484U, - 13830353391168826292U, 4596738097012783531U, - 4606548680329491866U, 4600309328230211502U, - 13823681365084987310U, 4606548680329491866U, - 4600819913163773071U, 4606442934727379583U, - 13829814971582155391U, 4600819913163773071U, - 4604641218080103285U, 4604445825685214043U, - 13827817862539989851U, 4604641218080103285U, - 4579996072175835083U, 4607181359080094673U, - 13830553395934870481U, 4579996072175835083U, - 4607180341788068727U, 4581846703643734566U, - 13805218740498510374U, 4607180341788068727U, - 4604406033021674239U, 4604679572075463103U, - 13828051608930238911U, 4604406033021674239U, - 4606420848538580260U, 4600921238092511730U, - 13824293274947287538U, 4606420848538580260U, - 4600206446098256018U, 4606568886807728474U, - 13829940923662504282U, 4600206446098256018U, - 4606969576261663845U, 4596954088216812973U, - 13820326125071588781U, 4606969576261663845U, - 4603017373458244943U, 4605770164172969910U, - 13829142201027745718U, 4603017373458244943U, - 4605555245917486022U, 4603339021357904144U, - 13826711058212679952U, 4605555245917486022U, - 4595436449949385485U, 4607045045516813836U, - 13830417082371589644U, 4595436449949385485U, - 4607118021058468598U, 4593265590854265407U, - 13816637627709041215U, 4607118021058468598U, - 4603738491917026584U, 4605258978359093269U, - 13828631015213869077U, 4603738491917026584U, - 4606025850160239809U, 4602502755147763107U, - 13825874792002538915U, 4606025850160239809U, - 
4598529532600161144U, 4606848731493011465U, - 13830220768347787273U, 4598529532600161144U, - 4606736437002195879U, 4599269903251194481U, - 13822641940105970289U, 4606736437002195879U, - 4601820425647934753U, 4606208206518262803U, - 13829580243373038611U, 4601820425647934753U, - 4605013567986435066U, 4604037525321326463U, - 13827409562176102271U, 4605013567986435066U, - 4590185751760970393U, 4607155938267770208U, - 13830527975122546016U, 4590185751760970393U, - 4607160003989618959U, 4589744810590291021U, - 13813116847445066829U, 4607160003989618959U, - 4604079374282302598U, 4604977468824438271U, - 13828349505679214079U, 4604079374282302598U, - 4606233055365547081U, 4601721693286060937U, - 13825093730140836745U, 4606233055365547081U, - 4599374859150636784U, 4606719100629313491U, - 13830091137484089299U, 4599374859150636784U, - 4606863472012527185U, 4598423001813699022U, - 13821795038668474830U, 4606863472012527185U, - 4602598930031891166U, 4605998608960791335U, - 13829370645815567143U, 4602598930031891166U, - 4605292980606880364U, 4603694922063032361U, - 13827066958917808169U, 4605292980606880364U, - 4593688012422887515U, 4607111255739239816U, - 13830483292594015624U, 4593688012422887515U, - 4607054494135176056U, 4595218635031890910U, - 13818590671886666718U, 4607054494135176056U, - 4603384207141321914U, 4605523422498301790U, - 13828895459353077598U, 4603384207141321914U, - 4605799732098147061U, 4602970680601913687U, - 13826342717456689495U, 4605799732098147061U, - 4597169786279785693U, 4606957467106717424U, - 13830329503961493232U, 4597169786279785693U, - 4606588777269136769U, 4600103317933788342U, - 13823475354788564150U, 4606588777269136769U, - 4601022290077223616U, 4606398451906509788U, - 13829770488761285596U, 4601022290077223616U, - 4604717681185626434U, 4604366005771528720U, - 13827738042626304528U, 4604717681185626434U, - 4583614727651146525U, 4607178985458280057U, - 13830551022313055865U, 4583614727651146525U, - 4607172882816799076U, 4586790578280679046U, - 
13810162615135454854U, 4607172882816799076U, - 4604244531615310815U, 4604830524903495634U, - 13828202561758271442U, 4604244531615310815U, - 4606329407841126011U, 4601323770373937522U, - 13824695807228713330U, 4606329407841126011U, - 4599792496117920694U, 4606646545123403481U, - 13830018581978179289U, 4599792496117920694U, - 4606919157647773535U, 4597815040470278984U, - 13821187077325054792U, 4606919157647773535U, - 4602829525820289164U, 4605886709123365959U, - 13829258745978141767U, 4602829525820289164U, - 4605426297151190466U, 4603518581031047189U, - 13826890617885822997U, 4605426297151190466U, - 4594563856311064231U, 4607080832832247697U, - 13830452869687023505U, 4594563856311064231U, - 4607088942243446236U, 4594345179472540681U, - 13817717216327316489U, 4607088942243446236U, - 4603562972219549215U, 4605393374401988274U, - 13828765411256764082U, 4603562972219549215U, - 4605915122243179241U, 4602782121393764535U, - 13826154158248540343U, 4605915122243179241U, - 4598029484874872834U, 4606905728766014348U, - 13830277765620790156U, 4598029484874872834U, - 4606665164148251002U, 4599688422741010356U, - 13823060459595786164U, 4606665164148251002U, - 4601423692641949331U, 4606305777984577632U, - 13829677814839353440U, 4601423692641949331U, - 4604867640218014515U, 4604203581176243359U, - 13827575618031019167U, 4604867640218014515U, - 4587673791460508439U, 4607170170974224083U, - 13830542207828999891U, 4587673791460508439U, - 4607141713064252300U, 4591507261658050721U, - 13814879298512826529U, 4607141713064252300U, - 4603910660507251362U, 4605120315324767624U, - 13828492352179543432U, 4603910660507251362U, - 4606131849150971908U, 4602114767134999006U, - 13825486803989774814U, 4606131849150971908U, - 4598953786765296928U, 4606786509620734768U, - 13830158546475510576U, 4598953786765296928U, - 4606802552898869248U, 4598848011564831930U, - 13822220048419607738U, 4606802552898869248U, - 4602212250118051877U, 4606105796280968177U, - 13829477833135743985U, 4602212250118051877U, - 
4605155376589456981U, 4603867938232615808U, - 13827239975087391616U, 4605155376589456981U, - 4591947271803021404U, 4607136295912168606U, - 13830508332766944414U, 4591947271803021404U, - 4607014697483910382U, 4596088445927168004U, - 13819460482781943812U, 4607014697483910382U, - 4603202304363743346U, 4605649044311923410U, - 13829021081166699218U, 4603202304363743346U, - 4605679749231851918U, 4603156351203636159U, - 13826528388058411967U, 4605679749231851918U, - 4596305267720071930U, 4607003915349878877U, - 13830375952204654685U, 4596305267720071930U, - 4606507322377452870U, 4600514338912178239U, - 13823886375766954047U, 4606507322377452870U, - 4600616459743653188U, 4606486172460753999U, - 13829858209315529807U, 4600616459743653188U, - 4604563781218984604U, 4604524701268679793U, - 13827896738123455601U, 4604563781218984604U, - 4569220649180767418U, 4607182376410422530U, - 13830554413265198338U, 4569220649180767418U -}; - -const fpr fpr_p2_tab[] = { - 4611686018427387904U, - 4607182418800017408U, - 4602678819172646912U, - 4598175219545276416U, - 4593671619917905920U, - 4589168020290535424U, - 4584664420663164928U, - 4580160821035794432U, - 4575657221408423936U, - 4571153621781053440U, - 4566650022153682944U -}; - -#elif FALCON_FPNATIVE // yyyFPEMU+0 yyyFPNATIVE+1 - -const fpr fpr_gm_tab[] = { - {0}, {0}, /* unused */ - {-0.000000000000000000000000000}, { 1.000000000000000000000000000}, - { 0.707106781186547524400844362}, { 0.707106781186547524400844362}, - {-0.707106781186547524400844362}, { 0.707106781186547524400844362}, - { 0.923879532511286756128183189}, { 0.382683432365089771728459984}, - {-0.382683432365089771728459984}, { 0.923879532511286756128183189}, - { 0.382683432365089771728459984}, { 0.923879532511286756128183189}, - {-0.923879532511286756128183189}, { 0.382683432365089771728459984}, - { 0.980785280403230449126182236}, { 0.195090322016128267848284868}, - {-0.195090322016128267848284868}, { 0.980785280403230449126182236}, - { 
0.555570233019602224742830814}, { 0.831469612302545237078788378}, - {-0.831469612302545237078788378}, { 0.555570233019602224742830814}, - { 0.831469612302545237078788378}, { 0.555570233019602224742830814}, - {-0.555570233019602224742830814}, { 0.831469612302545237078788378}, - { 0.195090322016128267848284868}, { 0.980785280403230449126182236}, - {-0.980785280403230449126182236}, { 0.195090322016128267848284868}, - { 0.995184726672196886244836953}, { 0.098017140329560601994195564}, - {-0.098017140329560601994195564}, { 0.995184726672196886244836953}, - { 0.634393284163645498215171613}, { 0.773010453362736960810906610}, - {-0.773010453362736960810906610}, { 0.634393284163645498215171613}, - { 0.881921264348355029712756864}, { 0.471396736825997648556387626}, - {-0.471396736825997648556387626}, { 0.881921264348355029712756864}, - { 0.290284677254462367636192376}, { 0.956940335732208864935797887}, - {-0.956940335732208864935797887}, { 0.290284677254462367636192376}, - { 0.956940335732208864935797887}, { 0.290284677254462367636192376}, - {-0.290284677254462367636192376}, { 0.956940335732208864935797887}, - { 0.471396736825997648556387626}, { 0.881921264348355029712756864}, - {-0.881921264348355029712756864}, { 0.471396736825997648556387626}, - { 0.773010453362736960810906610}, { 0.634393284163645498215171613}, - {-0.634393284163645498215171613}, { 0.773010453362736960810906610}, - { 0.098017140329560601994195564}, { 0.995184726672196886244836953}, - {-0.995184726672196886244836953}, { 0.098017140329560601994195564}, - { 0.998795456205172392714771605}, { 0.049067674327418014254954977}, - {-0.049067674327418014254954977}, { 0.998795456205172392714771605}, - { 0.671558954847018400625376850}, { 0.740951125354959091175616897}, - {-0.740951125354959091175616897}, { 0.671558954847018400625376850}, - { 0.903989293123443331586200297}, { 0.427555093430282094320966857}, - {-0.427555093430282094320966857}, { 0.903989293123443331586200297}, - { 0.336889853392220050689253213}, { 
0.941544065183020778412509403}, - {-0.941544065183020778412509403}, { 0.336889853392220050689253213}, - { 0.970031253194543992603984207}, { 0.242980179903263889948274162}, - {-0.242980179903263889948274162}, { 0.970031253194543992603984207}, - { 0.514102744193221726593693839}, { 0.857728610000272069902269984}, - {-0.857728610000272069902269984}, { 0.514102744193221726593693839}, - { 0.803207531480644909806676513}, { 0.595699304492433343467036529}, - {-0.595699304492433343467036529}, { 0.803207531480644909806676513}, - { 0.146730474455361751658850130}, { 0.989176509964780973451673738}, - {-0.989176509964780973451673738}, { 0.146730474455361751658850130}, - { 0.989176509964780973451673738}, { 0.146730474455361751658850130}, - {-0.146730474455361751658850130}, { 0.989176509964780973451673738}, - { 0.595699304492433343467036529}, { 0.803207531480644909806676513}, - {-0.803207531480644909806676513}, { 0.595699304492433343467036529}, - { 0.857728610000272069902269984}, { 0.514102744193221726593693839}, - {-0.514102744193221726593693839}, { 0.857728610000272069902269984}, - { 0.242980179903263889948274162}, { 0.970031253194543992603984207}, - {-0.970031253194543992603984207}, { 0.242980179903263889948274162}, - { 0.941544065183020778412509403}, { 0.336889853392220050689253213}, - {-0.336889853392220050689253213}, { 0.941544065183020778412509403}, - { 0.427555093430282094320966857}, { 0.903989293123443331586200297}, - {-0.903989293123443331586200297}, { 0.427555093430282094320966857}, - { 0.740951125354959091175616897}, { 0.671558954847018400625376850}, - {-0.671558954847018400625376850}, { 0.740951125354959091175616897}, - { 0.049067674327418014254954977}, { 0.998795456205172392714771605}, - {-0.998795456205172392714771605}, { 0.049067674327418014254954977}, - { 0.999698818696204220115765650}, { 0.024541228522912288031734529}, - {-0.024541228522912288031734529}, { 0.999698818696204220115765650}, - { 0.689540544737066924616730630}, { 0.724247082951466920941069243}, - 
{-0.724247082951466920941069243}, { 0.689540544737066924616730630}, - { 0.914209755703530654635014829}, { 0.405241314004989870908481306}, - {-0.405241314004989870908481306}, { 0.914209755703530654635014829}, - { 0.359895036534988148775104572}, { 0.932992798834738887711660256}, - {-0.932992798834738887711660256}, { 0.359895036534988148775104572}, - { 0.975702130038528544460395766}, { 0.219101240156869797227737547}, - {-0.219101240156869797227737547}, { 0.975702130038528544460395766}, - { 0.534997619887097210663076905}, { 0.844853565249707073259571205}, - {-0.844853565249707073259571205}, { 0.534997619887097210663076905}, - { 0.817584813151583696504920884}, { 0.575808191417845300745972454}, - {-0.575808191417845300745972454}, { 0.817584813151583696504920884}, - { 0.170961888760301226363642357}, { 0.985277642388941244774018433}, - {-0.985277642388941244774018433}, { 0.170961888760301226363642357}, - { 0.992479534598709998156767252}, { 0.122410675199216198498704474}, - {-0.122410675199216198498704474}, { 0.992479534598709998156767252}, - { 0.615231590580626845484913563}, { 0.788346427626606262009164705}, - {-0.788346427626606262009164705}, { 0.615231590580626845484913563}, - { 0.870086991108711418652292404}, { 0.492898192229784036873026689}, - {-0.492898192229784036873026689}, { 0.870086991108711418652292404}, - { 0.266712757474898386325286515}, { 0.963776065795439866686464356}, - {-0.963776065795439866686464356}, { 0.266712757474898386325286515}, - { 0.949528180593036667195936074}, { 0.313681740398891476656478846}, - {-0.313681740398891476656478846}, { 0.949528180593036667195936074}, - { 0.449611329654606600046294579}, { 0.893224301195515320342416447}, - {-0.893224301195515320342416447}, { 0.449611329654606600046294579}, - { 0.757208846506484547575464054}, { 0.653172842953776764084203014}, - {-0.653172842953776764084203014}, { 0.757208846506484547575464054}, - { 0.073564563599667423529465622}, { 0.997290456678690216135597140}, - {-0.997290456678690216135597140}, { 
0.073564563599667423529465622}, - { 0.997290456678690216135597140}, { 0.073564563599667423529465622}, - {-0.073564563599667423529465622}, { 0.997290456678690216135597140}, - { 0.653172842953776764084203014}, { 0.757208846506484547575464054}, - {-0.757208846506484547575464054}, { 0.653172842953776764084203014}, - { 0.893224301195515320342416447}, { 0.449611329654606600046294579}, - {-0.449611329654606600046294579}, { 0.893224301195515320342416447}, - { 0.313681740398891476656478846}, { 0.949528180593036667195936074}, - {-0.949528180593036667195936074}, { 0.313681740398891476656478846}, - { 0.963776065795439866686464356}, { 0.266712757474898386325286515}, - {-0.266712757474898386325286515}, { 0.963776065795439866686464356}, - { 0.492898192229784036873026689}, { 0.870086991108711418652292404}, - {-0.870086991108711418652292404}, { 0.492898192229784036873026689}, - { 0.788346427626606262009164705}, { 0.615231590580626845484913563}, - {-0.615231590580626845484913563}, { 0.788346427626606262009164705}, - { 0.122410675199216198498704474}, { 0.992479534598709998156767252}, - {-0.992479534598709998156767252}, { 0.122410675199216198498704474}, - { 0.985277642388941244774018433}, { 0.170961888760301226363642357}, - {-0.170961888760301226363642357}, { 0.985277642388941244774018433}, - { 0.575808191417845300745972454}, { 0.817584813151583696504920884}, - {-0.817584813151583696504920884}, { 0.575808191417845300745972454}, - { 0.844853565249707073259571205}, { 0.534997619887097210663076905}, - {-0.534997619887097210663076905}, { 0.844853565249707073259571205}, - { 0.219101240156869797227737547}, { 0.975702130038528544460395766}, - {-0.975702130038528544460395766}, { 0.219101240156869797227737547}, - { 0.932992798834738887711660256}, { 0.359895036534988148775104572}, - {-0.359895036534988148775104572}, { 0.932992798834738887711660256}, - { 0.405241314004989870908481306}, { 0.914209755703530654635014829}, - {-0.914209755703530654635014829}, { 0.405241314004989870908481306}, - { 
0.724247082951466920941069243}, { 0.689540544737066924616730630}, - {-0.689540544737066924616730630}, { 0.724247082951466920941069243}, - { 0.024541228522912288031734529}, { 0.999698818696204220115765650}, - {-0.999698818696204220115765650}, { 0.024541228522912288031734529}, - { 0.999924701839144540921646491}, { 0.012271538285719926079408262}, - {-0.012271538285719926079408262}, { 0.999924701839144540921646491}, - { 0.698376249408972853554813503}, { 0.715730825283818654125532623}, - {-0.715730825283818654125532623}, { 0.698376249408972853554813503}, - { 0.919113851690057743908477789}, { 0.393992040061048108596188661}, - {-0.393992040061048108596188661}, { 0.919113851690057743908477789}, - { 0.371317193951837543411934967}, { 0.928506080473215565937167396}, - {-0.928506080473215565937167396}, { 0.371317193951837543411934967}, - { 0.978317370719627633106240097}, { 0.207111376192218549708116020}, - {-0.207111376192218549708116020}, { 0.978317370719627633106240097}, - { 0.545324988422046422313987347}, { 0.838224705554838043186996856}, - {-0.838224705554838043186996856}, { 0.545324988422046422313987347}, - { 0.824589302785025264474803737}, { 0.565731810783613197389765011}, - {-0.565731810783613197389765011}, { 0.824589302785025264474803737}, - { 0.183039887955140958516532578}, { 0.983105487431216327180301155}, - {-0.983105487431216327180301155}, { 0.183039887955140958516532578}, - { 0.993906970002356041546922813}, { 0.110222207293883058807899140}, - {-0.110222207293883058807899140}, { 0.993906970002356041546922813}, - { 0.624859488142386377084072816}, { 0.780737228572094478301588484}, - {-0.780737228572094478301588484}, { 0.624859488142386377084072816}, - { 0.876070094195406607095844268}, { 0.482183772079122748517344481}, - {-0.482183772079122748517344481}, { 0.876070094195406607095844268}, - { 0.278519689385053105207848526}, { 0.960430519415565811199035138}, - {-0.960430519415565811199035138}, { 0.278519689385053105207848526}, - { 0.953306040354193836916740383}, { 
0.302005949319228067003463232}, - {-0.302005949319228067003463232}, { 0.953306040354193836916740383}, - { 0.460538710958240023633181487}, { 0.887639620402853947760181617}, - {-0.887639620402853947760181617}, { 0.460538710958240023633181487}, - { 0.765167265622458925888815999}, { 0.643831542889791465068086063}, - {-0.643831542889791465068086063}, { 0.765167265622458925888815999}, - { 0.085797312344439890461556332}, { 0.996312612182778012627226190}, - {-0.996312612182778012627226190}, { 0.085797312344439890461556332}, - { 0.998118112900149207125155861}, { 0.061320736302208577782614593}, - {-0.061320736302208577782614593}, { 0.998118112900149207125155861}, - { 0.662415777590171761113069817}, { 0.749136394523459325469203257}, - {-0.749136394523459325469203257}, { 0.662415777590171761113069817}, - { 0.898674465693953843041976744}, { 0.438616238538527637647025738}, - {-0.438616238538527637647025738}, { 0.898674465693953843041976744}, - { 0.325310292162262934135954708}, { 0.945607325380521325730945387}, - {-0.945607325380521325730945387}, { 0.325310292162262934135954708}, - { 0.966976471044852109087220226}, { 0.254865659604514571553980779}, - {-0.254865659604514571553980779}, { 0.966976471044852109087220226}, - { 0.503538383725717558691867071}, { 0.863972856121586737918147054}, - {-0.863972856121586737918147054}, { 0.503538383725717558691867071}, - { 0.795836904608883536262791915}, { 0.605511041404325513920626941}, - {-0.605511041404325513920626941}, { 0.795836904608883536262791915}, - { 0.134580708507126186316358409}, { 0.990902635427780025108237011}, - {-0.990902635427780025108237011}, { 0.134580708507126186316358409}, - { 0.987301418157858382399815802}, { 0.158858143333861441684385360}, - {-0.158858143333861441684385360}, { 0.987301418157858382399815802}, - { 0.585797857456438860328080838}, { 0.810457198252594791726703434}, - {-0.810457198252594791726703434}, { 0.585797857456438860328080838}, - { 0.851355193105265142261290312}, { 0.524589682678468906215098464}, - 
{-0.524589682678468906215098464}, { 0.851355193105265142261290312}, - { 0.231058108280671119643236018}, { 0.972939952205560145467720114}, - {-0.972939952205560145467720114}, { 0.231058108280671119643236018}, - { 0.937339011912574923201899593}, { 0.348418680249434568419308588}, - {-0.348418680249434568419308588}, { 0.937339011912574923201899593}, - { 0.416429560097637182562598911}, { 0.909167983090522376563884788}, - {-0.909167983090522376563884788}, { 0.416429560097637182562598911}, - { 0.732654271672412834615546649}, { 0.680600997795453050594430464}, - {-0.680600997795453050594430464}, { 0.732654271672412834615546649}, - { 0.036807222941358832324332691}, { 0.999322384588349500896221011}, - {-0.999322384588349500896221011}, { 0.036807222941358832324332691}, - { 0.999322384588349500896221011}, { 0.036807222941358832324332691}, - {-0.036807222941358832324332691}, { 0.999322384588349500896221011}, - { 0.680600997795453050594430464}, { 0.732654271672412834615546649}, - {-0.732654271672412834615546649}, { 0.680600997795453050594430464}, - { 0.909167983090522376563884788}, { 0.416429560097637182562598911}, - {-0.416429560097637182562598911}, { 0.909167983090522376563884788}, - { 0.348418680249434568419308588}, { 0.937339011912574923201899593}, - {-0.937339011912574923201899593}, { 0.348418680249434568419308588}, - { 0.972939952205560145467720114}, { 0.231058108280671119643236018}, - {-0.231058108280671119643236018}, { 0.972939952205560145467720114}, - { 0.524589682678468906215098464}, { 0.851355193105265142261290312}, - {-0.851355193105265142261290312}, { 0.524589682678468906215098464}, - { 0.810457198252594791726703434}, { 0.585797857456438860328080838}, - {-0.585797857456438860328080838}, { 0.810457198252594791726703434}, - { 0.158858143333861441684385360}, { 0.987301418157858382399815802}, - {-0.987301418157858382399815802}, { 0.158858143333861441684385360}, - { 0.990902635427780025108237011}, { 0.134580708507126186316358409}, - {-0.134580708507126186316358409}, { 
0.990902635427780025108237011}, - { 0.605511041404325513920626941}, { 0.795836904608883536262791915}, - {-0.795836904608883536262791915}, { 0.605511041404325513920626941}, - { 0.863972856121586737918147054}, { 0.503538383725717558691867071}, - {-0.503538383725717558691867071}, { 0.863972856121586737918147054}, - { 0.254865659604514571553980779}, { 0.966976471044852109087220226}, - {-0.966976471044852109087220226}, { 0.254865659604514571553980779}, - { 0.945607325380521325730945387}, { 0.325310292162262934135954708}, - {-0.325310292162262934135954708}, { 0.945607325380521325730945387}, - { 0.438616238538527637647025738}, { 0.898674465693953843041976744}, - {-0.898674465693953843041976744}, { 0.438616238538527637647025738}, - { 0.749136394523459325469203257}, { 0.662415777590171761113069817}, - {-0.662415777590171761113069817}, { 0.749136394523459325469203257}, - { 0.061320736302208577782614593}, { 0.998118112900149207125155861}, - {-0.998118112900149207125155861}, { 0.061320736302208577782614593}, - { 0.996312612182778012627226190}, { 0.085797312344439890461556332}, - {-0.085797312344439890461556332}, { 0.996312612182778012627226190}, - { 0.643831542889791465068086063}, { 0.765167265622458925888815999}, - {-0.765167265622458925888815999}, { 0.643831542889791465068086063}, - { 0.887639620402853947760181617}, { 0.460538710958240023633181487}, - {-0.460538710958240023633181487}, { 0.887639620402853947760181617}, - { 0.302005949319228067003463232}, { 0.953306040354193836916740383}, - {-0.953306040354193836916740383}, { 0.302005949319228067003463232}, - { 0.960430519415565811199035138}, { 0.278519689385053105207848526}, - {-0.278519689385053105207848526}, { 0.960430519415565811199035138}, - { 0.482183772079122748517344481}, { 0.876070094195406607095844268}, - {-0.876070094195406607095844268}, { 0.482183772079122748517344481}, - { 0.780737228572094478301588484}, { 0.624859488142386377084072816}, - {-0.624859488142386377084072816}, { 0.780737228572094478301588484}, - { 
0.110222207293883058807899140}, { 0.993906970002356041546922813}, - {-0.993906970002356041546922813}, { 0.110222207293883058807899140}, - { 0.983105487431216327180301155}, { 0.183039887955140958516532578}, - {-0.183039887955140958516532578}, { 0.983105487431216327180301155}, - { 0.565731810783613197389765011}, { 0.824589302785025264474803737}, - {-0.824589302785025264474803737}, { 0.565731810783613197389765011}, - { 0.838224705554838043186996856}, { 0.545324988422046422313987347}, - {-0.545324988422046422313987347}, { 0.838224705554838043186996856}, - { 0.207111376192218549708116020}, { 0.978317370719627633106240097}, - {-0.978317370719627633106240097}, { 0.207111376192218549708116020}, - { 0.928506080473215565937167396}, { 0.371317193951837543411934967}, - {-0.371317193951837543411934967}, { 0.928506080473215565937167396}, - { 0.393992040061048108596188661}, { 0.919113851690057743908477789}, - {-0.919113851690057743908477789}, { 0.393992040061048108596188661}, - { 0.715730825283818654125532623}, { 0.698376249408972853554813503}, - {-0.698376249408972853554813503}, { 0.715730825283818654125532623}, - { 0.012271538285719926079408262}, { 0.999924701839144540921646491}, - {-0.999924701839144540921646491}, { 0.012271538285719926079408262}, - { 0.999981175282601142656990438}, { 0.006135884649154475359640235}, - {-0.006135884649154475359640235}, { 0.999981175282601142656990438}, - { 0.702754744457225302452914421}, { 0.711432195745216441522130290}, - {-0.711432195745216441522130290}, { 0.702754744457225302452914421}, - { 0.921514039342041943465396332}, { 0.388345046698826291624993541}, - {-0.388345046698826291624993541}, { 0.921514039342041943465396332}, - { 0.377007410216418256726567823}, { 0.926210242138311341974793388}, - {-0.926210242138311341974793388}, { 0.377007410216418256726567823}, - { 0.979569765685440534439326110}, { 0.201104634842091911558443546}, - {-0.201104634842091911558443546}, { 0.979569765685440534439326110}, - { 0.550457972936604802977289893}, { 
0.834862874986380056304401383}, - {-0.834862874986380056304401383}, { 0.550457972936604802977289893}, - { 0.828045045257755752067527592}, { 0.560661576197336023839710223}, - {-0.560661576197336023839710223}, { 0.828045045257755752067527592}, - { 0.189068664149806212754997837}, { 0.981963869109555264072848154}, - {-0.981963869109555264072848154}, { 0.189068664149806212754997837}, - { 0.994564570734255452119106243}, { 0.104121633872054579120943880}, - {-0.104121633872054579120943880}, { 0.994564570734255452119106243}, - { 0.629638238914927025372981341}, { 0.776888465673232450040827983}, - {-0.776888465673232450040827983}, { 0.629638238914927025372981341}, - { 0.879012226428633477831323711}, { 0.476799230063322133342158117}, - {-0.476799230063322133342158117}, { 0.879012226428633477831323711}, - { 0.284407537211271843618310615}, { 0.958703474895871555374645792}, - {-0.958703474895871555374645792}, { 0.284407537211271843618310615}, - { 0.955141168305770721498157712}, { 0.296150888243623824121786128}, - {-0.296150888243623824121786128}, { 0.955141168305770721498157712}, - { 0.465976495767966177902756065}, { 0.884797098430937780104007041}, - {-0.884797098430937780104007041}, { 0.465976495767966177902756065}, - { 0.769103337645579639346626069}, { 0.639124444863775743801488193}, - {-0.639124444863775743801488193}, { 0.769103337645579639346626069}, - { 0.091908956497132728624990979}, { 0.995767414467659793982495643}, - {-0.995767414467659793982495643}, { 0.091908956497132728624990979}, - { 0.998475580573294752208559038}, { 0.055195244349689939809447526}, - {-0.055195244349689939809447526}, { 0.998475580573294752208559038}, - { 0.666999922303637506650154222}, { 0.745057785441465962407907310}, - {-0.745057785441465962407907310}, { 0.666999922303637506650154222}, - { 0.901348847046022014570746093}, { 0.433093818853151968484222638}, - {-0.433093818853151968484222638}, { 0.901348847046022014570746093}, - { 0.331106305759876401737190737}, { 0.943593458161960361495301445}, - 
{-0.943593458161960361495301445}, { 0.331106305759876401737190737}, - { 0.968522094274417316221088329}, { 0.248927605745720168110682816}, - {-0.248927605745720168110682816}, { 0.968522094274417316221088329}, - { 0.508830142543107036931749324}, { 0.860866938637767279344583877}, - {-0.860866938637767279344583877}, { 0.508830142543107036931749324}, - { 0.799537269107905033500246232}, { 0.600616479383868926653875896}, - {-0.600616479383868926653875896}, { 0.799537269107905033500246232}, - { 0.140658239332849230714788846}, { 0.990058210262297105505906464}, - {-0.990058210262297105505906464}, { 0.140658239332849230714788846}, - { 0.988257567730749491404792538}, { 0.152797185258443427720336613}, - {-0.152797185258443427720336613}, { 0.988257567730749491404792538}, - { 0.590759701858874228423887908}, { 0.806847553543799272206514313}, - {-0.806847553543799272206514313}, { 0.590759701858874228423887908}, - { 0.854557988365400520767862276}, { 0.519355990165589587361829932}, - {-0.519355990165589587361829932}, { 0.854557988365400520767862276}, - { 0.237023605994367206867735915}, { 0.971503890986251775537099622}, - {-0.971503890986251775537099622}, { 0.237023605994367206867735915}, - { 0.939459223602189911962669246}, { 0.342660717311994397592781983}, - {-0.342660717311994397592781983}, { 0.939459223602189911962669246}, - { 0.422000270799799685941287941}, { 0.906595704514915365332960588}, - {-0.906595704514915365332960588}, { 0.422000270799799685941287941}, - { 0.736816568877369875090132520}, { 0.676092703575315960360419228}, - {-0.676092703575315960360419228}, { 0.736816568877369875090132520}, - { 0.042938256934940823077124540}, { 0.999077727752645382888781997}, - {-0.999077727752645382888781997}, { 0.042938256934940823077124540}, - { 0.999529417501093163079703322}, { 0.030674803176636625934021028}, - {-0.030674803176636625934021028}, { 0.999529417501093163079703322}, - { 0.685083667772700381362052545}, { 0.728464390448225196492035438}, - {-0.728464390448225196492035438}, { 
0.685083667772700381362052545}, - { 0.911706032005429851404397325}, { 0.410843171057903942183466675}, - {-0.410843171057903942183466675}, { 0.911706032005429851404397325}, - { 0.354163525420490382357395796}, { 0.935183509938947577642207480}, - {-0.935183509938947577642207480}, { 0.354163525420490382357395796}, - { 0.974339382785575860518721668}, { 0.225083911359792835991642120}, - {-0.225083911359792835991642120}, { 0.974339382785575860518721668}, - { 0.529803624686294668216054671}, { 0.848120344803297251279133563}, - {-0.848120344803297251279133563}, { 0.529803624686294668216054671}, - { 0.814036329705948361654516690}, { 0.580813958095764545075595272}, - {-0.580813958095764545075595272}, { 0.814036329705948361654516690}, - { 0.164913120489969921418189113}, { 0.986308097244598647863297524}, - {-0.986308097244598647863297524}, { 0.164913120489969921418189113}, - { 0.991709753669099522860049931}, { 0.128498110793793172624415589}, - {-0.128498110793793172624415589}, { 0.991709753669099522860049931}, - { 0.610382806276309452716352152}, { 0.792106577300212351782342879}, - {-0.792106577300212351782342879}, { 0.610382806276309452716352152}, - { 0.867046245515692651480195629}, { 0.498227666972781852410983869}, - {-0.498227666972781852410983869}, { 0.867046245515692651480195629}, - { 0.260794117915275518280186509}, { 0.965394441697689374550843858}, - {-0.965394441697689374550843858}, { 0.260794117915275518280186509}, - { 0.947585591017741134653387321}, { 0.319502030816015677901518272}, - {-0.319502030816015677901518272}, { 0.947585591017741134653387321}, - { 0.444122144570429231642069418}, { 0.895966249756185155914560282}, - {-0.895966249756185155914560282}, { 0.444122144570429231642069418}, - { 0.753186799043612482483430486}, { 0.657806693297078656931182264}, - {-0.657806693297078656931182264}, { 0.753186799043612482483430486}, - { 0.067443919563664057897972422}, { 0.997723066644191609848546728}, - {-0.997723066644191609848546728}, { 0.067443919563664057897972422}, - { 
0.996820299291165714972629398}, { 0.079682437971430121147120656}, - {-0.079682437971430121147120656}, { 0.996820299291165714972629398}, - { 0.648514401022112445084560551}, { 0.761202385484261814029709836}, - {-0.761202385484261814029709836}, { 0.648514401022112445084560551}, - { 0.890448723244757889952150560}, { 0.455083587126343823535869268}, - {-0.455083587126343823535869268}, { 0.890448723244757889952150560}, - { 0.307849640041534893682063646}, { 0.951435020969008369549175569}, - {-0.951435020969008369549175569}, { 0.307849640041534893682063646}, - { 0.962121404269041595429604316}, { 0.272621355449948984493347477}, - {-0.272621355449948984493347477}, { 0.962121404269041595429604316}, - { 0.487550160148435954641485027}, { 0.873094978418290098636085973}, - {-0.873094978418290098636085973}, { 0.487550160148435954641485027}, - { 0.784556597155575233023892575}, { 0.620057211763289178646268191}, - {-0.620057211763289178646268191}, { 0.784556597155575233023892575}, - { 0.116318630911904767252544319}, { 0.993211949234794533104601012}, - {-0.993211949234794533104601012}, { 0.116318630911904767252544319}, - { 0.984210092386929073193874387}, { 0.177004220412148756196839844}, - {-0.177004220412148756196839844}, { 0.984210092386929073193874387}, - { 0.570780745886967280232652864}, { 0.821102514991104679060430820}, - {-0.821102514991104679060430820}, { 0.570780745886967280232652864}, - { 0.841554977436898409603499520}, { 0.540171472729892881297845480}, - {-0.540171472729892881297845480}, { 0.841554977436898409603499520}, - { 0.213110319916091373967757518}, { 0.977028142657754351485866211}, - {-0.977028142657754351485866211}, { 0.213110319916091373967757518}, - { 0.930766961078983731944872340}, { 0.365612997804773870011745909}, - {-0.365612997804773870011745909}, { 0.930766961078983731944872340}, - { 0.399624199845646828544117031}, { 0.916679059921042663116457013}, - {-0.916679059921042663116457013}, { 0.399624199845646828544117031}, - { 0.720002507961381629076682999}, { 
0.693971460889654009003734389}, - {-0.693971460889654009003734389}, { 0.720002507961381629076682999}, - { 0.018406729905804820927366313}, { 0.999830581795823422015722275}, - {-0.999830581795823422015722275}, { 0.018406729905804820927366313}, - { 0.999830581795823422015722275}, { 0.018406729905804820927366313}, - {-0.018406729905804820927366313}, { 0.999830581795823422015722275}, - { 0.693971460889654009003734389}, { 0.720002507961381629076682999}, - {-0.720002507961381629076682999}, { 0.693971460889654009003734389}, - { 0.916679059921042663116457013}, { 0.399624199845646828544117031}, - {-0.399624199845646828544117031}, { 0.916679059921042663116457013}, - { 0.365612997804773870011745909}, { 0.930766961078983731944872340}, - {-0.930766961078983731944872340}, { 0.365612997804773870011745909}, - { 0.977028142657754351485866211}, { 0.213110319916091373967757518}, - {-0.213110319916091373967757518}, { 0.977028142657754351485866211}, - { 0.540171472729892881297845480}, { 0.841554977436898409603499520}, - {-0.841554977436898409603499520}, { 0.540171472729892881297845480}, - { 0.821102514991104679060430820}, { 0.570780745886967280232652864}, - {-0.570780745886967280232652864}, { 0.821102514991104679060430820}, - { 0.177004220412148756196839844}, { 0.984210092386929073193874387}, - {-0.984210092386929073193874387}, { 0.177004220412148756196839844}, - { 0.993211949234794533104601012}, { 0.116318630911904767252544319}, - {-0.116318630911904767252544319}, { 0.993211949234794533104601012}, - { 0.620057211763289178646268191}, { 0.784556597155575233023892575}, - {-0.784556597155575233023892575}, { 0.620057211763289178646268191}, - { 0.873094978418290098636085973}, { 0.487550160148435954641485027}, - {-0.487550160148435954641485027}, { 0.873094978418290098636085973}, - { 0.272621355449948984493347477}, { 0.962121404269041595429604316}, - {-0.962121404269041595429604316}, { 0.272621355449948984493347477}, - { 0.951435020969008369549175569}, { 0.307849640041534893682063646}, - 
{-0.307849640041534893682063646}, { 0.951435020969008369549175569}, - { 0.455083587126343823535869268}, { 0.890448723244757889952150560}, - {-0.890448723244757889952150560}, { 0.455083587126343823535869268}, - { 0.761202385484261814029709836}, { 0.648514401022112445084560551}, - {-0.648514401022112445084560551}, { 0.761202385484261814029709836}, - { 0.079682437971430121147120656}, { 0.996820299291165714972629398}, - {-0.996820299291165714972629398}, { 0.079682437971430121147120656}, - { 0.997723066644191609848546728}, { 0.067443919563664057897972422}, - {-0.067443919563664057897972422}, { 0.997723066644191609848546728}, - { 0.657806693297078656931182264}, { 0.753186799043612482483430486}, - {-0.753186799043612482483430486}, { 0.657806693297078656931182264}, - { 0.895966249756185155914560282}, { 0.444122144570429231642069418}, - {-0.444122144570429231642069418}, { 0.895966249756185155914560282}, - { 0.319502030816015677901518272}, { 0.947585591017741134653387321}, - {-0.947585591017741134653387321}, { 0.319502030816015677901518272}, - { 0.965394441697689374550843858}, { 0.260794117915275518280186509}, - {-0.260794117915275518280186509}, { 0.965394441697689374550843858}, - { 0.498227666972781852410983869}, { 0.867046245515692651480195629}, - {-0.867046245515692651480195629}, { 0.498227666972781852410983869}, - { 0.792106577300212351782342879}, { 0.610382806276309452716352152}, - {-0.610382806276309452716352152}, { 0.792106577300212351782342879}, - { 0.128498110793793172624415589}, { 0.991709753669099522860049931}, - {-0.991709753669099522860049931}, { 0.128498110793793172624415589}, - { 0.986308097244598647863297524}, { 0.164913120489969921418189113}, - {-0.164913120489969921418189113}, { 0.986308097244598647863297524}, - { 0.580813958095764545075595272}, { 0.814036329705948361654516690}, - {-0.814036329705948361654516690}, { 0.580813958095764545075595272}, - { 0.848120344803297251279133563}, { 0.529803624686294668216054671}, - {-0.529803624686294668216054671}, { 
0.848120344803297251279133563}, - { 0.225083911359792835991642120}, { 0.974339382785575860518721668}, - {-0.974339382785575860518721668}, { 0.225083911359792835991642120}, - { 0.935183509938947577642207480}, { 0.354163525420490382357395796}, - {-0.354163525420490382357395796}, { 0.935183509938947577642207480}, - { 0.410843171057903942183466675}, { 0.911706032005429851404397325}, - {-0.911706032005429851404397325}, { 0.410843171057903942183466675}, - { 0.728464390448225196492035438}, { 0.685083667772700381362052545}, - {-0.685083667772700381362052545}, { 0.728464390448225196492035438}, - { 0.030674803176636625934021028}, { 0.999529417501093163079703322}, - {-0.999529417501093163079703322}, { 0.030674803176636625934021028}, - { 0.999077727752645382888781997}, { 0.042938256934940823077124540}, - {-0.042938256934940823077124540}, { 0.999077727752645382888781997}, - { 0.676092703575315960360419228}, { 0.736816568877369875090132520}, - {-0.736816568877369875090132520}, { 0.676092703575315960360419228}, - { 0.906595704514915365332960588}, { 0.422000270799799685941287941}, - {-0.422000270799799685941287941}, { 0.906595704514915365332960588}, - { 0.342660717311994397592781983}, { 0.939459223602189911962669246}, - {-0.939459223602189911962669246}, { 0.342660717311994397592781983}, - { 0.971503890986251775537099622}, { 0.237023605994367206867735915}, - {-0.237023605994367206867735915}, { 0.971503890986251775537099622}, - { 0.519355990165589587361829932}, { 0.854557988365400520767862276}, - {-0.854557988365400520767862276}, { 0.519355990165589587361829932}, - { 0.806847553543799272206514313}, { 0.590759701858874228423887908}, - {-0.590759701858874228423887908}, { 0.806847553543799272206514313}, - { 0.152797185258443427720336613}, { 0.988257567730749491404792538}, - {-0.988257567730749491404792538}, { 0.152797185258443427720336613}, - { 0.990058210262297105505906464}, { 0.140658239332849230714788846}, - {-0.140658239332849230714788846}, { 0.990058210262297105505906464}, - { 
0.600616479383868926653875896}, { 0.799537269107905033500246232}, - {-0.799537269107905033500246232}, { 0.600616479383868926653875896}, - { 0.860866938637767279344583877}, { 0.508830142543107036931749324}, - {-0.508830142543107036931749324}, { 0.860866938637767279344583877}, - { 0.248927605745720168110682816}, { 0.968522094274417316221088329}, - {-0.968522094274417316221088329}, { 0.248927605745720168110682816}, - { 0.943593458161960361495301445}, { 0.331106305759876401737190737}, - {-0.331106305759876401737190737}, { 0.943593458161960361495301445}, - { 0.433093818853151968484222638}, { 0.901348847046022014570746093}, - {-0.901348847046022014570746093}, { 0.433093818853151968484222638}, - { 0.745057785441465962407907310}, { 0.666999922303637506650154222}, - {-0.666999922303637506650154222}, { 0.745057785441465962407907310}, - { 0.055195244349689939809447526}, { 0.998475580573294752208559038}, - {-0.998475580573294752208559038}, { 0.055195244349689939809447526}, - { 0.995767414467659793982495643}, { 0.091908956497132728624990979}, - {-0.091908956497132728624990979}, { 0.995767414467659793982495643}, - { 0.639124444863775743801488193}, { 0.769103337645579639346626069}, - {-0.769103337645579639346626069}, { 0.639124444863775743801488193}, - { 0.884797098430937780104007041}, { 0.465976495767966177902756065}, - {-0.465976495767966177902756065}, { 0.884797098430937780104007041}, - { 0.296150888243623824121786128}, { 0.955141168305770721498157712}, - {-0.955141168305770721498157712}, { 0.296150888243623824121786128}, - { 0.958703474895871555374645792}, { 0.284407537211271843618310615}, - {-0.284407537211271843618310615}, { 0.958703474895871555374645792}, - { 0.476799230063322133342158117}, { 0.879012226428633477831323711}, - {-0.879012226428633477831323711}, { 0.476799230063322133342158117}, - { 0.776888465673232450040827983}, { 0.629638238914927025372981341}, - {-0.629638238914927025372981341}, { 0.776888465673232450040827983}, - { 0.104121633872054579120943880}, { 
0.994564570734255452119106243}, - {-0.994564570734255452119106243}, { 0.104121633872054579120943880}, - { 0.981963869109555264072848154}, { 0.189068664149806212754997837}, - {-0.189068664149806212754997837}, { 0.981963869109555264072848154}, - { 0.560661576197336023839710223}, { 0.828045045257755752067527592}, - {-0.828045045257755752067527592}, { 0.560661576197336023839710223}, - { 0.834862874986380056304401383}, { 0.550457972936604802977289893}, - {-0.550457972936604802977289893}, { 0.834862874986380056304401383}, - { 0.201104634842091911558443546}, { 0.979569765685440534439326110}, - {-0.979569765685440534439326110}, { 0.201104634842091911558443546}, - { 0.926210242138311341974793388}, { 0.377007410216418256726567823}, - {-0.377007410216418256726567823}, { 0.926210242138311341974793388}, - { 0.388345046698826291624993541}, { 0.921514039342041943465396332}, - {-0.921514039342041943465396332}, { 0.388345046698826291624993541}, - { 0.711432195745216441522130290}, { 0.702754744457225302452914421}, - {-0.702754744457225302452914421}, { 0.711432195745216441522130290}, - { 0.006135884649154475359640235}, { 0.999981175282601142656990438}, - {-0.999981175282601142656990438}, { 0.006135884649154475359640235}, - { 0.999995293809576171511580126}, { 0.003067956762965976270145365}, - {-0.003067956762965976270145365}, { 0.999995293809576171511580126}, - { 0.704934080375904908852523758}, { 0.709272826438865651316533772}, - {-0.709272826438865651316533772}, { 0.704934080375904908852523758}, - { 0.922701128333878570437264227}, { 0.385516053843918864075607949}, - {-0.385516053843918864075607949}, { 0.922701128333878570437264227}, - { 0.379847208924051170576281147}, { 0.925049240782677590302371869}, - {-0.925049240782677590302371869}, { 0.379847208924051170576281147}, - { 0.980182135968117392690210009}, { 0.198098410717953586179324918}, - {-0.198098410717953586179324918}, { 0.980182135968117392690210009}, - { 0.553016705580027531764226988}, { 0.833170164701913186439915922}, - 
{-0.833170164701913186439915922}, { 0.553016705580027531764226988}, - { 0.829761233794523042469023765}, { 0.558118531220556115693702964}, - {-0.558118531220556115693702964}, { 0.829761233794523042469023765}, - { 0.192080397049892441679288205}, { 0.981379193313754574318224190}, - {-0.981379193313754574318224190}, { 0.192080397049892441679288205}, - { 0.994879330794805620591166107}, { 0.101069862754827824987887585}, - {-0.101069862754827824987887585}, { 0.994879330794805620591166107}, - { 0.632018735939809021909403706}, { 0.774953106594873878359129282}, - {-0.774953106594873878359129282}, { 0.632018735939809021909403706}, - { 0.880470889052160770806542929}, { 0.474100214650550014398580015}, - {-0.474100214650550014398580015}, { 0.880470889052160770806542929}, - { 0.287347459544729526477331841}, { 0.957826413027532890321037029}, - {-0.957826413027532890321037029}, { 0.287347459544729526477331841}, - { 0.956045251349996443270479823}, { 0.293219162694258650606608599}, - {-0.293219162694258650606608599}, { 0.956045251349996443270479823}, - { 0.468688822035827933697617870}, { 0.883363338665731594736308015}, - {-0.883363338665731594736308015}, { 0.468688822035827933697617870}, - { 0.771060524261813773200605759}, { 0.636761861236284230413943435}, - {-0.636761861236284230413943435}, { 0.771060524261813773200605759}, - { 0.094963495329638998938034312}, { 0.995480755491926941769171600}, - {-0.995480755491926941769171600}, { 0.094963495329638998938034312}, - { 0.998640218180265222418199049}, { 0.052131704680283321236358216}, - {-0.052131704680283321236358216}, { 0.998640218180265222418199049}, - { 0.669282588346636065720696366}, { 0.743007952135121693517362293}, - {-0.743007952135121693517362293}, { 0.669282588346636065720696366}, - { 0.902673318237258806751502391}, { 0.430326481340082633908199031}, - {-0.430326481340082633908199031}, { 0.902673318237258806751502391}, - { 0.333999651442009404650865481}, { 0.942573197601446879280758735}, - {-0.942573197601446879280758735}, { 
0.333999651442009404650865481}, - { 0.969281235356548486048290738}, { 0.245955050335794611599924709}, - {-0.245955050335794611599924709}, { 0.969281235356548486048290738}, - { 0.511468850437970399504391001}, { 0.859301818357008404783582139}, - {-0.859301818357008404783582139}, { 0.511468850437970399504391001}, - { 0.801376171723140219430247777}, { 0.598160706996342311724958652}, - {-0.598160706996342311724958652}, { 0.801376171723140219430247777}, - { 0.143695033150294454819773349}, { 0.989622017463200834623694454}, - {-0.989622017463200834623694454}, { 0.143695033150294454819773349}, - { 0.988721691960323767604516485}, { 0.149764534677321517229695737}, - {-0.149764534677321517229695737}, { 0.988721691960323767604516485}, - { 0.593232295039799808047809426}, { 0.805031331142963597922659282}, - {-0.805031331142963597922659282}, { 0.593232295039799808047809426}, - { 0.856147328375194481019630732}, { 0.516731799017649881508753876}, - {-0.516731799017649881508753876}, { 0.856147328375194481019630732}, - { 0.240003022448741486568922365}, { 0.970772140728950302138169611}, - {-0.970772140728950302138169611}, { 0.240003022448741486568922365}, - { 0.940506070593268323787291309}, { 0.339776884406826857828825803}, - {-0.339776884406826857828825803}, { 0.940506070593268323787291309}, - { 0.424779681209108833357226189}, { 0.905296759318118774354048329}, - {-0.905296759318118774354048329}, { 0.424779681209108833357226189}, - { 0.738887324460615147933116508}, { 0.673829000378756060917568372}, - {-0.673829000378756060917568372}, { 0.738887324460615147933116508}, - { 0.046003182130914628814301788}, { 0.998941293186856850633930266}, - {-0.998941293186856850633930266}, { 0.046003182130914628814301788}, - { 0.999618822495178597116830637}, { 0.027608145778965741612354872}, - {-0.027608145778965741612354872}, { 0.999618822495178597116830637}, - { 0.687315340891759108199186948}, { 0.726359155084345976817494315}, - {-0.726359155084345976817494315}, { 0.687315340891759108199186948}, - { 
0.912962190428398164628018233}, { 0.408044162864978680820747499}, - {-0.408044162864978680820747499}, { 0.912962190428398164628018233}, - { 0.357030961233430032614954036}, { 0.934092550404258914729877883}, - {-0.934092550404258914729877883}, { 0.357030961233430032614954036}, - { 0.975025345066994146844913468}, { 0.222093620973203534094094721}, - {-0.222093620973203534094094721}, { 0.975025345066994146844913468}, - { 0.532403127877197971442805218}, { 0.846490938774052078300544488}, - {-0.846490938774052078300544488}, { 0.532403127877197971442805218}, - { 0.815814410806733789010772660}, { 0.578313796411655563342245019}, - {-0.578313796411655563342245019}, { 0.815814410806733789010772660}, - { 0.167938294974731178054745536}, { 0.985797509167567424700995000}, - {-0.985797509167567424700995000}, { 0.167938294974731178054745536}, - { 0.992099313142191757112085445}, { 0.125454983411546238542336453}, - {-0.125454983411546238542336453}, { 0.992099313142191757112085445}, - { 0.612810082429409703935211936}, { 0.790230221437310055030217152}, - {-0.790230221437310055030217152}, { 0.612810082429409703935211936}, - { 0.868570705971340895340449876}, { 0.495565261825772531150266670}, - {-0.495565261825772531150266670}, { 0.868570705971340895340449876}, - { 0.263754678974831383611349322}, { 0.964589793289812723836432159}, - {-0.964589793289812723836432159}, { 0.263754678974831383611349322}, - { 0.948561349915730288158494826}, { 0.316593375556165867243047035}, - {-0.316593375556165867243047035}, { 0.948561349915730288158494826}, - { 0.446868840162374195353044389}, { 0.894599485631382678433072126}, - {-0.894599485631382678433072126}, { 0.446868840162374195353044389}, - { 0.755201376896536527598710756}, { 0.655492852999615385312679701}, - {-0.655492852999615385312679701}, { 0.755201376896536527598710756}, - { 0.070504573389613863027351471}, { 0.997511456140303459699448390}, - {-0.997511456140303459699448390}, { 0.070504573389613863027351471}, - { 0.997060070339482978987989949}, { 
0.076623861392031492278332463}, - {-0.076623861392031492278332463}, { 0.997060070339482978987989949}, - { 0.650846684996380915068975573}, { 0.759209188978388033485525443}, - {-0.759209188978388033485525443}, { 0.650846684996380915068975573}, - { 0.891840709392342727796478697}, { 0.452349587233770874133026703}, - {-0.452349587233770874133026703}, { 0.891840709392342727796478697}, - { 0.310767152749611495835997250}, { 0.950486073949481721759926101}, - {-0.950486073949481721759926101}, { 0.310767152749611495835997250}, - { 0.962953266873683886347921481}, { 0.269668325572915106525464462}, - {-0.269668325572915106525464462}, { 0.962953266873683886347921481}, - { 0.490226483288291154229598449}, { 0.871595086655951034842481435}, - {-0.871595086655951034842481435}, { 0.490226483288291154229598449}, - { 0.786455213599085757522319464}, { 0.617647307937803932403979402}, - {-0.617647307937803932403979402}, { 0.786455213599085757522319464}, - { 0.119365214810991364593637790}, { 0.992850414459865090793563344}, - {-0.992850414459865090793563344}, { 0.119365214810991364593637790}, - { 0.984748501801904218556553176}, { 0.173983873387463827950700807}, - {-0.173983873387463827950700807}, { 0.984748501801904218556553176}, - { 0.573297166698042212820171239}, { 0.819347520076796960824689637}, - {-0.819347520076796960824689637}, { 0.573297166698042212820171239}, - { 0.843208239641845437161743865}, { 0.537587076295645482502214932}, - {-0.537587076295645482502214932}, { 0.843208239641845437161743865}, - { 0.216106797076219509948385131}, { 0.976369731330021149312732194}, - {-0.976369731330021149312732194}, { 0.216106797076219509948385131}, - { 0.931884265581668106718557199}, { 0.362755724367397216204854462}, - {-0.362755724367397216204854462}, { 0.931884265581668106718557199}, - { 0.402434650859418441082533934}, { 0.915448716088267819566431292}, - {-0.915448716088267819566431292}, { 0.402434650859418441082533934}, - { 0.722128193929215321243607198}, { 0.691759258364157774906734132}, - 
{-0.691759258364157774906734132}, { 0.722128193929215321243607198}, - { 0.021474080275469507418374898}, { 0.999769405351215321657617036}, - {-0.999769405351215321657617036}, { 0.021474080275469507418374898}, - { 0.999882347454212525633049627}, { 0.015339206284988101044151868}, - {-0.015339206284988101044151868}, { 0.999882347454212525633049627}, - { 0.696177131491462944788582591}, { 0.717870045055731736211325329}, - {-0.717870045055731736211325329}, { 0.696177131491462944788582591}, - { 0.917900775621390457642276297}, { 0.396809987416710328595290911}, - {-0.396809987416710328595290911}, { 0.917900775621390457642276297}, - { 0.368466829953372331712746222}, { 0.929640895843181265457918066}, - {-0.929640895843181265457918066}, { 0.368466829953372331712746222}, - { 0.977677357824509979943404762}, { 0.210111836880469621717489972}, - {-0.210111836880469621717489972}, { 0.977677357824509979943404762}, - { 0.542750784864515906586768661}, { 0.839893794195999504583383987}, - {-0.839893794195999504583383987}, { 0.542750784864515906586768661}, - { 0.822849781375826332046780034}, { 0.568258952670131549790548489}, - {-0.568258952670131549790548489}, { 0.822849781375826332046780034}, - { 0.180022901405699522679906590}, { 0.983662419211730274396237776}, - {-0.983662419211730274396237776}, { 0.180022901405699522679906590}, - { 0.993564135520595333782021697}, { 0.113270952177564349018228733}, - {-0.113270952177564349018228733}, { 0.993564135520595333782021697}, - { 0.622461279374149972519166721}, { 0.782650596166575738458949301}, - {-0.782650596166575738458949301}, { 0.622461279374149972519166721}, - { 0.874586652278176112634431897}, { 0.484869248000791101822951699}, - {-0.484869248000791101822951699}, { 0.874586652278176112634431897}, - { 0.275571819310958163076425168}, { 0.961280485811320641748659653}, - {-0.961280485811320641748659653}, { 0.275571819310958163076425168}, - { 0.952375012719765858529893608}, { 0.304929229735402406490728633}, - {-0.304929229735402406490728633}, { 
0.952375012719765858529893608}, - { 0.457813303598877221904961155}, { 0.889048355854664562540777729}, - {-0.889048355854664562540777729}, { 0.457813303598877221904961155}, - { 0.763188417263381271704838297}, { 0.646176012983316364832802220}, - {-0.646176012983316364832802220}, { 0.763188417263381271704838297}, - { 0.082740264549375693111987083}, { 0.996571145790554847093566910}, - {-0.996571145790554847093566910}, { 0.082740264549375693111987083}, - { 0.997925286198596012623025462}, { 0.064382630929857460819324537}, - {-0.064382630929857460819324537}, { 0.997925286198596012623025462}, - { 0.660114342067420478559490747}, { 0.751165131909686411205819422}, - {-0.751165131909686411205819422}, { 0.660114342067420478559490747}, - { 0.897324580705418281231391836}, { 0.441371268731716692879988968}, - {-0.441371268731716692879988968}, { 0.897324580705418281231391836}, - { 0.322407678801069848384807478}, { 0.946600913083283570044599823}, - {-0.946600913083283570044599823}, { 0.322407678801069848384807478}, - { 0.966190003445412555433832961}, { 0.257831102162159005614471295}, - {-0.257831102162159005614471295}, { 0.966190003445412555433832961}, - { 0.500885382611240786241285004}, { 0.865513624090569082825488358}, - {-0.865513624090569082825488358}, { 0.500885382611240786241285004}, - { 0.793975477554337164895083757}, { 0.607949784967773667243642671}, - {-0.607949784967773667243642671}, { 0.793975477554337164895083757}, - { 0.131540028702883111103387493}, { 0.991310859846115418957349799}, - {-0.991310859846115418957349799}, { 0.131540028702883111103387493}, - { 0.986809401814185476970235952}, { 0.161886393780111837641387995}, - {-0.161886393780111837641387995}, { 0.986809401814185476970235952}, - { 0.583308652937698294392830961}, { 0.812250586585203913049744181}, - {-0.812250586585203913049744181}, { 0.583308652937698294392830961}, - { 0.849741768000852489471268395}, { 0.527199134781901348464274575}, - {-0.527199134781901348464274575}, { 0.849741768000852489471268395}, - { 
0.228072083170885739254457379}, { 0.973644249650811925318383912}, - {-0.973644249650811925318383912}, { 0.228072083170885739254457379}, - { 0.936265667170278246576310996}, { 0.351292756085567125601307623}, - {-0.351292756085567125601307623}, { 0.936265667170278246576310996}, - { 0.413638312238434547471944324}, { 0.910441292258067196934095369}, - {-0.910441292258067196934095369}, { 0.413638312238434547471944324}, - { 0.730562769227827561177758850}, { 0.682845546385248068164596123}, - {-0.682845546385248068164596123}, { 0.730562769227827561177758850}, - { 0.033741171851377584833716112}, { 0.999430604555461772019008327}, - {-0.999430604555461772019008327}, { 0.033741171851377584833716112}, - { 0.999204758618363895492950001}, { 0.039872927587739811128578738}, - {-0.039872927587739811128578738}, { 0.999204758618363895492950001}, - { 0.678350043129861486873655042}, { 0.734738878095963464563223604}, - {-0.734738878095963464563223604}, { 0.678350043129861486873655042}, - { 0.907886116487666212038681480}, { 0.419216888363223956433010020}, - {-0.419216888363223956433010020}, { 0.907886116487666212038681480}, - { 0.345541324963989065539191723}, { 0.938403534063108112192420774}, - {-0.938403534063108112192420774}, { 0.345541324963989065539191723}, - { 0.972226497078936305708321144}, { 0.234041958583543423191242045}, - {-0.234041958583543423191242045}, { 0.972226497078936305708321144}, - { 0.521975292937154342694258318}, { 0.852960604930363657746588082}, - {-0.852960604930363657746588082}, { 0.521975292937154342694258318}, - { 0.808656181588174991946968128}, { 0.588281548222645304786439813}, - {-0.588281548222645304786439813}, { 0.808656181588174991946968128}, - { 0.155828397654265235743101486}, { 0.987784141644572154230969032}, - {-0.987784141644572154230969032}, { 0.155828397654265235743101486}, - { 0.990485084256457037998682243}, { 0.137620121586486044948441663}, - {-0.137620121586486044948441663}, { 0.990485084256457037998682243}, - { 0.603066598540348201693430617}, { 
0.797690840943391108362662755}, - {-0.797690840943391108362662755}, { 0.603066598540348201693430617}, - { 0.862423956111040538690933878}, { 0.506186645345155291048942344}, - {-0.506186645345155291048942344}, { 0.862423956111040538690933878}, - { 0.251897818154216950498106628}, { 0.967753837093475465243391912}, - {-0.967753837093475465243391912}, { 0.251897818154216950498106628}, - { 0.944604837261480265659265493}, { 0.328209843579092526107916817}, - {-0.328209843579092526107916817}, { 0.944604837261480265659265493}, - { 0.435857079922255491032544080}, { 0.900015892016160228714535267}, - {-0.900015892016160228714535267}, { 0.435857079922255491032544080}, - { 0.747100605980180144323078847}, { 0.664710978203344868130324985}, - {-0.664710978203344868130324985}, { 0.747100605980180144323078847}, - { 0.058258264500435759613979782}, { 0.998301544933892840738782163}, - {-0.998301544933892840738782163}, { 0.058258264500435759613979782}, - { 0.996044700901251989887944810}, { 0.088853552582524596561586535}, - {-0.088853552582524596561586535}, { 0.996044700901251989887944810}, - { 0.641481012808583151988739898}, { 0.767138911935820381181694573}, - {-0.767138911935820381181694573}, { 0.641481012808583151988739898}, - { 0.886222530148880631647990821}, { 0.463259783551860197390719637}, - {-0.463259783551860197390719637}, { 0.886222530148880631647990821}, - { 0.299079826308040476750336973}, { 0.954228095109105629780430732}, - {-0.954228095109105629780430732}, { 0.299079826308040476750336973}, - { 0.959571513081984528335528181}, { 0.281464937925757984095231007}, - {-0.281464937925757984095231007}, { 0.959571513081984528335528181}, - { 0.479493757660153026679839798}, { 0.877545290207261291668470750}, - {-0.877545290207261291668470750}, { 0.479493757660153026679839798}, - { 0.778816512381475953374724325}, { 0.627251815495144113509622565}, - {-0.627251815495144113509622565}, { 0.778816512381475953374724325}, - { 0.107172424956808849175529148}, { 0.994240449453187946358413442}, - 
{-0.994240449453187946358413442}, { 0.107172424956808849175529148}, - { 0.982539302287441255907040396}, { 0.186055151663446648105438304}, - {-0.186055151663446648105438304}, { 0.982539302287441255907040396}, - { 0.563199344013834115007363772}, { 0.826321062845663480311195452}, - {-0.826321062845663480311195452}, { 0.563199344013834115007363772}, - { 0.836547727223511984524285790}, { 0.547894059173100165608820571}, - {-0.547894059173100165608820571}, { 0.836547727223511984524285790}, - { 0.204108966092816874181696950}, { 0.978948175319062194715480124}, - {-0.978948175319062194715480124}, { 0.204108966092816874181696950}, - { 0.927362525650401087274536959}, { 0.374164062971457997104393020}, - {-0.374164062971457997104393020}, { 0.927362525650401087274536959}, - { 0.391170384302253888687512949}, { 0.920318276709110566440076541}, - {-0.920318276709110566440076541}, { 0.391170384302253888687512949}, - { 0.713584868780793592903125099}, { 0.700568793943248366792866380}, - {-0.700568793943248366792866380}, { 0.713584868780793592903125099}, - { 0.009203754782059819315102378}, { 0.999957644551963866333120920}, - {-0.999957644551963866333120920}, { 0.009203754782059819315102378}, - { 0.999957644551963866333120920}, { 0.009203754782059819315102378}, - {-0.009203754782059819315102378}, { 0.999957644551963866333120920}, - { 0.700568793943248366792866380}, { 0.713584868780793592903125099}, - {-0.713584868780793592903125099}, { 0.700568793943248366792866380}, - { 0.920318276709110566440076541}, { 0.391170384302253888687512949}, - {-0.391170384302253888687512949}, { 0.920318276709110566440076541}, - { 0.374164062971457997104393020}, { 0.927362525650401087274536959}, - {-0.927362525650401087274536959}, { 0.374164062971457997104393020}, - { 0.978948175319062194715480124}, { 0.204108966092816874181696950}, - {-0.204108966092816874181696950}, { 0.978948175319062194715480124}, - { 0.547894059173100165608820571}, { 0.836547727223511984524285790}, - {-0.836547727223511984524285790}, { 
0.547894059173100165608820571}, - { 0.826321062845663480311195452}, { 0.563199344013834115007363772}, - {-0.563199344013834115007363772}, { 0.826321062845663480311195452}, - { 0.186055151663446648105438304}, { 0.982539302287441255907040396}, - {-0.982539302287441255907040396}, { 0.186055151663446648105438304}, - { 0.994240449453187946358413442}, { 0.107172424956808849175529148}, - {-0.107172424956808849175529148}, { 0.994240449453187946358413442}, - { 0.627251815495144113509622565}, { 0.778816512381475953374724325}, - {-0.778816512381475953374724325}, { 0.627251815495144113509622565}, - { 0.877545290207261291668470750}, { 0.479493757660153026679839798}, - {-0.479493757660153026679839798}, { 0.877545290207261291668470750}, - { 0.281464937925757984095231007}, { 0.959571513081984528335528181}, - {-0.959571513081984528335528181}, { 0.281464937925757984095231007}, - { 0.954228095109105629780430732}, { 0.299079826308040476750336973}, - {-0.299079826308040476750336973}, { 0.954228095109105629780430732}, - { 0.463259783551860197390719637}, { 0.886222530148880631647990821}, - {-0.886222530148880631647990821}, { 0.463259783551860197390719637}, - { 0.767138911935820381181694573}, { 0.641481012808583151988739898}, - {-0.641481012808583151988739898}, { 0.767138911935820381181694573}, - { 0.088853552582524596561586535}, { 0.996044700901251989887944810}, - {-0.996044700901251989887944810}, { 0.088853552582524596561586535}, - { 0.998301544933892840738782163}, { 0.058258264500435759613979782}, - {-0.058258264500435759613979782}, { 0.998301544933892840738782163}, - { 0.664710978203344868130324985}, { 0.747100605980180144323078847}, - {-0.747100605980180144323078847}, { 0.664710978203344868130324985}, - { 0.900015892016160228714535267}, { 0.435857079922255491032544080}, - {-0.435857079922255491032544080}, { 0.900015892016160228714535267}, - { 0.328209843579092526107916817}, { 0.944604837261480265659265493}, - {-0.944604837261480265659265493}, { 0.328209843579092526107916817}, - { 
0.967753837093475465243391912}, { 0.251897818154216950498106628}, - {-0.251897818154216950498106628}, { 0.967753837093475465243391912}, - { 0.506186645345155291048942344}, { 0.862423956111040538690933878}, - {-0.862423956111040538690933878}, { 0.506186645345155291048942344}, - { 0.797690840943391108362662755}, { 0.603066598540348201693430617}, - {-0.603066598540348201693430617}, { 0.797690840943391108362662755}, - { 0.137620121586486044948441663}, { 0.990485084256457037998682243}, - {-0.990485084256457037998682243}, { 0.137620121586486044948441663}, - { 0.987784141644572154230969032}, { 0.155828397654265235743101486}, - {-0.155828397654265235743101486}, { 0.987784141644572154230969032}, - { 0.588281548222645304786439813}, { 0.808656181588174991946968128}, - {-0.808656181588174991946968128}, { 0.588281548222645304786439813}, - { 0.852960604930363657746588082}, { 0.521975292937154342694258318}, - {-0.521975292937154342694258318}, { 0.852960604930363657746588082}, - { 0.234041958583543423191242045}, { 0.972226497078936305708321144}, - {-0.972226497078936305708321144}, { 0.234041958583543423191242045}, - { 0.938403534063108112192420774}, { 0.345541324963989065539191723}, - {-0.345541324963989065539191723}, { 0.938403534063108112192420774}, - { 0.419216888363223956433010020}, { 0.907886116487666212038681480}, - {-0.907886116487666212038681480}, { 0.419216888363223956433010020}, - { 0.734738878095963464563223604}, { 0.678350043129861486873655042}, - {-0.678350043129861486873655042}, { 0.734738878095963464563223604}, - { 0.039872927587739811128578738}, { 0.999204758618363895492950001}, - {-0.999204758618363895492950001}, { 0.039872927587739811128578738}, - { 0.999430604555461772019008327}, { 0.033741171851377584833716112}, - {-0.033741171851377584833716112}, { 0.999430604555461772019008327}, - { 0.682845546385248068164596123}, { 0.730562769227827561177758850}, - {-0.730562769227827561177758850}, { 0.682845546385248068164596123}, - { 0.910441292258067196934095369}, { 
0.413638312238434547471944324}, - {-0.413638312238434547471944324}, { 0.910441292258067196934095369}, - { 0.351292756085567125601307623}, { 0.936265667170278246576310996}, - {-0.936265667170278246576310996}, { 0.351292756085567125601307623}, - { 0.973644249650811925318383912}, { 0.228072083170885739254457379}, - {-0.228072083170885739254457379}, { 0.973644249650811925318383912}, - { 0.527199134781901348464274575}, { 0.849741768000852489471268395}, - {-0.849741768000852489471268395}, { 0.527199134781901348464274575}, - { 0.812250586585203913049744181}, { 0.583308652937698294392830961}, - {-0.583308652937698294392830961}, { 0.812250586585203913049744181}, - { 0.161886393780111837641387995}, { 0.986809401814185476970235952}, - {-0.986809401814185476970235952}, { 0.161886393780111837641387995}, - { 0.991310859846115418957349799}, { 0.131540028702883111103387493}, - {-0.131540028702883111103387493}, { 0.991310859846115418957349799}, - { 0.607949784967773667243642671}, { 0.793975477554337164895083757}, - {-0.793975477554337164895083757}, { 0.607949784967773667243642671}, - { 0.865513624090569082825488358}, { 0.500885382611240786241285004}, - {-0.500885382611240786241285004}, { 0.865513624090569082825488358}, - { 0.257831102162159005614471295}, { 0.966190003445412555433832961}, - {-0.966190003445412555433832961}, { 0.257831102162159005614471295}, - { 0.946600913083283570044599823}, { 0.322407678801069848384807478}, - {-0.322407678801069848384807478}, { 0.946600913083283570044599823}, - { 0.441371268731716692879988968}, { 0.897324580705418281231391836}, - {-0.897324580705418281231391836}, { 0.441371268731716692879988968}, - { 0.751165131909686411205819422}, { 0.660114342067420478559490747}, - {-0.660114342067420478559490747}, { 0.751165131909686411205819422}, - { 0.064382630929857460819324537}, { 0.997925286198596012623025462}, - {-0.997925286198596012623025462}, { 0.064382630929857460819324537}, - { 0.996571145790554847093566910}, { 0.082740264549375693111987083}, - 
{-0.082740264549375693111987083}, { 0.996571145790554847093566910}, - { 0.646176012983316364832802220}, { 0.763188417263381271704838297}, - {-0.763188417263381271704838297}, { 0.646176012983316364832802220}, - { 0.889048355854664562540777729}, { 0.457813303598877221904961155}, - {-0.457813303598877221904961155}, { 0.889048355854664562540777729}, - { 0.304929229735402406490728633}, { 0.952375012719765858529893608}, - {-0.952375012719765858529893608}, { 0.304929229735402406490728633}, - { 0.961280485811320641748659653}, { 0.275571819310958163076425168}, - {-0.275571819310958163076425168}, { 0.961280485811320641748659653}, - { 0.484869248000791101822951699}, { 0.874586652278176112634431897}, - {-0.874586652278176112634431897}, { 0.484869248000791101822951699}, - { 0.782650596166575738458949301}, { 0.622461279374149972519166721}, - {-0.622461279374149972519166721}, { 0.782650596166575738458949301}, - { 0.113270952177564349018228733}, { 0.993564135520595333782021697}, - {-0.993564135520595333782021697}, { 0.113270952177564349018228733}, - { 0.983662419211730274396237776}, { 0.180022901405699522679906590}, - {-0.180022901405699522679906590}, { 0.983662419211730274396237776}, - { 0.568258952670131549790548489}, { 0.822849781375826332046780034}, - {-0.822849781375826332046780034}, { 0.568258952670131549790548489}, - { 0.839893794195999504583383987}, { 0.542750784864515906586768661}, - {-0.542750784864515906586768661}, { 0.839893794195999504583383987}, - { 0.210111836880469621717489972}, { 0.977677357824509979943404762}, - {-0.977677357824509979943404762}, { 0.210111836880469621717489972}, - { 0.929640895843181265457918066}, { 0.368466829953372331712746222}, - {-0.368466829953372331712746222}, { 0.929640895843181265457918066}, - { 0.396809987416710328595290911}, { 0.917900775621390457642276297}, - {-0.917900775621390457642276297}, { 0.396809987416710328595290911}, - { 0.717870045055731736211325329}, { 0.696177131491462944788582591}, - {-0.696177131491462944788582591}, { 
0.717870045055731736211325329}, - { 0.015339206284988101044151868}, { 0.999882347454212525633049627}, - {-0.999882347454212525633049627}, { 0.015339206284988101044151868}, - { 0.999769405351215321657617036}, { 0.021474080275469507418374898}, - {-0.021474080275469507418374898}, { 0.999769405351215321657617036}, - { 0.691759258364157774906734132}, { 0.722128193929215321243607198}, - {-0.722128193929215321243607198}, { 0.691759258364157774906734132}, - { 0.915448716088267819566431292}, { 0.402434650859418441082533934}, - {-0.402434650859418441082533934}, { 0.915448716088267819566431292}, - { 0.362755724367397216204854462}, { 0.931884265581668106718557199}, - {-0.931884265581668106718557199}, { 0.362755724367397216204854462}, - { 0.976369731330021149312732194}, { 0.216106797076219509948385131}, - {-0.216106797076219509948385131}, { 0.976369731330021149312732194}, - { 0.537587076295645482502214932}, { 0.843208239641845437161743865}, - {-0.843208239641845437161743865}, { 0.537587076295645482502214932}, - { 0.819347520076796960824689637}, { 0.573297166698042212820171239}, - {-0.573297166698042212820171239}, { 0.819347520076796960824689637}, - { 0.173983873387463827950700807}, { 0.984748501801904218556553176}, - {-0.984748501801904218556553176}, { 0.173983873387463827950700807}, - { 0.992850414459865090793563344}, { 0.119365214810991364593637790}, - {-0.119365214810991364593637790}, { 0.992850414459865090793563344}, - { 0.617647307937803932403979402}, { 0.786455213599085757522319464}, - {-0.786455213599085757522319464}, { 0.617647307937803932403979402}, - { 0.871595086655951034842481435}, { 0.490226483288291154229598449}, - {-0.490226483288291154229598449}, { 0.871595086655951034842481435}, - { 0.269668325572915106525464462}, { 0.962953266873683886347921481}, - {-0.962953266873683886347921481}, { 0.269668325572915106525464462}, - { 0.950486073949481721759926101}, { 0.310767152749611495835997250}, - {-0.310767152749611495835997250}, { 0.950486073949481721759926101}, - { 
0.452349587233770874133026703}, { 0.891840709392342727796478697}, - {-0.891840709392342727796478697}, { 0.452349587233770874133026703}, - { 0.759209188978388033485525443}, { 0.650846684996380915068975573}, - {-0.650846684996380915068975573}, { 0.759209188978388033485525443}, - { 0.076623861392031492278332463}, { 0.997060070339482978987989949}, - {-0.997060070339482978987989949}, { 0.076623861392031492278332463}, - { 0.997511456140303459699448390}, { 0.070504573389613863027351471}, - {-0.070504573389613863027351471}, { 0.997511456140303459699448390}, - { 0.655492852999615385312679701}, { 0.755201376896536527598710756}, - {-0.755201376896536527598710756}, { 0.655492852999615385312679701}, - { 0.894599485631382678433072126}, { 0.446868840162374195353044389}, - {-0.446868840162374195353044389}, { 0.894599485631382678433072126}, - { 0.316593375556165867243047035}, { 0.948561349915730288158494826}, - {-0.948561349915730288158494826}, { 0.316593375556165867243047035}, - { 0.964589793289812723836432159}, { 0.263754678974831383611349322}, - {-0.263754678974831383611349322}, { 0.964589793289812723836432159}, - { 0.495565261825772531150266670}, { 0.868570705971340895340449876}, - {-0.868570705971340895340449876}, { 0.495565261825772531150266670}, - { 0.790230221437310055030217152}, { 0.612810082429409703935211936}, - {-0.612810082429409703935211936}, { 0.790230221437310055030217152}, - { 0.125454983411546238542336453}, { 0.992099313142191757112085445}, - {-0.992099313142191757112085445}, { 0.125454983411546238542336453}, - { 0.985797509167567424700995000}, { 0.167938294974731178054745536}, - {-0.167938294974731178054745536}, { 0.985797509167567424700995000}, - { 0.578313796411655563342245019}, { 0.815814410806733789010772660}, - {-0.815814410806733789010772660}, { 0.578313796411655563342245019}, - { 0.846490938774052078300544488}, { 0.532403127877197971442805218}, - {-0.532403127877197971442805218}, { 0.846490938774052078300544488}, - { 0.222093620973203534094094721}, { 
0.975025345066994146844913468}, - {-0.975025345066994146844913468}, { 0.222093620973203534094094721}, - { 0.934092550404258914729877883}, { 0.357030961233430032614954036}, - {-0.357030961233430032614954036}, { 0.934092550404258914729877883}, - { 0.408044162864978680820747499}, { 0.912962190428398164628018233}, - {-0.912962190428398164628018233}, { 0.408044162864978680820747499}, - { 0.726359155084345976817494315}, { 0.687315340891759108199186948}, - {-0.687315340891759108199186948}, { 0.726359155084345976817494315}, - { 0.027608145778965741612354872}, { 0.999618822495178597116830637}, - {-0.999618822495178597116830637}, { 0.027608145778965741612354872}, - { 0.998941293186856850633930266}, { 0.046003182130914628814301788}, - {-0.046003182130914628814301788}, { 0.998941293186856850633930266}, - { 0.673829000378756060917568372}, { 0.738887324460615147933116508}, - {-0.738887324460615147933116508}, { 0.673829000378756060917568372}, - { 0.905296759318118774354048329}, { 0.424779681209108833357226189}, - {-0.424779681209108833357226189}, { 0.905296759318118774354048329}, - { 0.339776884406826857828825803}, { 0.940506070593268323787291309}, - {-0.940506070593268323787291309}, { 0.339776884406826857828825803}, - { 0.970772140728950302138169611}, { 0.240003022448741486568922365}, - {-0.240003022448741486568922365}, { 0.970772140728950302138169611}, - { 0.516731799017649881508753876}, { 0.856147328375194481019630732}, - {-0.856147328375194481019630732}, { 0.516731799017649881508753876}, - { 0.805031331142963597922659282}, { 0.593232295039799808047809426}, - {-0.593232295039799808047809426}, { 0.805031331142963597922659282}, - { 0.149764534677321517229695737}, { 0.988721691960323767604516485}, - {-0.988721691960323767604516485}, { 0.149764534677321517229695737}, - { 0.989622017463200834623694454}, { 0.143695033150294454819773349}, - {-0.143695033150294454819773349}, { 0.989622017463200834623694454}, - { 0.598160706996342311724958652}, { 0.801376171723140219430247777}, - 
{-0.801376171723140219430247777}, { 0.598160706996342311724958652}, - { 0.859301818357008404783582139}, { 0.511468850437970399504391001}, - {-0.511468850437970399504391001}, { 0.859301818357008404783582139}, - { 0.245955050335794611599924709}, { 0.969281235356548486048290738}, - {-0.969281235356548486048290738}, { 0.245955050335794611599924709}, - { 0.942573197601446879280758735}, { 0.333999651442009404650865481}, - {-0.333999651442009404650865481}, { 0.942573197601446879280758735}, - { 0.430326481340082633908199031}, { 0.902673318237258806751502391}, - {-0.902673318237258806751502391}, { 0.430326481340082633908199031}, - { 0.743007952135121693517362293}, { 0.669282588346636065720696366}, - {-0.669282588346636065720696366}, { 0.743007952135121693517362293}, - { 0.052131704680283321236358216}, { 0.998640218180265222418199049}, - {-0.998640218180265222418199049}, { 0.052131704680283321236358216}, - { 0.995480755491926941769171600}, { 0.094963495329638998938034312}, - {-0.094963495329638998938034312}, { 0.995480755491926941769171600}, - { 0.636761861236284230413943435}, { 0.771060524261813773200605759}, - {-0.771060524261813773200605759}, { 0.636761861236284230413943435}, - { 0.883363338665731594736308015}, { 0.468688822035827933697617870}, - {-0.468688822035827933697617870}, { 0.883363338665731594736308015}, - { 0.293219162694258650606608599}, { 0.956045251349996443270479823}, - {-0.956045251349996443270479823}, { 0.293219162694258650606608599}, - { 0.957826413027532890321037029}, { 0.287347459544729526477331841}, - {-0.287347459544729526477331841}, { 0.957826413027532890321037029}, - { 0.474100214650550014398580015}, { 0.880470889052160770806542929}, - {-0.880470889052160770806542929}, { 0.474100214650550014398580015}, - { 0.774953106594873878359129282}, { 0.632018735939809021909403706}, - {-0.632018735939809021909403706}, { 0.774953106594873878359129282}, - { 0.101069862754827824987887585}, { 0.994879330794805620591166107}, - {-0.994879330794805620591166107}, { 
0.101069862754827824987887585}, - { 0.981379193313754574318224190}, { 0.192080397049892441679288205}, - {-0.192080397049892441679288205}, { 0.981379193313754574318224190}, - { 0.558118531220556115693702964}, { 0.829761233794523042469023765}, - {-0.829761233794523042469023765}, { 0.558118531220556115693702964}, - { 0.833170164701913186439915922}, { 0.553016705580027531764226988}, - {-0.553016705580027531764226988}, { 0.833170164701913186439915922}, - { 0.198098410717953586179324918}, { 0.980182135968117392690210009}, - {-0.980182135968117392690210009}, { 0.198098410717953586179324918}, - { 0.925049240782677590302371869}, { 0.379847208924051170576281147}, - {-0.379847208924051170576281147}, { 0.925049240782677590302371869}, - { 0.385516053843918864075607949}, { 0.922701128333878570437264227}, - {-0.922701128333878570437264227}, { 0.385516053843918864075607949}, - { 0.709272826438865651316533772}, { 0.704934080375904908852523758}, - {-0.704934080375904908852523758}, { 0.709272826438865651316533772}, - { 0.003067956762965976270145365}, { 0.999995293809576171511580126}, - {-0.999995293809576171511580126}, { 0.003067956762965976270145365} -}; - -const fpr fpr_p2_tab[] = { - { 2.00000000000 }, - { 1.00000000000 }, - { 0.50000000000 }, - { 0.25000000000 }, - { 0.12500000000 }, - { 0.06250000000 }, - { 0.03125000000 }, - { 0.01562500000 }, - { 0.00781250000 }, - { 0.00390625000 }, - { 0.00195312500 } -}; - -#else // yyyFPNATIVE+0 yyyFPEMU+0 - -#error No FP implementation selected - -#endif // yyyFPNATIVE- yyyFPEMU- diff --git a/crypto_sign/falcon-1024/m4-ct/fpr.h b/crypto_sign/falcon-1024/m4-ct/fpr.h deleted file mode 100644 index 8176212d..00000000 --- a/crypto_sign/falcon-1024/m4-ct/fpr.h +++ /dev/null @@ -1,893 +0,0 @@ -/* - * Floating-point operations. 
- * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#if FALCON_FPEMU // yyyFPEMU+1 yyyFPNATIVE+0 - -/* ====================================================================== */ -/* - * Custom floating-point implementation with integer arithmetics. We - * use IEEE-754 "binary64" format, with some simplifications: - * - * - Top bit is s = 1 for negative, 0 for positive. - * - * - Exponent e uses the next 11 bits (bits 52 to 62, inclusive). - * - * - Mantissa m uses the 52 low bits. - * - * Encoded value is, in general: (-1)^s * 2^(e-1023) * (1 + m*2^(-52)) - * i.e. 
the mantissa really is a 53-bit number (less than 2.0, but not - * less than 1.0), but the top bit (equal to 1 by definition) is omitted - * in the encoding. - * - * In IEEE-754, there are some special values: - * - * - If e = 2047, then the value is either an infinite (m = 0) or - * a NaN (m != 0). - * - * - If e = 0, then the value is either a zero (m = 0) or a subnormal, - * aka "denormalized number" (m != 0). - * - * Of these, we only need the zeros. The caller is responsible for not - * providing operands that would lead to infinites, NaNs or subnormals. - * If inputs are such that values go out of range, then indeterminate - * values are returned (it would still be deterministic, but no specific - * value may be relied upon). - * - * At the C level, the three parts are stored in a 64-bit unsigned - * word. - * - * One may note that a property of the IEEE-754 format is that order - * is preserved for positive values: if two positive floating-point - * values x and y are such that x < y, then their respective encodings - * as _signed_ 64-bit integers i64(x) and i64(y) will be such that - * i64(x) < i64(y). For negative values, order is reversed: if x < 0, - * y < 0, and x < y, then ia64(x) > ia64(y). - * - * IMPORTANT ASSUMPTIONS: - * ====================== - * - * For proper computations, and constant-time behaviour, we assume the - * following: - * - * - 32x32->64 multiplication (unsigned) has an execution time that - * is independent of its operands. This is true of most modern - * x86 and ARM cores. Notable exceptions are the ARM Cortex M0, M0+ - * and M3 (in the M0 and M0+, this is done in software, so it depends - * on that routine), and the PowerPC cores from the G3/G4 lines. - * For more info, see: https://www.bearssl.org/ctmul.html - * - * - Left-shifts and right-shifts of 32-bit values have an execution - * time which does not depend on the shifted value nor on the - * shift count. 
An historical exception is the Pentium IV, but most - * modern CPU have barrel shifters. Some small microcontrollers - * might have varying-time shifts (not the ARM Cortex M*, though). - * - * - Right-shift of a signed negative value performs a sign extension. - * As per the C standard, this operation returns an - * implementation-defined result (this is NOT an "undefined - * behaviour"). On most/all systems, an arithmetic shift is - * performed, because this is what makes most sense. - */ - -/* - * Normally we should declare the 'fpr' type to be a struct or union - * around the internal 64-bit value; however, we want to use the - * direct 64-bit integer type to enable a lighter call convention on - * ARM platforms. This means that direct (invalid) use of operators - * such as '*' or '+' will not be caught by the compiler. We rely on - * the "normal" (non-emulated) code to detect such instances. - */ -typedef uint64_t fpr; - -/* - * For computations, we split values into an integral mantissa in the - * 2^54..2^55 range, and an (adjusted) exponent. The lowest bit is - * "sticky" (it is set to 1 if any of the bits below it is 1); when - * re-encoding, the low two bits are dropped, but may induce an - * increment in the value for proper rounding. - */ - -/* - * Right-shift a 64-bit unsigned value by a possibly secret shift count. - * We assumed that the underlying architecture had a barrel shifter for - * 32-bit shifts, but for 64-bit shifts on a 32-bit system, this will - * typically invoke a software routine that is not necessarily - * constant-time; hence the function below. - * - * Shift count n MUST be in the 0..63 range. - */ -static inline uint64_t -fpr_ursh(uint64_t x, int n) -{ - x ^= (x ^ (x >> 32)) & -(uint64_t)(n >> 5); - return x >> (n & 31); -} - -/* - * Right-shift a 64-bit signed value by a possibly secret shift count - * (see fpr_ursh() for the rationale). - * - * Shift count n MUST be in the 0..63 range. 
- */ -static inline int64_t -fpr_irsh(int64_t x, int n) -{ - x ^= (x ^ (x >> 32)) & -(int64_t)(n >> 5); - return x >> (n & 31); -} - -/* - * Left-shift a 64-bit unsigned value by a possibly secret shift count - * (see fpr_ursh() for the rationale). - * - * Shift count n MUST be in the 0..63 range. - */ -static inline uint64_t -fpr_ulsh(uint64_t x, int n) -{ - x ^= (x ^ (x << 32)) & -(uint64_t)(n >> 5); - return x << (n & 31); -} - -/* - * Expectations: - * s = 0 or 1 - * exponent e is "arbitrary" and unbiased - * 2^54 <= m < 2^55 - * Numerical value is (-1)^2 * m * 2^e - * - * Exponents which are too low lead to value zero. If the exponent is - * too large, the returned value is indeterminate. - * - * If m = 0, then a zero is returned (using the provided sign). - * If e < -1076, then a zero is returned (regardless of the value of m). - * If e >= -1076 and e != 0, m must be within the expected range - * (2^54 to 2^55-1). - */ -static inline fpr -FPR(int s, int e, uint64_t m) -{ - fpr x; - uint32_t t; - unsigned f; - - /* - * If e >= -1076, then the value is "normal"; otherwise, it - * should be a subnormal, which we clamp down to zero. - */ - e += 1076; - t = (uint32_t)e >> 31; - m &= (uint64_t)t - 1; - - /* - * If m = 0 then we want a zero; make e = 0 too, but conserve - * the sign. - */ - t = (uint32_t)(m >> 54); - e &= -(int)t; - - /* - * The 52 mantissa bits come from m. Value m has its top bit set - * (unless it is a zero); we leave it "as is": the top bit will - * increment the exponent by 1, except when m = 0, which is - * exactly what we want. - */ - x = (((uint64_t)s << 63) | (m >> 2)) + ((uint64_t)(uint32_t)e << 52); - - /* - * Rounding: if the low three bits of m are 011, 110 or 111, - * then the value should be incremented to get the next - * representable value. This implements the usual - * round-to-nearest rule (with preference to even values in case - * of a tie). 
Note that the increment may make a carry spill - * into the exponent field, which is again exactly what we want - * in that case. - */ - f = (unsigned)m & 7U; - x += (0xC8U >> f) & 1; - return x; -} - -#define fpr_scaled Zf(fpr_scaled) -fpr fpr_scaled(int64_t i, int sc); - -static inline fpr -fpr_of(int64_t i) -{ - return fpr_scaled(i, 0); -} - -static const fpr fpr_q = 4667981563525332992; -static const fpr fpr_inverse_of_q = 4545632735260551042; -static const fpr fpr_inv_2sqrsigma0 = 4594603506513722306; -static const fpr fpr_inv_sigma = 4573359825155195350; -static const fpr fpr_sigma_min_9 = 4608495221497168882; -static const fpr fpr_sigma_min_10 = 4608586345619182117; -static const fpr fpr_log2 = 4604418534313441775; -static const fpr fpr_inv_log2 = 4609176140021203710; -static const fpr fpr_bnorm_max = 4670353323383631276; -static const fpr fpr_zero = 0; -static const fpr fpr_one = 4607182418800017408; -static const fpr fpr_two = 4611686018427387904; -static const fpr fpr_onehalf = 4602678819172646912; -static const fpr fpr_invsqrt2 = 4604544271217802189; -static const fpr fpr_invsqrt8 = 4600040671590431693; -static const fpr fpr_ptwo31 = 4746794007248502784; -static const fpr fpr_ptwo31m1 = 4746794007244308480; -static const fpr fpr_mtwo31m1 = 13970166044099084288U; -static const fpr fpr_ptwo63m1 = 4890909195324358656; -static const fpr fpr_mtwo63m1 = 14114281232179134464U; -static const fpr fpr_ptwo63 = 4890909195324358656; - -static inline int64_t -fpr_rint(fpr x) -{ - uint64_t m, d; - int e; - uint32_t s, dd, f; - - /* - * We assume that the value fits in -(2^63-1)..+(2^63-1). We can - * thus extract the mantissa as a 63-bit integer, then right-shift - * it as needed. - */ - m = ((x << 10) | ((uint64_t)1 << 62)) & (((uint64_t)1 << 63) - 1); - e = 1085 - ((int)(x >> 52) & 0x7FF); - - /* - * If a shift of more than 63 bits is needed, then simply set m - * to zero. This also covers the case of an input operand equal - * to zero. 
- */ - m &= -(uint64_t)((uint32_t)(e - 64) >> 31); - e &= 63; - - /* - * Right-shift m as needed. Shift count is e. Proper rounding - * mandates that: - * - If the highest dropped bit is zero, then round low. - * - If the highest dropped bit is one, and at least one of the - * other dropped bits is one, then round up. - * - If the highest dropped bit is one, and all other dropped - * bits are zero, then round up if the lowest kept bit is 1, - * or low otherwise (i.e. ties are broken by "rounding to even"). - * - * We thus first extract a word consisting of all the dropped bit - * AND the lowest kept bit; then we shrink it down to three bits, - * the lowest being "sticky". - */ - d = fpr_ulsh(m, 63 - e); - dd = (uint32_t)d | ((uint32_t)(d >> 32) & 0x1FFFFFFF); - f = (uint32_t)(d >> 61) | ((dd | -dd) >> 31); - m = fpr_ursh(m, e) + (uint64_t)((0xC8U >> f) & 1U); - - /* - * Apply the sign bit. - */ - s = (uint32_t)(x >> 63); - return ((int64_t)m ^ -(int64_t)s) + (int64_t)s; -} - -static inline int64_t -fpr_floor(fpr x) -{ - uint64_t t; - int64_t xi; - int e, cc; - - /* - * We extract the integer as a _signed_ 64-bit integer with - * a scaling factor. Since we assume that the value fits - * in the -(2^63-1)..+(2^63-1) range, we can left-shift the - * absolute value to make it in the 2^62..2^63-1 range: we - * will only need a right-shift afterwards. - */ - e = (int)(x >> 52) & 0x7FF; - t = x >> 63; - xi = (int64_t)(((x << 10) | ((uint64_t)1 << 62)) - & (((uint64_t)1 << 63) - 1)); - xi = (xi ^ -(int64_t)t) + (int64_t)t; - cc = 1085 - e; - - /* - * We perform an arithmetic right-shift on the value. This - * applies floor() semantics on both positive and negative values - * (rounding toward minus infinity). - */ - xi = fpr_irsh(xi, cc & 63); - - /* - * If the true shift count was 64 or more, then we should instead - * replace xi with 0 (if nonnegative) or -1 (if negative). 
Edge - * case: -0 will be floored to -1, not 0 (whether this is correct - * is debatable; in any case, the other functions normalize zero - * to +0). - * - * For an input of zero, the non-shifted xi was incorrect (we used - * a top implicit bit of value 1, not 0), but this does not matter - * since this operation will clamp it down. - */ - xi ^= (xi ^ -(int64_t)t) & -(int64_t)((uint32_t)(63 - cc) >> 31); - return xi; -} - -static inline int64_t -fpr_trunc(fpr x) -{ - uint64_t t, xu; - int e, cc; - - /* - * Extract the absolute value. Since we assume that the value - * fits in the -(2^63-1)..+(2^63-1) range, we can left-shift - * the absolute value into the 2^62..2^63-1 range, and then - * do a right shift afterwards. - */ - e = (int)(x >> 52) & 0x7FF; - xu = ((x << 10) | ((uint64_t)1 << 62)) & (((uint64_t)1 << 63) - 1); - cc = 1085 - e; - xu = fpr_ursh(xu, cc & 63); - - /* - * If the exponent is too low (cc > 63), then the shift was wrong - * and we must clamp the value to 0. This also covers the case - * of an input equal to zero. - */ - xu &= -(uint64_t)((uint32_t)(cc - 64) >> 31); - - /* - * Apply back the sign, if the source value is negative. - */ - t = x >> 63; - xu = (xu ^ -t) + t; - return *(int64_t *)&xu; -} - -#define fpr_add Zf(fpr_add) -fpr fpr_add(fpr x, fpr y); - -static inline fpr -fpr_sub(fpr x, fpr y) -{ - y ^= (uint64_t)1 << 63; - return fpr_add(x, y); -} - -static inline fpr -fpr_neg(fpr x) -{ - x ^= (uint64_t)1 << 63; - return x; -} - -static inline fpr -fpr_half(fpr x) -{ - /* - * To divide a value by 2, we just have to subtract 1 from its - * exponent, but we have to take care of zero. - */ - uint32_t t; - - x -= (uint64_t)1 << 52; - t = (((uint32_t)(x >> 52) & 0x7FF) + 1) >> 11; - x &= (uint64_t)t - 1; - return x; -} - -static inline fpr -fpr_double(fpr x) -{ - /* - * To double a value, we just increment by one the exponent. We - * don't care about infinites or NaNs; however, 0 is a - * special case. 
- */ - x += (uint64_t)((((unsigned)(x >> 52) & 0x7FFU) + 0x7FFU) >> 11) << 52; - return x; -} - -#define fpr_mul Zf(fpr_mul) -fpr fpr_mul(fpr x, fpr y); - -static inline fpr -fpr_sqr(fpr x) -{ - return fpr_mul(x, x); -} - -#define fpr_div Zf(fpr_div) -fpr fpr_div(fpr x, fpr y); - -static inline fpr -fpr_inv(fpr x) -{ - return fpr_div(4607182418800017408u, x); -} - -#define fpr_sqrt Zf(fpr_sqrt) -fpr fpr_sqrt(fpr x); - -static inline int -fpr_lt(fpr x, fpr y) -{ - /* - * If x >= 0 or y >= 0, a signed comparison yields the proper - * result: - * - For positive values, the order is preserved. - * - The sign bit is at the same place as in integers, so - * sign is preserved. - * - * If both x and y are negative, then the order is reversed. - * We cannot simply invert the comparison result in that case - * because it would not handle the edge case x = y properly. - */ - int cc0, cc1; - - cc0 = *(int64_t *)&x < *(int64_t *)&y; - cc1 = *(int64_t *)&x > *(int64_t *)&y; - return cc0 ^ ((cc0 ^ cc1) & (int)((x & y) >> 63)); -} - -/* - * Compute exp(x) for x such that |x| <= ln 2. We want a precision of 50 - * bits or so. - */ -#define fpr_expm_p63 Zf(fpr_expm_p63) -uint64_t fpr_expm_p63(fpr x, fpr ccs); - -#define fpr_gm_tab Zf(fpr_gm_tab) -extern const fpr fpr_gm_tab[]; - -#define fpr_p2_tab Zf(fpr_p2_tab) -extern const fpr fpr_p2_tab[]; - -/* ====================================================================== */ - -#elif FALCON_FPNATIVE // yyyFPEMU+0 yyyFPNATIVE+1 - -/* ====================================================================== */ - -#include - -/* - * We wrap the native 'double' type into a structure so that the C compiler - * complains if we inadvertently use raw arithmetic operators on the 'fpr' - * type instead of using the inline functions below. This should have no - * extra runtime cost, since all the functions below are 'inline'. 
- */ -typedef struct { double v; } fpr; - -static inline fpr -FPR(double v) -{ - fpr x; - - x.v = v; - return x; -} - -static inline fpr -fpr_of(int64_t i) -{ - return FPR((double)i); -} - -static const fpr fpr_q = { 12289.0 }; -static const fpr fpr_inverse_of_q = { 1.0 / 12289.0 }; -static const fpr fpr_inv_2sqrsigma0 = { .150865048875372721532312163019 }; -static const fpr fpr_inv_sigma = { .005819826392951607426919370871 }; -static const fpr fpr_sigma_min_9 = { 1.291500756233514568549480827642 }; -static const fpr fpr_sigma_min_10 = { 1.311734375905083682667395805765 }; -static const fpr fpr_log2 = { 0.69314718055994530941723212146 }; -static const fpr fpr_inv_log2 = { 1.4426950408889634073599246810 }; -static const fpr fpr_bnorm_max = { 16822.4121 }; -static const fpr fpr_zero = { 0.0 }; -static const fpr fpr_one = { 1.0 }; -static const fpr fpr_two = { 2.0 }; -static const fpr fpr_onehalf = { 0.5 }; -static const fpr fpr_invsqrt2 = { 0.707106781186547524400844362105 }; -static const fpr fpr_invsqrt8 = { 0.353553390593273762200422181052 }; -static const fpr fpr_ptwo31 = { 2147483648.0 }; -static const fpr fpr_ptwo31m1 = { 2147483647.0 }; -static const fpr fpr_mtwo31m1 = { -2147483647.0 }; -static const fpr fpr_ptwo63m1 = { 9223372036854775807.0 }; -static const fpr fpr_mtwo63m1 = { -9223372036854775807.0 }; -static const fpr fpr_ptwo63 = { 9223372036854775808.0 }; - -static inline int64_t -fpr_rint(fpr x) -{ - /* - * We do not want to use llrint() since it might be not - * constant-time. - * - * Suppose that x >= 0. If x >= 2^52, then it is already an - * integer. Otherwise, if x < 2^52, then computing x+2^52 will - * yield a value that will be rounded to the nearest integer - * with exactly the right rules (round-to-nearest-even). - * - * In order to have constant-time processing, we must do the - * computation for both x >= 0 and x < 0 cases, and use a - * cast to an integer to access the sign and select the proper - * value. 
Such casts also allow us to find out if |x| < 2^52. - */ - int64_t sx, tx, rp, rn, m; - uint32_t ub; - - sx = (int64_t)(x.v - 1.0); - tx = (int64_t)x.v; - rp = (int64_t)(x.v + 4503599627370496.0) - 4503599627370496; - rn = (int64_t)(x.v - 4503599627370496.0) + 4503599627370496; - - /* - * If tx >= 2^52 or tx < -2^52, then result is tx. - * Otherwise, if sx >= 0, then result is rp. - * Otherwise, result is rn. We use the fact that when x is - * close to 0 (|x| <= 0.25) then both rp and rn are correct; - * and if x is not close to 0, then trunc(x-1.0) yields the - * appropriate sign. - */ - - /* - * Clamp rp to zero if tx < 0. - * Clamp rn to zero if tx >= 0. - */ - m = sx >> 63; - rn &= m; - rp &= ~m; - - /* - * Get the 12 upper bits of tx; if they are not all zeros or - * all ones, then tx >= 2^52 or tx < -2^52, and we clamp both - * rp and rn to zero. Otherwise, we clamp tx to zero. - */ - ub = (uint32_t)((uint64_t)tx >> 52); - m = -(int64_t)((((ub + 1) & 0xFFF) - 2) >> 31); - rp &= m; - rn &= m; - tx &= ~m; - - /* - * Only one of tx, rn or rp (at most) can be non-zero at this - * point. - */ - return tx | rn | rp; -} - -static inline int64_t -fpr_floor(fpr x) -{ - int64_t r; - - /* - * The cast performs a trunc() (rounding toward 0) and thus is - * wrong by 1 for most negative values. The correction below is - * constant-time as long as the compiler turns the - * floating-point conversion result into a 0/1 integer without a - * conditional branch or another non-constant-time construction. - * This should hold on all modern architectures with an FPU (and - * if it is false on a given arch, then chances are that the FPU - * itself is not constant-time, making the point moot). 
- */ - r = (int64_t)x.v; - return r - (x.v < (double)r); -} - -static inline int64_t -fpr_trunc(fpr x) -{ - return (int64_t)x.v; -} - -static inline fpr -fpr_add(fpr x, fpr y) -{ - return FPR(x.v + y.v); -} - -static inline fpr -fpr_sub(fpr x, fpr y) -{ - return FPR(x.v - y.v); -} - -static inline fpr -fpr_neg(fpr x) -{ - return FPR(-x.v); -} - -static inline fpr -fpr_half(fpr x) -{ - return FPR(x.v * 0.5); -} - -static inline fpr -fpr_double(fpr x) -{ - return FPR(x.v + x.v); -} - -static inline fpr -fpr_mul(fpr x, fpr y) -{ - return FPR(x.v * y.v); -} - -static inline fpr -fpr_sqr(fpr x) -{ - return FPR(x.v * x.v); -} - -static inline fpr -fpr_inv(fpr x) -{ - return FPR(1.0 / x.v); -} - -static inline fpr -fpr_div(fpr x, fpr y) -{ - return FPR(x.v / y.v); -} - -#if FALCON_AVX2 // yyyAVX2+1 -TARGET_AVX2 -static inline void -fpr_sqrt_avx2(double *t) -{ - __m128d x; - - x = _mm_load1_pd(t); - x = _mm_sqrt_pd(x); - _mm_storel_pd(t, x); -} -#endif // yyyAVX2- - -static inline fpr -fpr_sqrt(fpr x) -{ - /* - * We prefer not to have a dependency on libm when it can be - * avoided. On x86, calling the sqrt() libm function inlines - * the relevant opcode (fsqrt or sqrtsd, depending on whether - * the 387 FPU or SSE2 is used for floating-point operations) - * but then makes an optional call to the library function - * for proper error handling, in case the operand is negative. - * - * To avoid this dependency, we use intrinsics or inline assembly - * on recognized platforms: - * - * - If AVX2 is explicitly enabled, then we use SSE2 intrinsics. - * - * - On GCC/Clang with SSE maths, we use SSE2 intrinsics. - * - * - On GCC/Clang on i386, or MSVC on i386, we use inline assembly - * to call the 387 FPU fsqrt opcode. - * - * - On GCC/Clang/XLC on PowerPC, we use inline assembly to call - * the fsqrt opcode (Clang needs a special hack). - * - * - On GCC/Clang on ARM with hardware floating-point, we use - * inline assembly to call the vqsrt.f64 opcode. 
Due to a - * complex ecosystem of compilers and assembly syntaxes, we - * have to call it "fsqrt" or "fsqrtd", depending on case. - * - * If the platform is not recognized, a call to the system - * library function sqrt() is performed. On some compilers, this - * may actually inline the relevant opcode, and call the library - * function only when the input is invalid (e.g. negative); - * Falcon never actually calls sqrt() on a negative value, but - * the dependency to libm will still be there. - */ - -#if FALCON_AVX2 // yyyAVX2+1 - fpr_sqrt_avx2(&x.v); - return x; -#else // yyyAVX2+0 -#if defined __GNUC__ && defined __SSE2_MATH__ - return FPR(_mm_cvtsd_f64(_mm_sqrt_pd(_mm_set1_pd(x.v)))); -#elif defined __GNUC__ && defined __i386__ - __asm__ __volatile__ ( - "fldl %0\n\t" - "fsqrt\n\t" - "fstpl %0\n\t" - : "+m" (x.v) : : ); - return x; -#elif defined _M_IX86 - __asm { - fld x.v - fsqrt - fstp x.v - } - return x; -#elif defined __PPC__ && defined __GNUC__ - fpr y; - -#if defined __clang__ - /* - * Normally we should use a 'd' constraint (register that contains - * a 'double' value) but Clang 3.8.1 chokes on it. Instead we use - * an 'f' constraint, counting on the fact that 'float' values - * are managed in double-precision registers anyway, and the - * compiler will not add extra rounding steps. - */ - __asm__ ( "fsqrt %0, %1" : "=f" (y.v) : "f" (x.v) : ); -#else - __asm__ ( "fsqrt %0, %1" : "=d" (y.v) : "d" (x.v) : ); -#endif - return y; -#elif (defined __ARM_FP && ((__ARM_FP & 0x08) == 0x08)) \ - || (!defined __ARM_FP && defined __ARM_VFPV2__) - /* - * On ARM, assembly syntaxes are a bit of a mess, depending on - * whether GCC or Clang is used, and the binutils version, and - * whether this is 32-bit or 64-bit mode. 
The code below appears - * to work on: - * 32-bit GCC-4.9.2 Clang-3.5 Binutils-2.25 - * 64-bit GCC-6.3.0 Clang-3.9 Binutils-2.28 - */ -#if defined __aarch64__ && __aarch64__ - __asm__ ( "fsqrt %d0, %d0" : "+w" (x.v) : : ); -#else - __asm__ ( "fsqrtd %P0, %P0" : "+w" (x.v) : : ); -#endif - return x; -#else - return FPR(sqrt(x.v)); -#endif -#endif // yyyAVX2- -} - -static inline int -fpr_lt(fpr x, fpr y) -{ - return x.v < y.v; -} - -TARGET_AVX2 -static inline uint64_t -fpr_expm_p63(fpr x, fpr ccs) -{ - /* - * Polynomial approximation of exp(-x) is taken from FACCT: - * https://eprint.iacr.org/2018/1234 - * Specifically, values are extracted from the implementation - * referenced from the FACCT article, and available at: - * https://github.com/raykzhao/gaussian - * Tests over more than 24 billions of random inputs in the - * 0..log(2) range have never shown a deviation larger than - * 2^(-50) from the true mathematical value. - */ - -#if FALCON_AVX2 // yyyAVX2+1 - - /* - * AVX2 implementation uses more operations than Horner's method, - * but with a lower expression tree depth. This helps because - * additions and multiplications have a latency of 4 cycles on - * a Skylake, but the CPU can issue two of them per cycle. 
- */ - - static const union { - double d[12]; - __m256d v[3]; - } c = { - { - 0.999999999999994892974086724280, - 0.500000000000019206858326015208, - 0.166666666666984014666397229121, - 0.041666666666110491190622155955, - 0.008333333327800835146903501993, - 0.001388888894063186997887560103, - 0.000198412739277311890541063977, - 0.000024801566833585381209939524, - 0.000002755586350219122514855659, - 0.000000275607356160477811864927, - 0.000000025299506379442070029551, - 0.000000002073772366009083061987 - } - }; - - double d1, d2, d4, d8, y; - __m256d d14, d58, d9c; - - d1 = -x.v; - d2 = d1 * d1; - d4 = d2 * d2; - d8 = d4 * d4; - d14 = _mm256_set_pd(d4, d2 * d1, d2, d1); - d58 = _mm256_mul_pd(d14, _mm256_set1_pd(d4)); - d9c = _mm256_mul_pd(d14, _mm256_set1_pd(d8)); - d14 = _mm256_mul_pd(d14, _mm256_loadu_pd(&c.d[0])); - d58 = FMADD(d58, _mm256_loadu_pd(&c.d[4]), d14); - d9c = FMADD(d9c, _mm256_loadu_pd(&c.d[8]), d58); - d9c = _mm256_hadd_pd(d9c, d9c); - y = 1.0 + _mm_cvtsd_f64(_mm256_castpd256_pd128(d9c)) // _mm256_cvtsd_f64(d9c) - + _mm_cvtsd_f64(_mm256_extractf128_pd(d9c, 1)); - y *= ccs.v; - - /* - * Final conversion goes through int64_t first, because that's what - * the underlying opcode (vcvttsd2si) will do, and we know that the - * result will fit, since x >= 0 and ccs < 1. If we did the - * conversion directly to uint64_t, then the compiler would add some - * extra code to cover the case of a source value of 2^63 or more, - * and though the alternate path would never be exercised, the - * extra comparison would cost us some cycles. - */ - return (uint64_t)(int64_t)(y * fpr_ptwo63.v); - -#else // yyyAVX2+0 - - /* - * Normal implementation uses Horner's method, which minimizes - * the number of operations. 
- */ - - double d, y; - - d = x.v; - y = 0.000000002073772366009083061987; - y = 0.000000025299506379442070029551 - y * d; - y = 0.000000275607356160477811864927 - y * d; - y = 0.000002755586350219122514855659 - y * d; - y = 0.000024801566833585381209939524 - y * d; - y = 0.000198412739277311890541063977 - y * d; - y = 0.001388888894063186997887560103 - y * d; - y = 0.008333333327800835146903501993 - y * d; - y = 0.041666666666110491190622155955 - y * d; - y = 0.166666666666984014666397229121 - y * d; - y = 0.500000000000019206858326015208 - y * d; - y = 0.999999999999994892974086724280 - y * d; - y = 1.000000000000000000000000000000 - y * d; - y *= ccs.v; - return (uint64_t)(y * fpr_ptwo63.v); - -#endif // yyyAVX2- -} - -#define fpr_gm_tab Zf(fpr_gm_tab) -extern const fpr fpr_gm_tab[]; - -#define fpr_p2_tab Zf(fpr_p2_tab) -extern const fpr fpr_p2_tab[]; - -/* ====================================================================== */ - -#else // yyyFPEMU+0 yyyFPNATIVE+0 - -#error No FP implementation selected - -#endif // yyyFPEMU- yyyFPNATIVE- diff --git a/crypto_sign/falcon-1024/m4-ct/inner.h b/crypto_sign/falcon-1024/m4-ct/inner.h deleted file mode 100644 index 1f7d0819..00000000 --- a/crypto_sign/falcon-1024/m4-ct/inner.h +++ /dev/null @@ -1,1168 +0,0 @@ -#ifndef FALCON_INNER_H__ -#define FALCON_INNER_H__ - -/* - * Internal functions for Falcon. This is not the API intended to be - * used by applications; instead, this internal API provides all the - * primitives on which wrappers build to provide external APIs. 
- * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -/* - * IMPORTANT API RULES - * ------------------- - * - * This API has some non-trivial usage rules: - * - * - * - All public functions (i.e. the non-static ones) must be referenced - * with the Zf() macro (e.g. Zf(verify_raw) for the verify_raw() - * function). That macro adds a prefix to the name, which is - * configurable with the FALCON_PREFIX macro. This allows compiling - * the code into a specific "namespace" and potentially including - * several versions of this code into a single application (e.g. to - * have an AVX2 and a non-AVX2 variants and select the one to use at - * runtime based on availability of AVX2 opcodes). 
- * - * - Functions that need temporary buffers expects them as a final - * tmp[] array of type uint8_t*, with a size which is documented for - * each function. However, most have some alignment requirements, - * because they will use the array to store 16-bit, 32-bit or 64-bit - * values (e.g. uint64_t or double). The caller must ensure proper - * alignment. What happens on unaligned access depends on the - * underlying architecture, ranging from a slight time penalty - * to immediate termination of the process. - * - * - Some functions rely on specific rounding rules and precision for - * floating-point numbers. On some systems (in particular 32-bit x86 - * with the 387 FPU), this requires setting an hardware control - * word. The caller MUST use set_fpu_cw() to ensure proper precision: - * - * oldcw = set_fpu_cw(2); - * Zf(sign_dyn)(...); - * set_fpu_cw(oldcw); - * - * On systems where the native floating-point precision is already - * proper, or integer-based emulation is used, the set_fpu_cw() - * function does nothing, so it can be called systematically. - */ - -// yyyPQCLEAN+0 yyyNIST+0 yyySUPERCOP+0 -#include "config.h" -// yyyPQCLEAN- yyyNIST- yyySUPERCOP- -// yyySUPERCOP+1 -// yyyCONF* -// yyySUPERCOP- - -#include -#include -#include - -#if defined FALCON_AVX2 && FALCON_AVX2 // yyyAVX2+1 -/* - * This implementation uses AVX2 and optionally FMA intrinsics. 
- */ -#include -#ifndef FALCON_LE -#define FALCON_LE 1 -#endif -#ifndef FALCON_UNALIGNED -#define FALCON_UNALIGNED 1 -#endif -#if defined __GNUC__ -#if defined FALCON_FMA && FALCON_FMA -#define TARGET_AVX2 __attribute__((target("avx2,fma"))) -#else -#define TARGET_AVX2 __attribute__((target("avx2"))) -#endif -#elif defined _MSC_VER && _MSC_VER -#pragma warning( disable : 4752 ) -#endif -#if defined FALCON_FMA && FALCON_FMA -#define FMADD(a, b, c) _mm256_fmadd_pd(a, b, c) -#define FMSUB(a, b, c) _mm256_fmsub_pd(a, b, c) -#else -#define FMADD(a, b, c) _mm256_add_pd(_mm256_mul_pd(a, b), c) -#define FMSUB(a, b, c) _mm256_sub_pd(_mm256_mul_pd(a, b), c) -#endif -#endif // yyyAVX2- - -// yyyNIST+0 yyyPQCLEAN+0 -/* - * On MSVC, disable warning about applying unary minus on an unsigned - * type: this is perfectly defined standard behaviour and we do it - * quite often. - */ -#if defined _MSC_VER && _MSC_VER -#pragma warning( disable : 4146 ) -#endif - -// yyySUPERCOP+0 -/* - * Enable ARM assembly on any ARMv7m platform (if it was not done before). 
- */ -#ifndef FALCON_ASM_CORTEXM4 -#if (defined __ARM_ARCH_7EM__ && __ARM_ARCH_7EM__) \ - && (defined __ARM_FEATURE_DSP && __ARM_FEATURE_DSP) -#define FALCON_ASM_CORTEXM4 1 -#else -#define FALCON_ASM_CORTEXM4 0 -#endif -#endif -// yyySUPERCOP- - -#if defined __i386__ || defined _M_IX86 \ - || defined __x86_64__ || defined _M_X64 || \ - (defined _ARCH_PWR8 && \ - (defined __LITTLE_ENDIAN || defined __LITTLE_ENDIAN__)) - -#ifndef FALCON_LE -#define FALCON_LE 1 -#endif -#ifndef FALCON_UNALIGNED -#define FALCON_UNALIGNED 1 -#endif - -#elif defined FALCON_ASM_CORTEXM4 && FALCON_ASM_CORTEXM4 - -#ifndef FALCON_LE -#define FALCON_LE 1 -#endif -#ifndef FALCON_UNALIGNED -#define FALCON_UNALIGNED 0 -#endif - -#elif (defined __LITTLE_ENDIAN__ && __LITTLE_ENDIAN__) \ - || (defined __BYTE_ORDER__ && defined __ORDER_LITTLE_ENDIAN__ \ - && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) - -#ifndef FALCON_LE -#define FALCON_LE 1 -#endif -#ifndef FALCON_UNALIGNED -#define FALCON_UNALIGNED 0 -#endif - -#else - -#ifndef FALCON_LE -#define FALCON_LE 0 -#endif -#ifndef FALCON_UNALIGNED -#define FALCON_UNALIGNED 0 -#endif - -#endif - -/* - * We ensure that both FALCON_FPEMU and FALCON_FPNATIVE are defined, - * with compatible values (exactly one of them must be non-zero). - * If none is defined, then default FP implementation is 'native' - * except on ARM Cortex M4. 
- */ -#if !defined FALCON_FPEMU && !defined FALCON_FPNATIVE - -#if (defined __ARM_FP && ((__ARM_FP & 0x08) == 0x08)) \ - || (!defined __ARM_FP && defined __ARM_VFPV2__) -#define FALCON_FPEMU 0 -#define FALCON_FPNATIVE 1 -#elif defined FALCON_ASM_CORTEXM4 && FALCON_ASM_CORTEXM4 -#define FALCON_FPEMU 1 -#define FALCON_FPNATIVE 0 -#else -#define FALCON_FPEMU 0 -#define FALCON_FPNATIVE 1 -#endif - -#elif defined FALCON_FPEMU && !defined FALCON_FPNATIVE - -#if FALCON_FPEMU -#define FALCON_FPNATIVE 0 -#else -#define FALCON_FPNATIVE 1 -#endif - -#elif defined FALCON_FPNATIVE && !defined FALCON_FPEMU - -#if FALCON_FPNATIVE -#define FALCON_FPEMU 0 -#else -#define FALCON_FPEMU 1 -#endif - -#endif - -#if (FALCON_FPEMU && FALCON_FPNATIVE) || (!FALCON_FPEMU && !FALCON_FPNATIVE) -#error Exactly one of FALCON_FPEMU and FALCON_FPNATIVE must be selected -#endif - -// yyySUPERCOP+0 -/* - * For seed generation from the operating system: - * - On Linux and glibc-2.25+, FreeBSD 12+ and OpenBSD, use getentropy(). - * - On Unix-like systems, use /dev/urandom (including as a fallback - * for failed getentropy() calls). - * - On Windows, use CryptGenRandom(). 
- */ - -#ifndef FALCON_RAND_GETENTROPY -#if (defined __linux__ && defined __GLIBC__ \ - && (__GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 25))) \ - || (defined __FreeBSD__ && __FreeBSD__ >= 12) \ - || defined __OpenBSD__ -#define FALCON_RAND_GETENTROPY 1 -#else -#define FALCON_RAND_GETENTROPY 0 -#endif -#endif - -#ifndef FALCON_RAND_URANDOM -#if defined _AIX \ - || defined __ANDROID__ \ - || defined __FreeBSD__ \ - || defined __NetBSD__ \ - || defined __OpenBSD__ \ - || defined __DragonFly__ \ - || defined __linux__ \ - || (defined __sun && (defined __SVR4 || defined __svr4__)) \ - || (defined __APPLE__ && defined __MACH__) -#define FALCON_RAND_URANDOM 1 -#else -#define FALCON_RAND_URANDOM 0 -#endif -#endif - -#ifndef FALCON_RAND_WIN32 -#if defined _WIN32 || defined _WIN64 -#define FALCON_RAND_WIN32 1 -#else -#define FALCON_RAND_WIN32 0 -#endif -#endif -// yyySUPERCOP- - -/* - * For still undefined compile-time macros, define them to 0 to avoid - * warnings with -Wundef. - */ -#ifndef FALCON_AVX2 -#define FALCON_AVX2 0 -#endif -#ifndef FALCON_FMA -#define FALCON_FMA 0 -#endif -#ifndef FALCON_KG_CHACHA20 -#define FALCON_KG_CHACHA20 0 -#endif -// yyyNIST- yyyPQCLEAN- - -// yyyPQCLEAN+0 yyySUPERCOP+0 -/* - * "Naming" macro used to apply a consistent prefix over all global - * symbols. - */ -#ifndef FALCON_PREFIX -#define FALCON_PREFIX falcon_inner -#endif -#define Zf(name) Zf_(FALCON_PREFIX, name) -#define Zf_(prefix, name) Zf__(prefix, name) -#define Zf__(prefix, name) prefix ## _ ## name -// yyyPQCLEAN- yyySUPERCOP- - -// yyyAVX2+1 -/* - * We use the TARGET_AVX2 macro to tag some functions which, in some - * configurations, may use AVX2 and FMA intrinsics; this depends on - * the compiler. In all other cases, we just define it to emptiness - * (i.e. it will have no effect). 
- */ -#ifndef TARGET_AVX2 -#define TARGET_AVX2 -#endif -// yyyAVX2- - -/* - * Some computations with floating-point elements, in particular - * rounding to the nearest integer, rely on operations using _exactly_ - * the precision of IEEE-754 binary64 type (i.e. 52 bits). On 32-bit - * x86, the 387 FPU may be used (depending on the target OS) and, in - * that case, may use more precision bits (i.e. 64 bits, for an 80-bit - * total type length); to prevent miscomputations, we define an explicit - * function that modifies the precision in the FPU control word. - * - * set_fpu_cw() sets the precision to the provided value, and returns - * the previously set precision; callers are supposed to restore the - * previous precision on exit. The correct (52-bit) precision is - * configured with the value "2". On unsupported compilers, or on - * targets other than 32-bit x86, or when the native 'double' type is - * not used, the set_fpu_cw() function does nothing at all. - */ -#if FALCON_FPNATIVE // yyyFPNATIVE+1 -#if defined __GNUC__ && defined __i386__ -static inline unsigned -set_fpu_cw(unsigned x) -{ - unsigned short t; - unsigned old; - - __asm__ __volatile__ ("fstcw %0" : "=m" (t) : : ); - old = (t & 0x0300u) >> 8; - t = (unsigned short)((t & ~0x0300u) | (x << 8)); - __asm__ __volatile__ ("fldcw %0" : : "m" (t) : ); - return old; -} -#elif defined _M_IX86 -static inline unsigned -set_fpu_cw(unsigned x) -{ - unsigned short t; - unsigned old; - - __asm { fstcw t } - old = (t & 0x0300u) >> 8; - t = (unsigned short)((t & ~0x0300u) | (x << 8)); - __asm { fldcw t } - return old; -} -#else -static inline unsigned -set_fpu_cw(unsigned x) -{ - return x; -} -#endif -#else // yyyFPNATIVE+0 -static inline unsigned -set_fpu_cw(unsigned x) -{ - return x; -} -#endif // yyyFPNATIVE- - -#if FALCON_FPNATIVE && !FALCON_AVX2 // yyyFPNATIVE+1 yyyAVX2+0 -/* - * If using the native 'double' type but not AVX2 code, on an x86 - * machine with SSE2 activated for maths, then we will use the - * 
SSE2 intrinsics. - */ -#if defined __GNUC__ && defined __SSE2_MATH__ -#include -#endif -#endif // yyyFPNATIVE- yyyAVX2- - -#if FALCON_FPNATIVE // yyyFPNATIVE+1 -/* - * For optimal reproducibility of values, we need to disable contraction - * of floating-point expressions; otherwise, on some architectures (e.g. - * PowerPC), the compiler may generate fused-multiply-add opcodes that - * may round differently than two successive separate opcodes. C99 defines - * a standard pragma for that, but GCC-6.2.2 appears to ignore it, - * hence the GCC-specific pragma (that Clang does not support). - */ -#if defined __clang__ -#pragma STDC FP_CONTRACT OFF -#elif defined __GNUC__ -#pragma GCC optimize ("fp-contract=off") -#endif -#endif // yyyFPNATIVE- - -// yyyPQCLEAN+0 -/* - * MSVC 2015 does not know the C99 keyword 'restrict'. - */ -#if defined _MSC_VER && _MSC_VER -#ifndef restrict -#define restrict __restrict -#endif -#endif -// yyyPQCLEAN- - -/* ==================================================================== */ -/* - * SHAKE256 implementation (shake.c). - * - * API is defined to be easily replaced with the fips202.h API defined - * as part of PQClean. 
- */ - -// yyyPQCLEAN+0 -/* -typedef struct { - union { - uint64_t A[25]; - uint8_t dbuf[200]; - } st; - uint64_t dptr; -} inner_shake256_context; - -#define inner_shake256_init Zf(i_shake256_init) -#define inner_shake256_inject Zf(i_shake256_inject) -#define inner_shake256_flip Zf(i_shake256_flip) -#define inner_shake256_extract Zf(i_shake256_extract) - -void Zf(i_shake256_init)( - inner_shake256_context *sc); -void Zf(i_shake256_inject)( - inner_shake256_context *sc, const uint8_t *in, size_t len); -void Zf(i_shake256_flip)( - inner_shake256_context *sc); -void Zf(i_shake256_extract)( - inner_shake256_context *sc, uint8_t *out, size_t len); -*/ - -// yyyPQCLEAN+1 - -#include "fips202.h" - -#define inner_shake256_context shake256incctx -#define inner_shake256_init(sc) shake256_inc_init(sc) -#define inner_shake256_inject(sc, in, len) shake256_inc_absorb(sc, in, len) -#define inner_shake256_flip(sc) shake256_inc_finalize(sc) -#define inner_shake256_extract(sc, out, len) shake256_inc_squeeze(out, len, sc) - -// yyyPQCLEAN+0 - -// yyyPQCLEAN- - -/* ==================================================================== */ -/* - * Encoding/decoding functions (codec.c). - * - * Encoding functions take as parameters an output buffer (out) with - * a given maximum length (max_out_len); returned value is the actual - * number of bytes which have been written. If the output buffer is - * not large enough, then 0 is returned (some bytes may have been - * written to the buffer). If 'out' is NULL, then 'max_out_len' is - * ignored; instead, the function computes and returns the actual - * required output length (in bytes). - * - * Decoding functions take as parameters an input buffer (in) with - * its maximum length (max_in_len); returned value is the actual number - * of bytes that have been read from the buffer. If the provided length - * is too short, then 0 is returned. - * - * Values to encode or decode are vectors of integers, with N = 2^logn - * elements. 
- * - * Three encoding formats are defined: - * - * - modq: sequence of values modulo 12289, each encoded over exactly - * 14 bits. The encoder and decoder verify that integers are within - * the valid range (0..12288). Values are arrays of uint16. - * - * - trim: sequence of signed integers, a specified number of bits - * each. The number of bits is provided as parameter and includes - * the sign bit. Each integer x must be such that |x| < 2^(bits-1) - * (which means that the -2^(bits-1) value is forbidden); encode and - * decode functions check that property. Values are arrays of - * int16_t or int8_t, corresponding to names 'trim_i16' and - * 'trim_i8', respectively. - * - * - comp: variable-length encoding for signed integers; each integer - * uses a minimum of 9 bits, possibly more. This is normally used - * only for signatures. - * - */ - -size_t Zf(modq_encode)(void *out, size_t max_out_len, - const uint16_t *x, unsigned logn); -size_t Zf(trim_i16_encode)(void *out, size_t max_out_len, - const int16_t *x, unsigned logn, unsigned bits); -size_t Zf(trim_i8_encode)(void *out, size_t max_out_len, - const int8_t *x, unsigned logn, unsigned bits); -size_t Zf(comp_encode)(void *out, size_t max_out_len, - const int16_t *x, unsigned logn); - -size_t Zf(modq_decode)(uint16_t *x, unsigned logn, - const void *in, size_t max_in_len); -size_t Zf(trim_i16_decode)(int16_t *x, unsigned logn, unsigned bits, - const void *in, size_t max_in_len); -size_t Zf(trim_i8_decode)(int8_t *x, unsigned logn, unsigned bits, - const void *in, size_t max_in_len); -size_t Zf(comp_decode)(int16_t *x, unsigned logn, - const void *in, size_t max_in_len); - -/* - * Number of bits for key elements, indexed by logn (1 to 10). This - * is at most 8 bits for all degrees, but some degrees may have shorter - * elements. - */ -extern const uint8_t Zf(max_fg_bits)[]; -extern const uint8_t Zf(max_FG_bits)[]; - -/* - * Maximum size, in bits, of elements in a signature, indexed by logn - * (1 to 10). 
The size includes the sign bit. - */ -extern const uint8_t Zf(max_sig_bits)[]; - -/* ==================================================================== */ -/* - * Support functions used for both signature generation and signature - * verification (common.c). - */ - -/* - * From a SHAKE256 context (must be already flipped), produce a new - * point. This is the non-constant-time version, which may leak enough - * information to serve as a stop condition on a brute force attack on - * the hashed message (provided that the nonce value is known). - */ -void Zf(hash_to_point_vartime)(inner_shake256_context *sc, - uint16_t *x, unsigned logn); - -/* - * From a SHAKE256 context (must be already flipped), produce a new - * point. The temporary buffer (tmp) must have room for 2*2^logn bytes. - * This function is constant-time but is typically more expensive than - * Zf(hash_to_point_vartime)(). - * - * tmp[] must have 16-bit alignment. - */ -void Zf(hash_to_point_ct)(inner_shake256_context *sc, - uint16_t *x, unsigned logn, uint8_t *tmp); - -/* - * Tell whether a given vector (2N coordinates, in two halves) is - * acceptable as a signature. This compares the appropriate norm of the - * vector with the acceptance bound. Returned value is 1 on success - * (vector is short enough to be acceptable), 0 otherwise. - */ -int Zf(is_short)(const int16_t *s1, const int16_t *s2, unsigned logn); - -/* - * Tell whether a given vector (2N coordinates, in two halves) is - * acceptable as a signature. Instead of the first half s1, this - * function receives the "saturated squared norm" of s1, i.e. the - * sum of the squares of the coordinates of s1 (saturated at 2^32-1 - * if the sum exceeds 2^31-1). - * - * Returned value is 1 on success (vector is short enough to be - * acceptable), 0 otherwise. 
- */ -int Zf(is_short_half)(uint32_t sqn, const int16_t *s2, unsigned logn); - -/* ==================================================================== */ -/* - * Signature verification functions (vrfy.c). - */ - -/* - * Convert a public key to NTT + Montgomery format. Conversion is done - * in place. - */ -void Zf(to_ntt_monty)(uint16_t *h, unsigned logn); - -/* - * Internal signature verification code: - * c0[] contains the hashed nonce+message - * s2[] is the decoded signature - * h[] contains the public key, in NTT + Montgomery format - * logn is the degree log - * tmp[] temporary, must have at least 2*2^logn bytes - * Returned value is 1 on success, 0 on error. - * - * tmp[] must have 16-bit alignment. - */ -int Zf(verify_raw)(const uint16_t *c0, const int16_t *s2, - const uint16_t *h, unsigned logn, uint8_t *tmp); - -/* - * Compute the public key h[], given the private key elements f[] and - * g[]. This computes h = g/f mod phi mod q, where phi is the polynomial - * modulus. This function returns 1 on success, 0 on error (an error is - * reported if f is not invertible mod phi mod q). - * - * The tmp[] array must have room for at least 2*2^logn elements. - * tmp[] must have 16-bit alignment. - */ -int Zf(compute_public)(uint16_t *h, - const int8_t *f, const int8_t *g, unsigned logn, uint8_t *tmp); - -/* - * Recompute the fourth private key element. Private key consists in - * four polynomials with small coefficients f, g, F and G, which are - * such that fG - gF = q mod phi; furthermore, f is invertible modulo - * phi and modulo q. This function recomputes G from f, g and F. - * - * The tmp[] array must have room for at least 4*2^logn bytes. - * - * Returned value is 1 in success, 0 on error (f not invertible). - * tmp[] must have 16-bit alignment. - */ -int Zf(complete_private)(int8_t *G, - const int8_t *f, const int8_t *g, const int8_t *F, - unsigned logn, uint8_t *tmp); - -/* - * Test whether a given polynomial is invertible modulo phi and q. 
- * Polynomial coefficients are small integers. - * - * tmp[] must have 16-bit alignment. - */ -int Zf(is_invertible)( - const int16_t *s2, unsigned logn, uint8_t *tmp); - -/* - * Count the number of elements of value zero in the NTT representation - * of the given polynomial: this is the number of primitive 2n-th roots - * of unity (modulo q = 12289) that are roots of the provided polynomial - * (taken modulo q). - * - * tmp[] must have 16-bit alignment. - */ -int Zf(count_nttzero)(const int16_t *sig, unsigned logn, uint8_t *tmp); - -/* - * Internal signature verification with public key recovery: - * h[] receives the public key (NOT in NTT/Montgomery format) - * c0[] contains the hashed nonce+message - * s1[] is the first signature half - * s2[] is the second signature half - * logn is the degree log - * tmp[] temporary, must have at least 2*2^logn bytes - * Returned value is 1 on success, 0 on error. Success is returned if - * the signature is a short enough vector; in that case, the public - * key has been written to h[]. However, the caller must still - * verify that h[] is the correct value (e.g. with regards to a known - * hash of the public key). - * - * h[] may not overlap with any of the other arrays. - * - * tmp[] must have 16-bit alignment. - */ -int Zf(verify_recover)(uint16_t *h, - const uint16_t *c0, const int16_t *s1, const int16_t *s2, - unsigned logn, uint8_t *tmp); - -/* ==================================================================== */ -/* - * Implementation of floating-point real numbers (fpr.h, fpr.c). - */ - -/* - * Real numbers are implemented by an extra header file, included below. - * This is meant to support pluggable implementations. The default - * implementation relies on the C type 'double'. 
- * - * The included file must define the following types, functions and - * constants: - * - * fpr - * type for a real number - * - * fpr fpr_of(int64_t i) - * cast an integer into a real number; source must be in the - * -(2^63-1)..+(2^63-1) range - * - * fpr fpr_scaled(int64_t i, int sc) - * compute i*2^sc as a real number; source 'i' must be in the - * -(2^63-1)..+(2^63-1) range - * - * fpr fpr_ldexp(fpr x, int e) - * compute x*2^e - * - * int64_t fpr_rint(fpr x) - * round x to the nearest integer; x must be in the -(2^63-1) - * to +(2^63-1) range - * - * int64_t fpr_trunc(fpr x) - * round to an integer; this rounds towards zero; value must - * be in the -(2^63-1) to +(2^63-1) range - * - * fpr fpr_add(fpr x, fpr y) - * compute x + y - * - * fpr fpr_sub(fpr x, fpr y) - * compute x - y - * - * fpr fpr_neg(fpr x) - * compute -x - * - * fpr fpr_half(fpr x) - * compute x/2 - * - * fpr fpr_double(fpr x) - * compute x*2 - * - * fpr fpr_mul(fpr x, fpr y) - * compute x * y - * - * fpr fpr_sqr(fpr x) - * compute x * x - * - * fpr fpr_inv(fpr x) - * compute 1/x - * - * fpr fpr_div(fpr x, fpr y) - * compute x/y - * - * fpr fpr_sqrt(fpr x) - * compute the square root of x - * - * int fpr_lt(fpr x, fpr y) - * return 1 if x < y, 0 otherwise - * - * uint64_t fpr_expm_p63(fpr x) - * return exp(x), assuming that 0 <= x < log(2). Returned value - * is scaled to 63 bits (i.e. it really returns 2^63*exp(-x), - * rounded to the nearest integer). Computation should have a - * precision of at least 45 bits. 
- * - * const fpr fpr_gm_tab[] - * array of constants for FFT / iFFT - * - * const fpr fpr_p2_tab[] - * precomputed powers of 2 (by index, 0 to 10) - * - * Constants of type 'fpr': - * - * fpr fpr_q 12289 - * fpr fpr_inverse_of_q 1/12289 - * fpr fpr_inv_2sqrsigma0 1/(2*(1.8205^2)) - * fpr fpr_inv_sigma 1/(1.55*sqrt(12289)) - * fpr fpr_sigma_min_9 1.291500756233514568549480827642 - * fpr fpr_sigma_min_10 1.311734375905083682667395805765 - * fpr fpr_log2 log(2) - * fpr fpr_inv_log2 1/log(2) - * fpr fpr_bnorm_max 16822.4121 - * fpr fpr_zero 0 - * fpr fpr_one 1 - * fpr fpr_two 2 - * fpr fpr_onehalf 0.5 - * fpr fpr_ptwo31 2^31 - * fpr fpr_ptwo31m1 2^31-1 - * fpr fpr_mtwo31m1 -(2^31-1) - * fpr fpr_ptwo63m1 2^63-1 - * fpr fpr_mtwo63m1 -(2^63-1) - * fpr fpr_ptwo63 2^63 - */ -#include "fpr.h" - -/* ==================================================================== */ -/* - * RNG (rng.c). - * - * A PRNG based on ChaCha20 is implemented; it is seeded from a SHAKE256 - * context (flipped) and is used for bulk pseudorandom generation. - * A system-dependent seed generator is also provided. - */ - -/* - * Obtain a random seed from the system RNG. - * - * Returned value is 1 on success, 0 on error. - */ -int Zf(get_seed)(void *seed, size_t seed_len); - -/* - * Structure for a PRNG. This includes a large buffer so that values - * get generated in advance. The 'state' is used to keep the current - * PRNG algorithm state (contents depend on the selected algorithm). - * - * The unions with 'dummy_u64' are there to ensure proper alignment for - * 64-bit direct access. - */ -typedef struct { - union { - uint8_t d[512]; /* MUST be 512, exactly */ - uint64_t dummy_u64; - } buf; - size_t ptr; - union { - uint8_t d[256]; - uint64_t dummy_u64; - } state; - int type; -} prng; - -/* - * Instantiate a PRNG. That PRNG will feed over the provided SHAKE256 - * context (in "flipped" state) to obtain its initial state. 
- */ -void Zf(prng_init)(prng *p, inner_shake256_context *src); - -/* - * Refill the PRNG buffer. This is normally invoked automatically, and - * is declared here only so that prng_get_u64() may be inlined. - */ -void Zf(prng_refill)(prng *p); - -/* - * Get some bytes from a PRNG. - */ -void Zf(prng_get_bytes)(prng *p, void *dst, size_t len); - -/* - * Get a 64-bit random value from a PRNG. - */ -static inline uint64_t -prng_get_u64(prng *p) -{ - size_t u; - - /* - * If there are less than 9 bytes in the buffer, we refill it. - * This means that we may drop the last few bytes, but this allows - * for faster extraction code. Also, it means that we never leave - * an empty buffer. - */ - u = p->ptr; - if (u >= (sizeof p->buf.d) - 9) { - Zf(prng_refill)(p); - u = 0; - } - p->ptr = u + 8; - - /* - * On systems that use little-endian encoding and allow - * unaligned accesses, we can simply read the data where it is. - */ -#if FALCON_LE && FALCON_UNALIGNED // yyyLEU+1 - return *(uint64_t *)(p->buf.d + u); -#else // yyyLEU+0 - return (uint64_t)p->buf.d[u + 0] - | ((uint64_t)p->buf.d[u + 1] << 8) - | ((uint64_t)p->buf.d[u + 2] << 16) - | ((uint64_t)p->buf.d[u + 3] << 24) - | ((uint64_t)p->buf.d[u + 4] << 32) - | ((uint64_t)p->buf.d[u + 5] << 40) - | ((uint64_t)p->buf.d[u + 6] << 48) - | ((uint64_t)p->buf.d[u + 7] << 56); -#endif // yyyLEU- -} - -/* - * Get an 8-bit random value from a PRNG. - */ -static inline unsigned -prng_get_u8(prng *p) -{ - unsigned v; - - v = p->buf.d[p->ptr ++]; - if (p->ptr == sizeof p->buf.d) { - Zf(prng_refill)(p); - } - return v; -} - -/* ==================================================================== */ -/* - * FFT (falcon-fft.c). - * - * A real polynomial is represented as an array of N 'fpr' elements. - * The FFT representation of a real polynomial contains N/2 complex - * elements; each is stored as two real numbers, for the real and - * imaginary parts, respectively. See falcon-fft.c for details on the - * internal representation. 
- */ - -/* - * Compute FFT in-place: the source array should contain a real - * polynomial (N coefficients); its storage area is reused to store - * the FFT representation of that polynomial (N/2 complex numbers). - * - * 'logn' MUST lie between 1 and 10 (inclusive). - */ -void Zf(FFT)(fpr *f, unsigned logn); - -/* - * Compute the inverse FFT in-place: the source array should contain the - * FFT representation of a real polynomial (N/2 elements); the resulting - * real polynomial (N coefficients of type 'fpr') is written over the - * array. - * - * 'logn' MUST lie between 1 and 10 (inclusive). - */ -void Zf(iFFT)(fpr *f, unsigned logn); - -/* - * Add polynomial b to polynomial a. a and b MUST NOT overlap. This - * function works in both normal and FFT representations. - */ -void Zf(poly_add)(fpr *restrict a, const fpr *restrict b, unsigned logn); - -/* - * Subtract polynomial b from polynomial a. a and b MUST NOT overlap. This - * function works in both normal and FFT representations. - */ -void Zf(poly_sub)(fpr *restrict a, const fpr *restrict b, unsigned logn); - -/* - * Negate polynomial a. This function works in both normal and FFT - * representations. - */ -void Zf(poly_neg)(fpr *a, unsigned logn); - -/* - * Compute adjoint of polynomial a. This function works only in FFT - * representation. - */ -void Zf(poly_adj_fft)(fpr *a, unsigned logn); - -/* - * Multiply polynomial a with polynomial b. a and b MUST NOT overlap. - * This function works only in FFT representation. - */ -void Zf(poly_mul_fft)(fpr *restrict a, const fpr *restrict b, unsigned logn); - -/* - * Multiply polynomial a with the adjoint of polynomial b. a and b MUST NOT - * overlap. This function works only in FFT representation. - */ -void Zf(poly_muladj_fft)(fpr *restrict a, const fpr *restrict b, unsigned logn); - -/* - * Multiply polynomial with its own adjoint. This function works only in FFT - * representation. 
- */ -void Zf(poly_mulselfadj_fft)(fpr *a, unsigned logn); - -/* - * Multiply polynomial with a real constant. This function works in both - * normal and FFT representations. - */ -void Zf(poly_mulconst)(fpr *a, fpr x, unsigned logn); - -/* - * Divide polynomial a by polynomial b, modulo X^N+1 (FFT representation). - * a and b MUST NOT overlap. - */ -void Zf(poly_div_fft)(fpr *restrict a, const fpr *restrict b, unsigned logn); - -/* - * Given f and g (in FFT representation), compute 1/(f*adj(f)+g*adj(g)) - * (also in FFT representation). Since the result is auto-adjoint, all its - * coordinates in FFT representation are real; as such, only the first N/2 - * values of d[] are filled (the imaginary parts are skipped). - * - * Array d MUST NOT overlap with either a or b. - */ -void Zf(poly_invnorm2_fft)(fpr *restrict d, - const fpr *restrict a, const fpr *restrict b, unsigned logn); - -/* - * Given F, G, f and g (in FFT representation), compute F*adj(f)+G*adj(g) - * (also in FFT representation). Destination d MUST NOT overlap with - * any of the source arrays. - */ -void Zf(poly_add_muladj_fft)(fpr *restrict d, - const fpr *restrict F, const fpr *restrict G, - const fpr *restrict f, const fpr *restrict g, unsigned logn); - -/* - * Multiply polynomial a by polynomial b, where b is autoadjoint. Both - * a and b are in FFT representation. Since b is autoadjoint, all its - * FFT coefficients are real, and the array b contains only N/2 elements. - * a and b MUST NOT overlap. - */ -void Zf(poly_mul_autoadj_fft)(fpr *restrict a, - const fpr *restrict b, unsigned logn); - -/* - * Divide polynomial a by polynomial b, where b is autoadjoint. Both - * a and b are in FFT representation. Since b is autoadjoint, all its - * FFT coefficients are real, and the array b contains only N/2 elements. - * a and b MUST NOT overlap. 
- */ -void Zf(poly_div_autoadj_fft)(fpr *restrict a, - const fpr *restrict b, unsigned logn); - -/* - * Perform an LDL decomposition of an auto-adjoint matrix G, in FFT - * representation. On input, g00, g01 and g11 are provided (where the - * matrix G = [[g00, g01], [adj(g01), g11]]). On output, the d00, l10 - * and d11 values are written in g00, g01 and g11, respectively - * (with D = [[d00, 0], [0, d11]] and L = [[1, 0], [l10, 1]]). - * (In fact, d00 = g00, so the g00 operand is left unmodified.) - */ -void Zf(poly_LDL_fft)(const fpr *restrict g00, - fpr *restrict g01, fpr *restrict g11, unsigned logn); - -/* - * Perform an LDL decomposition of an auto-adjoint matrix G, in FFT - * representation. This is identical to poly_LDL_fft() except that - * g00, g01 and g11 are unmodified; the outputs d11 and l10 are written - * in two other separate buffers provided as extra parameters. - */ -void Zf(poly_LDLmv_fft)(fpr *restrict d11, fpr *restrict l10, - const fpr *restrict g00, const fpr *restrict g01, - const fpr *restrict g11, unsigned logn); - -/* - * Apply "split" operation on a polynomial in FFT representation: - * f = f0(x^2) + x*f1(x^2), for half-size polynomials f0 and f1 - * (polynomials modulo X^(N/2)+1). f0, f1 and f MUST NOT overlap. - */ -void Zf(poly_split_fft)(fpr *restrict f0, fpr *restrict f1, - const fpr *restrict f, unsigned logn); - -/* - * Apply "merge" operation on two polynomials in FFT representation: - * given f0 and f1, polynomials moduo X^(N/2)+1, this function computes - * f = f0(x^2) + x*f1(x^2), in FFT representation modulo X^N+1. - * f MUST NOT overlap with either f0 or f1. - */ -void Zf(poly_merge_fft)(fpr *restrict f, - const fpr *restrict f0, const fpr *restrict f1, unsigned logn); - -/* ==================================================================== */ -/* - * Key pair generation. - */ - -/* - * Required sizes of the temporary buffer (in bytes). 
- * - * This size is 28*2^logn bytes, except for degrees 2 and 4 (logn = 1 - * or 2) where it is slightly greater. - */ -#define FALCON_KEYGEN_TEMP_1 136 -#define FALCON_KEYGEN_TEMP_2 272 -#define FALCON_KEYGEN_TEMP_3 224 -#define FALCON_KEYGEN_TEMP_4 448 -#define FALCON_KEYGEN_TEMP_5 896 -#define FALCON_KEYGEN_TEMP_6 1792 -#define FALCON_KEYGEN_TEMP_7 3584 -#define FALCON_KEYGEN_TEMP_8 7168 -#define FALCON_KEYGEN_TEMP_9 14336 -#define FALCON_KEYGEN_TEMP_10 28672 - -/* - * Generate a new key pair. Randomness is extracted from the provided - * SHAKE256 context, which must have already been seeded and flipped. - * The tmp[] array must have suitable size (see FALCON_KEYGEN_TEMP_* - * macros) and be aligned for the uint32_t, uint64_t and fpr types. - * - * The private key elements are written in f, g, F and G, and the - * public key is written in h. Either or both of G and h may be NULL, - * in which case the corresponding element is not returned (they can - * be recomputed from f, g and F). - * - * tmp[] must have 64-bit alignment. - * This function uses floating-point rounding (see set_fpu_cw()). - */ -void Zf(keygen)(inner_shake256_context *rng, - int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h, - unsigned logn, uint8_t *tmp); - -/* ==================================================================== */ -/* - * Signature generation. - */ - -/* - * Expand a private key into the B0 matrix in FFT representation and - * the LDL tree. All the values are written in 'expanded_key', for - * a total of (8*logn+40)*2^logn bytes. - * - * The tmp[] array must have room for at least 48*2^logn bytes. - * - * tmp[] must have 64-bit alignment. - * This function uses floating-point rounding (see set_fpu_cw()). 
- */ -void Zf(expand_privkey)(fpr *restrict expanded_key, - const int8_t *f, const int8_t *g, const int8_t *F, const int8_t *G, - unsigned logn, uint8_t *restrict tmp); - -/* - * Compute a signature over the provided hashed message (hm); the - * signature value is one short vector. This function uses an - * expanded key (as generated by Zf(expand_privkey)()). - * - * The sig[] and hm[] buffers may overlap. - * - * On successful output, the start of the tmp[] buffer contains the s1 - * vector (as int16_t elements). - * - * The minimal size (in bytes) of tmp[] is 48*2^logn bytes. - * - * tmp[] must have 64-bit alignment. - * This function uses floating-point rounding (see set_fpu_cw()). - */ -void Zf(sign_tree)(int16_t *sig, inner_shake256_context *rng, - const fpr *restrict expanded_key, - const uint16_t *hm, unsigned logn, uint8_t *tmp); - -/* - * Compute a signature over the provided hashed message (hm); the - * signature value is one short vector. This function uses a raw - * key and dynamically recompute the B0 matrix and LDL tree; this - * saves RAM since there is no needed for an expanded key, but - * increases the signature cost. - * - * The sig[] and hm[] buffers may overlap. - * - * On successful output, the start of the tmp[] buffer contains the s1 - * vector (as int16_t elements). - * - * The minimal size (in bytes) of tmp[] is 72*2^logn bytes. - * - * tmp[] must have 64-bit alignment. - * This function uses floating-point rounding (see set_fpu_cw()). - */ -void Zf(sign_dyn)(int16_t *sig, inner_shake256_context *rng, - const int8_t *restrict f, const int8_t *restrict g, - const int8_t *restrict F, const int8_t *restrict G, - const uint16_t *hm, unsigned logn, uint8_t *tmp); - -/* - * Internal sampler engine. Exported for tests. - * - * sampler_context wraps around a source of random numbers (PRNG) and - * the sigma_min value (nominally dependent on the degree). 
- * - * sampler() takes as parameters: - * ctx pointer to the sampler_context structure - * mu center for the distribution - * isigma inverse of the distribution standard deviation - * It returns an integer sampled along the Gaussian distribution centered - * on mu and of standard deviation sigma = 1/isigma. - * - * gaussian0_sampler() takes as parameter a pointer to a PRNG, and - * returns an integer sampled along a half-Gaussian with standard - * deviation sigma0 = 1.8205 (center is 0, returned value is - * nonnegative). - */ - -typedef struct { - prng p; - fpr sigma_min; -} sampler_context; - -TARGET_AVX2 -int Zf(sampler)(void *ctx, fpr mu, fpr isigma); - -TARGET_AVX2 -int Zf(gaussian0_sampler)(prng *p); - -/* ==================================================================== */ - -#endif diff --git a/crypto_sign/falcon-1024/m4-ct/keygen.c b/crypto_sign/falcon-1024/m4-ct/keygen.c deleted file mode 100644 index cf7de008..00000000 --- a/crypto_sign/falcon-1024/m4-ct/keygen.c +++ /dev/null @@ -1,4301 +0,0 @@ -/* - * Falcon key pair generation. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include "inner.h" - -#define MKN(logn) ((size_t)1 << (logn)) - -/* ==================================================================== */ -/* - * Modular arithmetics. - * - * We implement a few functions for computing modulo a small integer p. - * - * All functions require that 2^30 < p < 2^31. Moreover, operands must - * be in the 0..p-1 range. - * - * Modular addition and subtraction work for all such p. - * - * Montgomery multiplication requires that p is odd, and must be provided - * with an additional value p0i = -1/p mod 2^31. See below for some basics - * on Montgomery multiplication. - * - * Division computes an inverse modulo p by an exponentiation (with - * exponent p-2): this works only if p is prime. Multiplication - * requirements also apply, i.e. p must be odd and p0i must be provided. - * - * The NTT and inverse NTT need all of the above, and also that - * p = 1 mod 2048. - * - * ----------------------------------------------------------------------- - * - * We use Montgomery representation with 31-bit values: - * - * Let R = 2^31 mod p. When 2^30 < p < 2^31, R = 2^31 - p. - * Montgomery representation of an integer x modulo p is x*R mod p. - * - * Montgomery multiplication computes (x*y)/R mod p for - * operands x and y. 
Therefore: - * - * - if operands are x*R and y*R (Montgomery representations of x and - * y), then Montgomery multiplication computes (x*R*y*R)/R = (x*y)*R - * mod p, which is the Montgomery representation of the product x*y; - * - * - if operands are x*R and y (or x and y*R), then Montgomery - * multiplication returns x*y mod p: mixed-representation - * multiplications yield results in normal representation. - * - * To convert to Montgomery representation, we multiply by R, which is done - * by Montgomery-multiplying by R^2. Stand-alone conversion back from - * Montgomery representation is Montgomery-multiplication by 1. - */ - -/* - * Precomputed small primes. Each element contains the following: - * - * p The prime itself. - * - * g A primitive root of phi = X^N+1 (in field Z_p). - * - * s The inverse of the product of all previous primes in the array, - * computed modulo p and in Montgomery representation. - * - * All primes are such that p = 1 mod 2048, and are lower than 2^31. They - * are listed in decreasing order. 
- */ - -typedef struct { - uint32_t p; - uint32_t g; - uint32_t s; -} small_prime; - -static const small_prime PRIMES[] = { - { 2147473409, 383167813, 10239 }, - { 2147389441, 211808905, 471403745 }, - { 2147387393, 37672282, 1329335065 }, - { 2147377153, 1977035326, 968223422 }, - { 2147358721, 1067163706, 132460015 }, - { 2147352577, 1606082042, 598693809 }, - { 2147346433, 2033915641, 1056257184 }, - { 2147338241, 1653770625, 421286710 }, - { 2147309569, 631200819, 1111201074 }, - { 2147297281, 2038364663, 1042003613 }, - { 2147295233, 1962540515, 19440033 }, - { 2147239937, 2100082663, 353296760 }, - { 2147235841, 1991153006, 1703918027 }, - { 2147217409, 516405114, 1258919613 }, - { 2147205121, 409347988, 1089726929 }, - { 2147196929, 927788991, 1946238668 }, - { 2147178497, 1136922411, 1347028164 }, - { 2147100673, 868626236, 701164723 }, - { 2147082241, 1897279176, 617820870 }, - { 2147074049, 1888819123, 158382189 }, - { 2147051521, 25006327, 522758543 }, - { 2147043329, 327546255, 37227845 }, - { 2147039233, 766324424, 1133356428 }, - { 2146988033, 1862817362, 73861329 }, - { 2146963457, 404622040, 653019435 }, - { 2146959361, 1936581214, 995143093 }, - { 2146938881, 1559770096, 634921513 }, - { 2146908161, 422623708, 1985060172 }, - { 2146885633, 1751189170, 298238186 }, - { 2146871297, 578919515, 291810829 }, - { 2146846721, 1114060353, 915902322 }, - { 2146834433, 2069565474, 47859524 }, - { 2146818049, 1552824584, 646281055 }, - { 2146775041, 1906267847, 1597832891 }, - { 2146756609, 1847414714, 1228090888 }, - { 2146744321, 1818792070, 1176377637 }, - { 2146738177, 1118066398, 1054971214 }, - { 2146736129, 52057278, 933422153 }, - { 2146713601, 592259376, 1406621510 }, - { 2146695169, 263161877, 1514178701 }, - { 2146656257, 685363115, 384505091 }, - { 2146650113, 927727032, 537575289 }, - { 2146646017, 52575506, 1799464037 }, - { 2146643969, 1276803876, 1348954416 }, - { 2146603009, 814028633, 1521547704 }, - { 2146572289, 1846678872, 1310832121 }, - 
{ 2146547713, 919368090, 1019041349 }, - { 2146508801, 671847612, 38582496 }, - { 2146492417, 283911680, 532424562 }, - { 2146490369, 1780044827, 896447978 }, - { 2146459649, 327980850, 1327906900 }, - { 2146447361, 1310561493, 958645253 }, - { 2146441217, 412148926, 287271128 }, - { 2146437121, 293186449, 2009822534 }, - { 2146430977, 179034356, 1359155584 }, - { 2146418689, 1517345488, 1790248672 }, - { 2146406401, 1615820390, 1584833571 }, - { 2146404353, 826651445, 607120498 }, - { 2146379777, 3816988, 1897049071 }, - { 2146363393, 1221409784, 1986921567 }, - { 2146355201, 1388081168, 849968120 }, - { 2146336769, 1803473237, 1655544036 }, - { 2146312193, 1023484977, 273671831 }, - { 2146293761, 1074591448, 467406983 }, - { 2146283521, 831604668, 1523950494 }, - { 2146203649, 712865423, 1170834574 }, - { 2146154497, 1764991362, 1064856763 }, - { 2146142209, 627386213, 1406840151 }, - { 2146127873, 1638674429, 2088393537 }, - { 2146099201, 1516001018, 690673370 }, - { 2146093057, 1294931393, 315136610 }, - { 2146091009, 1942399533, 973539425 }, - { 2146078721, 1843461814, 2132275436 }, - { 2146060289, 1098740778, 360423481 }, - { 2146048001, 1617213232, 1951981294 }, - { 2146041857, 1805783169, 2075683489 }, - { 2146019329, 272027909, 1753219918 }, - { 2145986561, 1206530344, 2034028118 }, - { 2145976321, 1243769360, 1173377644 }, - { 2145964033, 887200839, 1281344586 }, - { 2145906689, 1651026455, 906178216 }, - { 2145875969, 1673238256, 1043521212 }, - { 2145871873, 1226591210, 1399796492 }, - { 2145841153, 1465353397, 1324527802 }, - { 2145832961, 1150638905, 554084759 }, - { 2145816577, 221601706, 427340863 }, - { 2145785857, 608896761, 316590738 }, - { 2145755137, 1712054942, 1684294304 }, - { 2145742849, 1302302867, 724873116 }, - { 2145728513, 516717693, 431671476 }, - { 2145699841, 524575579, 1619722537 }, - { 2145691649, 1925625239, 982974435 }, - { 2145687553, 463795662, 1293154300 }, - { 2145673217, 771716636, 881778029 }, - { 2145630209, 1509556977, 
837364988 }, - { 2145595393, 229091856, 851648427 }, - { 2145587201, 1796903241, 635342424 }, - { 2145525761, 715310882, 1677228081 }, - { 2145495041, 1040930522, 200685896 }, - { 2145466369, 949804237, 1809146322 }, - { 2145445889, 1673903706, 95316881 }, - { 2145390593, 806941852, 1428671135 }, - { 2145372161, 1402525292, 159350694 }, - { 2145361921, 2124760298, 1589134749 }, - { 2145359873, 1217503067, 1561543010 }, - { 2145355777, 338341402, 83865711 }, - { 2145343489, 1381532164, 641430002 }, - { 2145325057, 1883895478, 1528469895 }, - { 2145318913, 1335370424, 65809740 }, - { 2145312769, 2000008042, 1919775760 }, - { 2145300481, 961450962, 1229540578 }, - { 2145282049, 910466767, 1964062701 }, - { 2145232897, 816527501, 450152063 }, - { 2145218561, 1435128058, 1794509700 }, - { 2145187841, 33505311, 1272467582 }, - { 2145181697, 269767433, 1380363849 }, - { 2145175553, 56386299, 1316870546 }, - { 2145079297, 2106880293, 1391797340 }, - { 2145021953, 1347906152, 720510798 }, - { 2145015809, 206769262, 1651459955 }, - { 2145003521, 1885513236, 1393381284 }, - { 2144960513, 1810381315, 31937275 }, - { 2144944129, 1306487838, 2019419520 }, - { 2144935937, 37304730, 1841489054 }, - { 2144894977, 1601434616, 157985831 }, - { 2144888833, 98749330, 2128592228 }, - { 2144880641, 1772327002, 2076128344 }, - { 2144864257, 1404514762, 2029969964 }, - { 2144827393, 801236594, 406627220 }, - { 2144806913, 349217443, 1501080290 }, - { 2144796673, 1542656776, 2084736519 }, - { 2144778241, 1210734884, 1746416203 }, - { 2144759809, 1146598851, 716464489 }, - { 2144757761, 286328400, 1823728177 }, - { 2144729089, 1347555695, 1836644881 }, - { 2144727041, 1795703790, 520296412 }, - { 2144696321, 1302475157, 852964281 }, - { 2144667649, 1075877614, 504992927 }, - { 2144573441, 198765808, 1617144982 }, - { 2144555009, 321528767, 155821259 }, - { 2144550913, 814139516, 1819937644 }, - { 2144536577, 571143206, 962942255 }, - { 2144524289, 1746733766, 2471321 }, - { 2144512001, 
1821415077, 124190939 }, - { 2144468993, 917871546, 1260072806 }, - { 2144458753, 378417981, 1569240563 }, - { 2144421889, 175229668, 1825620763 }, - { 2144409601, 1699216963, 351648117 }, - { 2144370689, 1071885991, 958186029 }, - { 2144348161, 1763151227, 540353574 }, - { 2144335873, 1060214804, 919598847 }, - { 2144329729, 663515846, 1448552668 }, - { 2144327681, 1057776305, 590222840 }, - { 2144309249, 1705149168, 1459294624 }, - { 2144296961, 325823721, 1649016934 }, - { 2144290817, 738775789, 447427206 }, - { 2144243713, 962347618, 893050215 }, - { 2144237569, 1655257077, 900860862 }, - { 2144161793, 242206694, 1567868672 }, - { 2144155649, 769415308, 1247993134 }, - { 2144137217, 320492023, 515841070 }, - { 2144120833, 1639388522, 770877302 }, - { 2144071681, 1761785233, 964296120 }, - { 2144065537, 419817825, 204564472 }, - { 2144028673, 666050597, 2091019760 }, - { 2144010241, 1413657615, 1518702610 }, - { 2143952897, 1238327946, 475672271 }, - { 2143940609, 307063413, 1176750846 }, - { 2143918081, 2062905559, 786785803 }, - { 2143899649, 1338112849, 1562292083 }, - { 2143891457, 68149545, 87166451 }, - { 2143885313, 921750778, 394460854 }, - { 2143854593, 719766593, 133877196 }, - { 2143836161, 1149399850, 1861591875 }, - { 2143762433, 1848739366, 1335934145 }, - { 2143756289, 1326674710, 102999236 }, - { 2143713281, 808061791, 1156900308 }, - { 2143690753, 388399459, 1926468019 }, - { 2143670273, 1427891374, 1756689401 }, - { 2143666177, 1912173949, 986629565 }, - { 2143645697, 2041160111, 371842865 }, - { 2143641601, 1279906897, 2023974350 }, - { 2143635457, 720473174, 1389027526 }, - { 2143621121, 1298309455, 1732632006 }, - { 2143598593, 1548762216, 1825417506 }, - { 2143567873, 620475784, 1073787233 }, - { 2143561729, 1932954575, 949167309 }, - { 2143553537, 354315656, 1652037534 }, - { 2143541249, 577424288, 1097027618 }, - { 2143531009, 357862822, 478640055 }, - { 2143522817, 2017706025, 1550531668 }, - { 2143506433, 2078127419, 1824320165 }, - { 
2143488001, 613475285, 1604011510 }, - { 2143469569, 1466594987, 502095196 }, - { 2143426561, 1115430331, 1044637111 }, - { 2143383553, 9778045, 1902463734 }, - { 2143377409, 1557401276, 2056861771 }, - { 2143363073, 652036455, 1965915971 }, - { 2143260673, 1464581171, 1523257541 }, - { 2143246337, 1876119649, 764541916 }, - { 2143209473, 1614992673, 1920672844 }, - { 2143203329, 981052047, 2049774209 }, - { 2143160321, 1847355533, 728535665 }, - { 2143129601, 965558457, 603052992 }, - { 2143123457, 2140817191, 8348679 }, - { 2143100929, 1547263683, 694209023 }, - { 2143092737, 643459066, 1979934533 }, - { 2143082497, 188603778, 2026175670 }, - { 2143062017, 1657329695, 377451099 }, - { 2143051777, 114967950, 979255473 }, - { 2143025153, 1698431342, 1449196896 }, - { 2143006721, 1862741675, 1739650365 }, - { 2142996481, 756660457, 996160050 }, - { 2142976001, 927864010, 1166847574 }, - { 2142965761, 905070557, 661974566 }, - { 2142916609, 40932754, 1787161127 }, - { 2142892033, 1987985648, 675335382 }, - { 2142885889, 797497211, 1323096997 }, - { 2142871553, 2068025830, 1411877159 }, - { 2142861313, 1217177090, 1438410687 }, - { 2142830593, 409906375, 1767860634 }, - { 2142803969, 1197788993, 359782919 }, - { 2142785537, 643817365, 513932862 }, - { 2142779393, 1717046338, 218943121 }, - { 2142724097, 89336830, 416687049 }, - { 2142707713, 5944581, 1356813523 }, - { 2142658561, 887942135, 2074011722 }, - { 2142638081, 151851972, 1647339939 }, - { 2142564353, 1691505537, 1483107336 }, - { 2142533633, 1989920200, 1135938817 }, - { 2142529537, 959263126, 1531961857 }, - { 2142527489, 453251129, 1725566162 }, - { 2142502913, 1536028102, 182053257 }, - { 2142498817, 570138730, 701443447 }, - { 2142416897, 326965800, 411931819 }, - { 2142363649, 1675665410, 1517191733 }, - { 2142351361, 968529566, 1575712703 }, - { 2142330881, 1384953238, 1769087884 }, - { 2142314497, 1977173242, 1833745524 }, - { 2142289921, 95082313, 1714775493 }, - { 2142283777, 109377615, 1070584533 
}, - { 2142277633, 16960510, 702157145 }, - { 2142263297, 553850819, 431364395 }, - { 2142208001, 241466367, 2053967982 }, - { 2142164993, 1795661326, 1031836848 }, - { 2142097409, 1212530046, 712772031 }, - { 2142087169, 1763869720, 822276067 }, - { 2142078977, 644065713, 1765268066 }, - { 2142074881, 112671944, 643204925 }, - { 2142044161, 1387785471, 1297890174 }, - { 2142025729, 783885537, 1000425730 }, - { 2142011393, 905662232, 1679401033 }, - { 2141974529, 799788433, 468119557 }, - { 2141943809, 1932544124, 449305555 }, - { 2141933569, 1527403256, 841867925 }, - { 2141931521, 1247076451, 743823916 }, - { 2141902849, 1199660531, 401687910 }, - { 2141890561, 150132350, 1720336972 }, - { 2141857793, 1287438162, 663880489 }, - { 2141833217, 618017731, 1819208266 }, - { 2141820929, 999578638, 1403090096 }, - { 2141786113, 81834325, 1523542501 }, - { 2141771777, 120001928, 463556492 }, - { 2141759489, 122455485, 2124928282 }, - { 2141749249, 141986041, 940339153 }, - { 2141685761, 889088734, 477141499 }, - { 2141673473, 324212681, 1122558298 }, - { 2141669377, 1175806187, 1373818177 }, - { 2141655041, 1113654822, 296887082 }, - { 2141587457, 991103258, 1585913875 }, - { 2141583361, 1401451409, 1802457360 }, - { 2141575169, 1571977166, 712760980 }, - { 2141546497, 1107849376, 1250270109 }, - { 2141515777, 196544219, 356001130 }, - { 2141495297, 1733571506, 1060744866 }, - { 2141483009, 321552363, 1168297026 }, - { 2141458433, 505818251, 733225819 }, - { 2141360129, 1026840098, 948342276 }, - { 2141325313, 945133744, 2129965998 }, - { 2141317121, 1871100260, 1843844634 }, - { 2141286401, 1790639498, 1750465696 }, - { 2141267969, 1376858592, 186160720 }, - { 2141255681, 2129698296, 1876677959 }, - { 2141243393, 2138900688, 1340009628 }, - { 2141214721, 1933049835, 1087819477 }, - { 2141212673, 1898664939, 1786328049 }, - { 2141202433, 990234828, 940682169 }, - { 2141175809, 1406392421, 993089586 }, - { 2141165569, 1263518371, 289019479 }, - { 2141073409, 1485624211, 
507864514 }, - { 2141052929, 1885134788, 311252465 }, - { 2141040641, 1285021247, 280941862 }, - { 2141028353, 1527610374, 375035110 }, - { 2141011969, 1400626168, 164696620 }, - { 2140999681, 632959608, 966175067 }, - { 2140997633, 2045628978, 1290889438 }, - { 2140993537, 1412755491, 375366253 }, - { 2140942337, 719477232, 785367828 }, - { 2140925953, 45224252, 836552317 }, - { 2140917761, 1157376588, 1001839569 }, - { 2140887041, 278480752, 2098732796 }, - { 2140837889, 1663139953, 924094810 }, - { 2140788737, 802501511, 2045368990 }, - { 2140766209, 1820083885, 1800295504 }, - { 2140764161, 1169561905, 2106792035 }, - { 2140696577, 127781498, 1885987531 }, - { 2140684289, 16014477, 1098116827 }, - { 2140653569, 665960598, 1796728247 }, - { 2140594177, 1043085491, 377310938 }, - { 2140579841, 1732838211, 1504505945 }, - { 2140569601, 302071939, 358291016 }, - { 2140567553, 192393733, 1909137143 }, - { 2140557313, 406595731, 1175330270 }, - { 2140549121, 1748850918, 525007007 }, - { 2140477441, 499436566, 1031159814 }, - { 2140469249, 1886004401, 1029951320 }, - { 2140426241, 1483168100, 1676273461 }, - { 2140420097, 1779917297, 846024476 }, - { 2140413953, 522948893, 1816354149 }, - { 2140383233, 1931364473, 1296921241 }, - { 2140366849, 1917356555, 147196204 }, - { 2140354561, 16466177, 1349052107 }, - { 2140348417, 1875366972, 1860485634 }, - { 2140323841, 456498717, 1790256483 }, - { 2140321793, 1629493973, 150031888 }, - { 2140315649, 1904063898, 395510935 }, - { 2140280833, 1784104328, 831417909 }, - { 2140250113, 256087139, 697349101 }, - { 2140229633, 388553070, 243875754 }, - { 2140223489, 747459608, 1396270850 }, - { 2140200961, 507423743, 1895572209 }, - { 2140162049, 580106016, 2045297469 }, - { 2140149761, 712426444, 785217995 }, - { 2140137473, 1441607584, 536866543 }, - { 2140119041, 346538902, 1740434653 }, - { 2140090369, 282642885, 21051094 }, - { 2140076033, 1407456228, 319910029 }, - { 2140047361, 1619330500, 1488632070 }, - { 2140041217, 
2089408064, 2012026134 }, - { 2140008449, 1705524800, 1613440760 }, - { 2139924481, 1846208233, 1280649481 }, - { 2139906049, 989438755, 1185646076 }, - { 2139867137, 1522314850, 372783595 }, - { 2139842561, 1681587377, 216848235 }, - { 2139826177, 2066284988, 1784999464 }, - { 2139824129, 480888214, 1513323027 }, - { 2139789313, 847937200, 858192859 }, - { 2139783169, 1642000434, 1583261448 }, - { 2139770881, 940699589, 179702100 }, - { 2139768833, 315623242, 964612676 }, - { 2139666433, 331649203, 764666914 }, - { 2139641857, 2118730799, 1313764644 }, - { 2139635713, 519149027, 519212449 }, - { 2139598849, 1526413634, 1769667104 }, - { 2139574273, 551148610, 820739925 }, - { 2139568129, 1386800242, 472447405 }, - { 2139549697, 813760130, 1412328531 }, - { 2139537409, 1615286260, 1609362979 }, - { 2139475969, 1352559299, 1696720421 }, - { 2139455489, 1048691649, 1584935400 }, - { 2139432961, 836025845, 950121150 }, - { 2139424769, 1558281165, 1635486858 }, - { 2139406337, 1728402143, 1674423301 }, - { 2139396097, 1727715782, 1483470544 }, - { 2139383809, 1092853491, 1741699084 }, - { 2139369473, 690776899, 1242798709 }, - { 2139351041, 1768782380, 2120712049 }, - { 2139334657, 1739968247, 1427249225 }, - { 2139332609, 1547189119, 623011170 }, - { 2139310081, 1346827917, 1605466350 }, - { 2139303937, 369317948, 828392831 }, - { 2139301889, 1560417239, 1788073219 }, - { 2139283457, 1303121623, 595079358 }, - { 2139248641, 1354555286, 573424177 }, - { 2139240449, 60974056, 885781403 }, - { 2139222017, 355573421, 1221054839 }, - { 2139215873, 566477826, 1724006500 }, - { 2139150337, 871437673, 1609133294 }, - { 2139144193, 1478130914, 1137491905 }, - { 2139117569, 1854880922, 964728507 }, - { 2139076609, 202405335, 756508944 }, - { 2139062273, 1399715741, 884826059 }, - { 2139045889, 1051045798, 1202295476 }, - { 2139033601, 1707715206, 632234634 }, - { 2139006977, 2035853139, 231626690 }, - { 2138951681, 183867876, 838350879 }, - { 2138945537, 1403254661, 404460202 
}, - { 2138920961, 310865011, 1282911681 }, - { 2138910721, 1328496553, 103472415 }, - { 2138904577, 78831681, 993513549 }, - { 2138902529, 1319697451, 1055904361 }, - { 2138816513, 384338872, 1706202469 }, - { 2138810369, 1084868275, 405677177 }, - { 2138787841, 401181788, 1964773901 }, - { 2138775553, 1850532988, 1247087473 }, - { 2138767361, 874261901, 1576073565 }, - { 2138757121, 1187474742, 993541415 }, - { 2138748929, 1782458888, 1043206483 }, - { 2138744833, 1221500487, 800141243 }, - { 2138738689, 413465368, 1450660558 }, - { 2138695681, 739045140, 342611472 }, - { 2138658817, 1355845756, 672674190 }, - { 2138644481, 608379162, 1538874380 }, - { 2138632193, 1444914034, 686911254 }, - { 2138607617, 484707818, 1435142134 }, - { 2138591233, 539460669, 1290458549 }, - { 2138572801, 2093538990, 2011138646 }, - { 2138552321, 1149786988, 1076414907 }, - { 2138546177, 840688206, 2108985273 }, - { 2138533889, 209669619, 198172413 }, - { 2138523649, 1975879426, 1277003968 }, - { 2138490881, 1351891144, 1976858109 }, - { 2138460161, 1817321013, 1979278293 }, - { 2138429441, 1950077177, 203441928 }, - { 2138400769, 908970113, 628395069 }, - { 2138398721, 219890864, 758486760 }, - { 2138376193, 1306654379, 977554090 }, - { 2138351617, 298822498, 2004708503 }, - { 2138337281, 441457816, 1049002108 }, - { 2138320897, 1517731724, 1442269609 }, - { 2138290177, 1355911197, 1647139103 }, - { 2138234881, 531313247, 1746591962 }, - { 2138214401, 1899410930, 781416444 }, - { 2138202113, 1813477173, 1622508515 }, - { 2138191873, 1086458299, 1025408615 }, - { 2138183681, 1998800427, 827063290 }, - { 2138173441, 1921308898, 749670117 }, - { 2138103809, 1620902804, 2126787647 }, - { 2138099713, 828647069, 1892961817 }, - { 2138085377, 179405355, 1525506535 }, - { 2138060801, 615683235, 1259580138 }, - { 2138044417, 2030277840, 1731266562 }, - { 2138042369, 2087222316, 1627902259 }, - { 2138032129, 126388712, 1108640984 }, - { 2138011649, 715026550, 1017980050 }, - { 2137993217, 
1693714349, 1351778704 }, - { 2137888769, 1289762259, 1053090405 }, - { 2137853953, 199991890, 1254192789 }, - { 2137833473, 941421685, 896995556 }, - { 2137817089, 750416446, 1251031181 }, - { 2137792513, 798075119, 368077456 }, - { 2137786369, 878543495, 1035375025 }, - { 2137767937, 9351178, 1156563902 }, - { 2137755649, 1382297614, 1686559583 }, - { 2137724929, 1345472850, 1681096331 }, - { 2137704449, 834666929, 630551727 }, - { 2137673729, 1646165729, 1892091571 }, - { 2137620481, 778943821, 48456461 }, - { 2137618433, 1730837875, 1713336725 }, - { 2137581569, 805610339, 1378891359 }, - { 2137538561, 204342388, 1950165220 }, - { 2137526273, 1947629754, 1500789441 }, - { 2137516033, 719902645, 1499525372 }, - { 2137491457, 230451261, 556382829 }, - { 2137440257, 979573541, 412760291 }, - { 2137374721, 927841248, 1954137185 }, - { 2137362433, 1243778559, 861024672 }, - { 2137313281, 1341338501, 980638386 }, - { 2137311233, 937415182, 1793212117 }, - { 2137255937, 795331324, 1410253405 }, - { 2137243649, 150756339, 1966999887 }, - { 2137182209, 163346914, 1939301431 }, - { 2137171969, 1952552395, 758913141 }, - { 2137159681, 570788721, 218668666 }, - { 2137147393, 1896656810, 2045670345 }, - { 2137141249, 358493842, 518199643 }, - { 2137139201, 1505023029, 674695848 }, - { 2137133057, 27911103, 830956306 }, - { 2137122817, 439771337, 1555268614 }, - { 2137116673, 790988579, 1871449599 }, - { 2137110529, 432109234, 811805080 }, - { 2137102337, 1357900653, 1184997641 }, - { 2137098241, 515119035, 1715693095 }, - { 2137090049, 408575203, 2085660657 }, - { 2137085953, 2097793407, 1349626963 }, - { 2137055233, 1556739954, 1449960883 }, - { 2137030657, 1545758650, 1369303716 }, - { 2136987649, 332602570, 103875114 }, - { 2136969217, 1499989506, 1662964115 }, - { 2136924161, 857040753, 4738842 }, - { 2136895489, 1948872712, 570436091 }, - { 2136893441, 58969960, 1568349634 }, - { 2136887297, 2127193379, 273612548 }, - { 2136850433, 111208983, 1181257116 }, - { 
2136809473, 1627275942, 1680317971 }, - { 2136764417, 1574888217, 14011331 }, - { 2136741889, 14011055, 1129154251 }, - { 2136727553, 35862563, 1838555253 }, - { 2136721409, 310235666, 1363928244 }, - { 2136698881, 1612429202, 1560383828 }, - { 2136649729, 1138540131, 800014364 }, - { 2136606721, 602323503, 1433096652 }, - { 2136563713, 182209265, 1919611038 }, - { 2136555521, 324156477, 165591039 }, - { 2136549377, 195513113, 217165345 }, - { 2136526849, 1050768046, 939647887 }, - { 2136508417, 1886286237, 1619926572 }, - { 2136477697, 609647664, 35065157 }, - { 2136471553, 679352216, 1452259468 }, - { 2136457217, 128630031, 824816521 }, - { 2136422401, 19787464, 1526049830 }, - { 2136420353, 698316836, 1530623527 }, - { 2136371201, 1651862373, 1804812805 }, - { 2136334337, 326596005, 336977082 }, - { 2136322049, 63253370, 1904972151 }, - { 2136297473, 312176076, 172182411 }, - { 2136248321, 381261841, 369032670 }, - { 2136242177, 358688773, 1640007994 }, - { 2136229889, 512677188, 75585225 }, - { 2136219649, 2095003250, 1970086149 }, - { 2136207361, 1909650722, 537760675 }, - { 2136176641, 1334616195, 1533487619 }, - { 2136158209, 2096285632, 1793285210 }, - { 2136143873, 1897347517, 293843959 }, - { 2136133633, 923586222, 1022655978 }, - { 2136096769, 1464868191, 1515074410 }, - { 2136094721, 2020679520, 2061636104 }, - { 2136076289, 290798503, 1814726809 }, - { 2136041473, 156415894, 1250757633 }, - { 2135996417, 297459940, 1132158924 }, - { 2135955457, 538755304, 1688831340 }, - { 0, 0, 0 } -}; - -/* - * Reduce a small signed integer modulo a small prime. The source - * value x MUST be such that -p < x < p. - */ -static inline uint32_t -modp_set(int32_t x, uint32_t p) -{ - uint32_t w; - - w = (uint32_t)x; - w += p & -(w >> 31); - return w; -} - -/* - * Normalize a modular integer around 0. - */ -static inline int32_t -modp_norm(uint32_t x, uint32_t p) -{ - return (int32_t)(x - (p & (((x - ((p + 1) >> 1)) >> 31) - 1))); -} - -/* - * Compute -1/p mod 2^31. 
/*
 * Compute -1/p mod 2^31. This works for all odd integers p that fit
 * on 31 bits.
 */
static uint32_t
modp_ninv31(uint32_t p)
{
	uint32_t v;

	/*
	 * Hensel lifting of the 2-adic inverse: each multiplication
	 * doubles the number of correct low bits. For odd p, 2 - p is
	 * an inverse to 2 bits; four steps reach more than 31 bits.
	 */
	v = 2 - p;
	v *= 2 - p * v;
	v *= 2 - p * v;
	v *= 2 - p * v;
	v *= 2 - p * v;
	return (uint32_t)0x7FFFFFFF & -v;
}

/*
 * Compute R = 2^31 mod p.
 */
static inline uint32_t
modp_R(uint32_t p)
{
	/*
	 * Since 2^30 < p < 2^31, the value 2^31 mod p is simply
	 * 2^31 - p.
	 */
	return ((uint32_t)1 << 31) - p;
}

/*
 * Addition modulo p. Operands must already be reduced (0..p-1).
 */
static inline uint32_t
modp_add(uint32_t a, uint32_t b, uint32_t p)
{
	uint32_t r;

	r = a + b - p;
	/* If the subtraction went below zero, bit 31 is set; add p back. */
	r += p & -(r >> 31);
	return r;
}

/*
 * Subtraction modulo p. Operands must already be reduced (0..p-1).
 */
static inline uint32_t
modp_sub(uint32_t a, uint32_t b, uint32_t p)
{
	uint32_t r;

	r = a - b;
	r += p & -(r >> 31);
	return r;
}

/*
 * Halving modulo p.
 */
/* unused
static inline uint32_t
modp_half(uint32_t a, uint32_t p)
{
	a += p & -(a & 1);
	return a >> 1;
}
*/

/*
 * Montgomery multiplication modulo p: returns a*b/(2^31) mod p.
 * The 'p0i' value is -1/p mod 2^31. It is required that p is an
 * odd integer.
 */
static inline uint32_t
modp_montymul(uint32_t a, uint32_t b, uint32_t p, uint32_t p0i)
{
	uint64_t t, u;
	uint32_t r;

	t = (uint64_t)a * (uint64_t)b;
	/* Add the multiple of p that clears the low 31 bits of t. */
	u = ((t * p0i) & (uint64_t)0x7FFFFFFF) * p;
	r = (uint32_t)((t + u) >> 31) - p;
	r += p & -(r >> 31);
	return r;
}

/*
 * Compute R2 = 2^62 mod p.
 */
static uint32_t
modp_R2(uint32_t p, uint32_t p0i)
{
	uint32_t z;

	/*
	 * Start from 2^31 mod p (the Montgomery representation of 1)
	 * and double it, yielding 2^32 mod p.
	 */
	z = modp_R(p);
	z = modp_add(z, z, p);

	/*
	 * Each Montgomery squaring maps 2^e to 2^(2e-31); five of them
	 * take the exponent 32 to 33, 35, 39, 47 and finally 63.
	 */
	z = modp_montymul(z, z, p, p0i);
	z = modp_montymul(z, z, p, p0i);
	z = modp_montymul(z, z, p, p0i);
	z = modp_montymul(z, z, p, p0i);
	z = modp_montymul(z, z, p, p0i);

	/*
	 * Halve the value mod p to get 2^62.
	 */
	z = (z + (p & -(z & 1))) >> 1;
	return z;
}

/*
 * Compute 2^(31*x) modulo p. This works for integers x up to 2^11.
 * p must be prime such that 2^30 < p < 2^31; p0i must be equal to
 * -1/p mod 2^31; R2 must be equal to 2^62 mod p.
 */
static inline uint32_t
modp_Rx(unsigned x, uint32_t p, uint32_t p0i, uint32_t R2)
{
	int i;
	uint32_t g, z;

	/*
	 * 2^(31*x) = (2^31)*(2^(31*(x-1))); i.e. we want the Montgomery
	 * representation of (2^31)^e mod p, where e = x-1.
	 * R2 is 2^31 in Montgomery representation; raise it to the
	 * power e with a standard square-and-multiply.
	 */
	x --;
	g = R2;
	z = modp_R(p);
	for (i = 0; (1U << i) <= x; i ++) {
		if ((x & (1U << i)) != 0) {
			z = modp_montymul(z, g, p, p0i);
		}
		g = modp_montymul(g, g, p, p0i);
	}
	return z;
}

/*
 * Division modulo p. If the divisor (b) is 0, then 0 is returned.
 * This function computes proper results only when p is prime.
 * Parameters:
 *   a     dividend
 *   b     divisor
 *   p     odd prime modulus
 *   p0i   -1/p mod 2^31
 *   R     2^31 mod p
 */
static uint32_t
modp_div(uint32_t a, uint32_t b, uint32_t p, uint32_t p0i, uint32_t R)
{
	uint32_t z, e;
	int i;

	/*
	 * Compute b^(p-2) mod p (Fermat's little theorem), with a
	 * constant-time ladder: the conditional multiply is applied
	 * through a mask so that the operation sequence does not
	 * depend on the exponent bits.
	 */
	e = p - 2;
	z = R;
	for (i = 30; i >= 0; i --) {
		uint32_t z2;

		z = modp_montymul(z, z, p, p0i);
		z2 = modp_montymul(z, b, p, p0i);
		z ^= (z ^ z2) & -(uint32_t)((e >> i) & 1);
	}

	/*
	 * The loop above just assumed that b was in Montgomery
	 * representation, i.e. really contained b*R; under that
	 * assumption, it returns 1/b in Montgomery representation,
	 * which is R/b. But we gave it b in normal representation,
	 * so the loop really returned R/(b/R) = R^2/b.
	 *
	 * We want a/b, so we need one Montgomery multiplication with a,
	 * which also removes one of the R factors, and another such
	 * multiplication to remove the second R factor.
	 */
	z = modp_montymul(z, 1, p, p0i);
	return modp_montymul(a, z, p, p0i);
}
/*
 * Bit-reversal index table: REV10[i] is the 10-bit value obtained by
 * reversing the order of the bits of i (0 <= i < 1024).
 */
static const uint16_t REV10[] = {
	   0, 512, 256, 768, 128, 640, 384, 896,  64, 576, 320, 832,
	 192, 704, 448, 960,  32, 544, 288, 800, 160, 672, 416, 928,
	  96, 608, 352, 864, 224, 736, 480, 992,  16, 528, 272, 784,
	 144, 656, 400, 912,  80, 592, 336, 848, 208, 720, 464, 976,
	  48, 560, 304, 816, 176, 688, 432, 944, 112, 624, 368, 880,
	 240, 752, 496, 1008,  8, 520, 264, 776, 136, 648, 392, 904,
	  72, 584, 328, 840, 200, 712, 456, 968,  40, 552, 296, 808,
	 168, 680, 424, 936, 104, 616, 360, 872, 232, 744, 488, 1000,
	  24, 536, 280, 792, 152, 664, 408, 920,  88, 600, 344, 856,
	 216, 728, 472, 984,  56, 568, 312, 824, 184, 696, 440, 952,
	 120, 632, 376, 888, 248, 760, 504, 1016,  4, 516, 260, 772,
	 132, 644, 388, 900,  68, 580, 324, 836, 196, 708, 452, 964,
	  36, 548, 292, 804, 164, 676, 420, 932, 100, 612, 356, 868,
	 228, 740, 484, 996,  20, 532, 276, 788, 148, 660, 404, 916,
	  84, 596, 340, 852, 212, 724, 468, 980,  52, 564, 308, 820,
	 180, 692, 436, 948, 116, 628, 372, 884, 244, 756, 500, 1012,
	  12, 524, 268, 780, 140, 652, 396, 908,  76, 588, 332, 844,
	 204, 716, 460, 972,  44, 556, 300, 812, 172, 684, 428, 940,
	 108, 620, 364, 876, 236, 748, 492, 1004,  28, 540, 284, 796,
	 156, 668, 412, 924,  92, 604, 348, 860, 220, 732, 476, 988,
	  60, 572, 316, 828, 188, 700, 444, 956, 124, 636, 380, 892,
	 252, 764, 508, 1020,  2, 514, 258, 770, 130, 642, 386, 898,
	  66, 578, 322, 834, 194, 706, 450, 962,  34, 546, 290, 802,
	 162, 674, 418, 930,  98, 610, 354, 866, 226, 738, 482, 994,
	  18, 530, 274, 786, 146, 658, 402, 914,  82, 594, 338, 850,
	 210, 722, 466, 978,  50, 562, 306, 818, 178, 690, 434, 946,
	 114, 626, 370, 882, 242, 754, 498, 1010,  10, 522, 266, 778,
	 138, 650, 394, 906,  74, 586, 330, 842, 202, 714, 458, 970,
	  42, 554, 298, 810, 170, 682, 426, 938, 106, 618, 362, 874,
	 234, 746, 490, 1002,  26, 538, 282, 794, 154, 666, 410, 922,
	  90, 602, 346, 858, 218, 730, 474, 986,  58, 570, 314, 826,
	 186, 698, 442, 954, 122, 634, 378, 890, 250, 762, 506, 1018,
	   6, 518, 262, 774, 134, 646, 390, 902,  70, 582, 326, 838,
	 198, 710, 454, 966,  38, 550, 294, 806, 166, 678, 422, 934,
	 102, 614, 358, 870, 230, 742, 486, 998,  22, 534, 278, 790,
	 150, 662, 406, 918,  86, 598, 342, 854, 214, 726, 470, 982,
	  54, 566, 310, 822, 182, 694, 438, 950, 118, 630, 374, 886,
	 246, 758, 502, 1014, 14, 526, 270, 782, 142, 654, 398, 910,
	  78, 590, 334, 846, 206, 718, 462, 974,  46, 558, 302, 814,
	 174, 686, 430, 942, 110, 622, 366, 878, 238, 750, 494, 1006,
	  30, 542, 286, 798, 158, 670, 414, 926,  94, 606, 350, 862,
	 222, 734, 478, 990,  62, 574, 318, 830, 190, 702, 446, 958,
	 126, 638, 382, 894, 254, 766, 510, 1022,  1, 513, 257, 769,
	 129, 641, 385, 897,  65, 577, 321, 833, 193, 705, 449, 961,
	  33, 545, 289, 801, 161, 673, 417, 929,  97, 609, 353, 865,
	 225, 737, 481, 993,  17, 529, 273, 785, 145, 657, 401, 913,
	  81, 593, 337, 849, 209, 721, 465, 977,  49, 561, 305, 817,
	 177, 689, 433, 945, 113, 625, 369, 881, 241, 753, 497, 1009,
	   9, 521, 265, 777, 137, 649, 393, 905,  73, 585, 329, 841,
	 201, 713, 457, 969,  41, 553, 297, 809, 169, 681, 425, 937,
	 105, 617, 361, 873, 233, 745, 489, 1001,  25, 537, 281, 793,
	 153, 665, 409, 921,  89, 601, 345, 857, 217, 729, 473, 985,
	  57, 569, 313, 825, 185, 697, 441, 953, 121, 633, 377, 889,
	 249, 761, 505, 1017,  5, 517, 261, 773, 133, 645, 389, 901,
	  69, 581, 325, 837, 197, 709, 453, 965,  37, 549, 293, 805,
	 165, 677, 421, 933, 101, 613, 357, 869, 229, 741, 485, 997,
	  21, 533, 277, 789, 149, 661, 405, 917,  85, 597, 341, 853,
	 213, 725, 469, 981,  53, 565, 309, 821, 181, 693, 437, 949,
	 117, 629, 373, 885, 245, 757, 501, 1013,  13, 525, 269, 781,
	 141, 653, 397, 909,  77, 589, 333, 845, 205, 717, 461, 973,
	  45, 557, 301, 813, 173, 685, 429, 941, 109, 621, 365, 877,
	 237, 749, 493, 1005,  29, 541, 285, 797, 157, 669, 413, 925,
	  93, 605, 349, 861, 221, 733, 477, 989,  61, 573, 317, 829,
	 189, 701, 445, 957, 125, 637, 381, 893, 253, 765, 509, 1021,
	   3, 515, 259, 771, 131, 643, 387, 899,  67, 579, 323, 835,
	 195, 707, 451, 963,  35, 547, 291, 803, 163, 675, 419, 931,
	  99, 611, 355, 867, 227, 739, 483, 995,  19, 531, 275, 787,
	 147, 659, 403, 915,  83, 595, 339, 851, 211, 723, 467, 979,
	  51, 563, 307, 819, 179, 691, 435, 947, 115, 627, 371, 883,
	 243, 755, 499, 1011, 11, 523, 267, 779, 139, 651, 395, 907,
	  75, 587, 331, 843, 203, 715, 459, 971,  43, 555, 299, 811,
	 171, 683, 427, 939, 107, 619, 363, 875, 235, 747, 491, 1003,
	  27, 539, 283, 795, 155, 667, 411, 923,  91, 603, 347, 859,
	 219, 731, 475, 987,  59, 571, 315, 827, 187, 699, 443, 955,
	 123, 635, 379, 891, 251, 763, 507, 1019,  7, 519, 263, 775,
	 135, 647, 391, 903,  71, 583, 327, 839, 199, 711, 455, 967,
	  39, 551, 295, 807, 167, 679, 423, 935, 103, 615, 359, 871,
	 231, 743, 487, 999,  23, 535, 279, 791, 151, 663, 407, 919,
	  87, 599, 343, 855, 215, 727, 471, 983,  55, 567, 311, 823,
	 183, 695, 439, 951, 119, 631, 375, 887, 247, 759, 503, 1015,
	  15, 527, 271, 783, 143, 655, 399, 911,  79, 591, 335, 847,
	 207, 719, 463, 975,  47, 559, 303, 815, 175, 687, 431, 943,
	 111, 623, 367, 879, 239, 751, 495, 1007,  31, 543, 287, 799,
	 159, 671, 415, 927,  95, 607, 351, 863, 223, 735, 479, 991,
	  63, 575, 319, 831, 191, 703, 447, 959, 127, 639, 383, 895,
	 255, 767, 511, 1023
};
- */ -static void -modp_mkgm2(uint32_t *restrict gm, uint32_t *restrict igm, unsigned logn, - uint32_t g, uint32_t p, uint32_t p0i) -{ - size_t u, n; - unsigned k; - uint32_t ig, x1, x2, R2; - - n = (size_t)1 << logn; - - /* - * We want g such that g^(2N) = 1 mod p, but the provided - * generator has order 2048. We must square it a few times. - */ - R2 = modp_R2(p, p0i); - g = modp_montymul(g, R2, p, p0i); - for (k = logn; k < 10; k ++) { - g = modp_montymul(g, g, p, p0i); - } - - ig = modp_div(R2, g, p, p0i, modp_R(p)); - k = 10 - logn; - x1 = x2 = modp_R(p); - for (u = 0; u < n; u ++) { - size_t v; - - v = REV10[u << k]; - gm[v] = x1; - igm[v] = x2; - x1 = modp_montymul(x1, g, p, p0i); - x2 = modp_montymul(x2, ig, p, p0i); - } -} - -/* - * Compute the NTT over a polynomial (binary case). Polynomial elements - * are a[0], a[stride], a[2 * stride]... - */ -static void -modp_NTT2_ext(uint32_t *a, size_t stride, const uint32_t *gm, unsigned logn, - uint32_t p, uint32_t p0i) -{ - size_t t, m, n; - - if (logn == 0) { - return; - } - n = (size_t)1 << logn; - t = n; - for (m = 1; m < n; m <<= 1) { - size_t ht, u, v1; - - ht = t >> 1; - for (u = 0, v1 = 0; u < m; u ++, v1 += t) { - uint32_t s; - size_t v; - uint32_t *r1, *r2; - - s = gm[m + u]; - r1 = a + v1 * stride; - r2 = r1 + ht * stride; - for (v = 0; v < ht; v ++, r1 += stride, r2 += stride) { - uint32_t x, y; - - x = *r1; - y = modp_montymul(*r2, s, p, p0i); - *r1 = modp_add(x, y, p); - *r2 = modp_sub(x, y, p); - } - } - t = ht; - } -} - -/* - * Compute the inverse NTT over a polynomial (binary case). 
/*
 * Compute the inverse NTT over a polynomial (binary case).
 */
static void
modp_iNTT2_ext(uint32_t *a, size_t stride, const uint32_t *igm, unsigned logn,
	uint32_t p, uint32_t p0i)
{
	size_t len, m, n, k;
	uint32_t scal;
	uint32_t *ptr;

	if (logn == 0) {
		return;
	}
	n = (size_t)1 << logn;
	len = 1;
	for (m = n; m > 1; m >>= 1) {
		size_t halfm, dlen, u, base;

		halfm = m >> 1;
		dlen = len << 1;
		for (u = 0, base = 0; u < halfm; u ++, base += dlen) {
			uint32_t w;
			size_t j;
			uint32_t *pa, *pb;

			w = igm[halfm + u];
			pa = a + base * stride;
			pb = pa + len * stride;
			for (j = 0; j < len;
				j ++, pa += stride, pb += stride)
			{
				uint32_t x, y;

				/* Gentleman-Sande butterfly. */
				x = *pa;
				y = *pb;
				*pa = modp_add(x, y, p);
				*pb = modp_montymul(
					modp_sub(x, y, p), w, p, p0i);
			}
		}
		len = dlen;
	}

	/*
	 * We need 1/n in Montgomery representation, i.e. R/n. Since
	 * 1 <= logn <= 10, R/n is an integer; moreover, R/n <= 2^30 < p,
	 * thus a simple shift will do.
	 */
	scal = (uint32_t)1 << (31 - logn);
	for (k = 0, ptr = a; k < n; k ++, ptr += stride) {
		*ptr = modp_montymul(*ptr, scal, p, p0i);
	}
}

/*
 * Simplified macros for NTT and iNTT (binary case) when the elements
 * are consecutive in RAM.
 */
#define modp_NTT2(a, gm, logn, p, p0i)   modp_NTT2_ext(a, 1, gm, logn, p, p0i)
#define modp_iNTT2(a, igm, logn, p, p0i) modp_iNTT2_ext(a, 1, igm, logn, p, p0i)
/*
 * Given polynomial f in NTT representation modulo p, compute f' of degree
 * less than N/2 such that f' = f0^2 - X*f1^2, where f0 and f1 are
 * polynomials of degree less than N/2 such that f = f0(X^2) + X*f1(X^2).
 *
 * The new polynomial is written "in place" over the first N/2 elements
 * of f.
 *
 * If applied logn times successively on a given polynomial, the resulting
 * degree-0 polynomial is the resultant of f and X^N+1 modulo p.
 *
 * This function applies only to the binary case; it is invoked from
 * solve_NTRU_binary_depth1().
 */
static void
modp_poly_rec_res(uint32_t *f, unsigned logn,
	uint32_t p, uint32_t p0i, uint32_t R2)
{
	size_t hn, u;

	hn = (size_t)1 << (logn - 1);
	for (u = 0; u < hn; u ++) {
		uint32_t a, b;

		a = f[(u << 1) + 0];
		b = f[(u << 1) + 1];
		/*
		 * montymul(a, b) yields a*b/R; the extra Montgomery
		 * multiplication by R2 = R^2 cancels that 1/R factor,
		 * so f[u] receives a*b mod p.
		 */
		f[u] = modp_montymul(modp_montymul(a, b, p, p0i), R2, p, p0i);
	}
}

/* ==================================================================== */
/*
 * Custom bignum implementation.
 *
 * This is a very reduced set of functionalities. We need to do the
 * following operations:
 *
 *  - Rebuild the resultant and the polynomial coefficients from their
 *    values modulo small primes (of length 31 bits each).
 *
 *  - Compute an extended GCD between the two computed resultants.
 *
 *  - Extract top bits and add scaled values during the successive steps
 *    of Babai rounding.
 *
 * When rebuilding values using CRT, we must also recompute the product
 * of the small prime factors. We always do it one small factor at a
 * time, so the "complicated" operations can be done modulo the small
 * prime with the modp_* functions. CRT coefficients (inverses) are
 * precomputed.
 *
 * All values are positive until the last step: when the polynomial
 * coefficients have been rebuilt, we normalize them around 0. But then,
 * only additions and subtractions on the upper few bits are needed
 * afterwards.
 *
 * We keep big integers as arrays of 31-bit words (in uint32_t values);
 * the top bit of each uint32_t is kept equal to 0. Using 31-bit words
 * makes it easier to keep track of carries. When negative values are
 * used, two's complement is used.
 */
/*
 * Subtract integer b from integer a. Both integers are supposed to have
 * the same size. The carry (0 or 1) is returned. Source arrays a and b
 * MUST be distinct.
 *
 * The operation is performed as described above if ctl = 1. If
 * ctl = 0, the value a[] is unmodified, but all memory accesses are
 * still performed, and the carry is computed and returned.
 */
static uint32_t
zint_sub(uint32_t *restrict a, const uint32_t *restrict b, size_t len,
	uint32_t ctl)
{
	size_t u;
	uint32_t carry, mask;

	carry = 0;
	mask = -ctl;	/* all-ones when ctl = 1, zero when ctl = 0 */
	for (u = 0; u < len; u ++) {
		uint32_t wa, d;

		wa = a[u];
		d = wa - b[u] - carry;
		carry = d >> 31;
		/* Keep the old word when mask = 0, the difference otherwise. */
		wa ^= ((d & 0x7FFFFFFF) ^ wa) & mask;
		a[u] = wa;
	}
	return carry;
}

/*
 * Multiply the provided big integer m with a small value x.
 * This function assumes that x < 2^31. The carry word is returned.
 */
static uint32_t
zint_mul_small(uint32_t *m, size_t mlen, uint32_t x)
{
	size_t u;
	uint32_t carry;

	carry = 0;
	for (u = 0; u < mlen; u ++) {
		uint64_t t;

		t = (uint64_t)m[u] * (uint64_t)x + carry;
		m[u] = (uint32_t)t & 0x7FFFFFFF;
		carry = (uint32_t)(t >> 31);
	}
	return carry;
}

/*
 * Reduce a big integer d modulo a small integer p.
 * Rules:
 *  d is unsigned
 *  p is prime
 *  2^30 < p < 2^31
 *  p0i = -(1/p) mod 2^31
 *  R2 = 2^62 mod p
 */
static uint32_t
zint_mod_small_unsigned(const uint32_t *d, size_t dlen,
	uint32_t p, uint32_t p0i, uint32_t R2)
{
	uint32_t acc;
	size_t u;

	/*
	 * Algorithm: we inject words one by one, starting with the high
	 * word. Each step is:
	 *  - multiply acc by 2^31 (Montgomery multiplication by R2)
	 *  - add the new word
	 */
	acc = 0;
	u = dlen;
	while (u -- > 0) {
		uint32_t w;

		acc = modp_montymul(acc, R2, p, p0i);
		w = d[u] - p;
		w += p & -(w >> 31);	/* reduce the word into 0..p-1 */
		acc = modp_add(acc, w, p);
	}
	return acc;
}

/*
 * Similar to zint_mod_small_unsigned(), except that d may be signed.
 * Extra parameter is Rx = 2^(31*dlen) mod p.
 */
static uint32_t
zint_mod_small_signed(const uint32_t *d, size_t dlen,
	uint32_t p, uint32_t p0i, uint32_t R2, uint32_t Rx)
{
	uint32_t v;

	if (dlen == 0) {
		return 0;
	}
	v = zint_mod_small_unsigned(d, dlen, p, p0i, R2);
	/*
	 * If d is negative (top bit of its sign word set), then the
	 * unsigned interpretation was d + 2^(31*dlen); subtract
	 * Rx = 2^(31*dlen) mod p to compensate.
	 */
	v = modp_sub(v, Rx & -(d[dlen - 1] >> 30), p);
	return v;
}
/*
 * Add y*s to x. x and y initially have length 'len' words; the new x
 * has length 'len+1' words. 's' must fit on 31 bits. x[] and y[] must
 * not overlap.
 */
static void
zint_add_mul_small(uint32_t *restrict x,
	const uint32_t *restrict y, size_t len, uint32_t s)
{
	size_t u;
	uint32_t carry;

	carry = 0;
	for (u = 0; u < len; u ++) {
		uint32_t wx, wy;
		uint64_t t;

		wx = x[u];
		wy = y[u];
		t = (uint64_t)wy * (uint64_t)s + (uint64_t)wx
			+ (uint64_t)carry;
		x[u] = (uint32_t)t & 0x7FFFFFFF;
		carry = (uint32_t)(t >> 31);
	}
	x[len] = carry;
}

/*
 * Normalize a modular integer around 0: if x > p/2, then x is replaced
 * with x - p (signed encoding with two's complement); otherwise, x is
 * untouched. The two integers x and p are encoded over the same length.
 */
static void
zint_norm_zero(uint32_t *restrict x, const uint32_t *restrict p, size_t len)
{
	size_t u;
	uint32_t cmp, pbit;

	/*
	 * Compare x with p/2. We use the shifted version of p, and p
	 * is odd, so we really compare with (p-1)/2; we want to perform
	 * the subtraction if and only if x > (p-1)/2.
	 */
	cmp = 0;
	pbit = 0;
	u = len;
	while (u -- > 0) {
		uint32_t wx, wp, c;

		/*
		 * Get the two words to compare in wx and wp (both over
		 * 31 bits exactly); wp is a word of (p-1)/2, built from
		 * p shifted right by one bit across words.
		 */
		wx = x[u];
		wp = (p[u] >> 1) | (pbit << 30);
		pbit = p[u] & 1;

		/*
		 * Set c to -1, 0 or 1, depending on whether wp is
		 * lower than, equal to, or greater than wx.
		 */
		c = wp - wx;
		c = ((-c) >> 31) | -(c >> 31);

		/*
		 * The first (most significant) non-equal pair of words
		 * decides the comparison: keep cmp once it is non-zero.
		 */
		cmp |= c & ((cmp & 1) - 1);
	}

	/*
	 * At this point, cmp = -1, 0 or 1, depending on whether (p-1)/2
	 * is lower than, equal to, or greater than x. We thus want to
	 * do the subtraction only if cmp = -1.
	 */
	zint_sub(x, p, len, cmp >> 31);
}
'xx' points at that - * first word of the first integer; subsequent integers are accessed - * by adding 'xstride' repeatedly. - * - * The words of an integer are the RNS representation of that integer, - * using the provided 'primes' are moduli. This function replaces - * each integer with its multi-word value (little-endian order). - * - * If "normalize_signed" is non-zero, then the returned value is - * normalized to the -m/2..m/2 interval (where m is the product of all - * small prime moduli); two's complement is used for negative values. - */ -static void -zint_rebuild_CRT(uint32_t *restrict xx, size_t xlen, size_t xstride, - size_t num, const small_prime *primes, int normalize_signed, - uint32_t *restrict tmp) -{ - size_t u; - uint32_t *x; - - tmp[0] = primes[0].p; - for (u = 1; u < xlen; u ++) { - /* - * At the entry of each loop iteration: - * - the first u words of each array have been - * reassembled; - * - the first u words of tmp[] contains the - * product of the prime moduli processed so far. - * - * We call 'q' the product of all previous primes. - */ - uint32_t p, p0i, s, R2; - size_t v; - - p = primes[u].p; - s = primes[u].s; - p0i = modp_ninv31(p); - R2 = modp_R2(p, p0i); - - for (v = 0, x = xx; v < num; v ++, x += xstride) { - uint32_t xp, xq, xr; - /* - * xp = the integer x modulo the prime p for this - * iteration - * xq = (x mod q) mod p - */ - xp = x[u]; - xq = zint_mod_small_unsigned(x, u, p, p0i, R2); - - /* - * New value is (x mod q) + q * (s * (xp - xq) mod p) - */ - xr = modp_montymul(s, modp_sub(xp, xq, p), p, p0i); - zint_add_mul_small(x, tmp, u, xr); - } - - /* - * Update product of primes in tmp[]. - */ - tmp[u] = zint_mul_small(tmp, u, p); - } - - /* - * Normalize the reconstructed values around 0. - */ - if (normalize_signed) { - for (u = 0, x = xx; u < num; u ++, x += xstride) { - zint_norm_zero(x, tmp, xlen); - } - } -} - -/* - * Negate a big integer conditionally: value a is replaced with -a if - * and only if ctl = 1. 
/*
 * Negate a big integer conditionally: value a is replaced with -a if
 * and only if ctl = 1. Control value ctl must be 0 or 1.
 */
static void
zint_negate(uint32_t *a, size_t len, uint32_t ctl)
{
	size_t u;
	uint32_t carry, mask;

	/*
	 * Two's complement: when ctl = 1, flip the 31 payload bits of
	 * every word (XOR with 0x7FFFFFFF) and add 1 to the whole value.
	 * When ctl = 0, XOR with 0 and add 0, leaving a unchanged.
	 */
	carry = ctl;
	mask = -ctl >> 1;	/* 0x7FFFFFFF or 0 */
	for (u = 0; u < len; u ++) {
		uint32_t w;

		w = a[u];
		w = (w ^ mask) + carry;
		a[u] = w & 0x7FFFFFFF;
		carry = w >> 31;
	}
}

/*
 * Replace a with (a*xa+b*xb)/(2^31) and b with (a*ya+b*yb)/(2^31).
 * The low bits are dropped (the caller should compute the coefficients
 * such that these dropped bits are all zeros). If either or both
 * yields a negative value, then the value is negated.
 *
 * Returned value is:
 *  0  both values were positive
 *  1  new a had to be negated
 *  2  new b had to be negated
 *  3  both new a and new b had to be negated
 *
 * Coefficients xa, xb, ya and yb may use the full signed 32-bit range.
 */
static uint32_t
zint_co_reduce(uint32_t *a, uint32_t *b, size_t len,
	int64_t xa, int64_t xb, int64_t ya, int64_t yb)
{
	size_t i;
	int64_t cca, ccb;
	uint32_t nega, negb;

	cca = 0;
	ccb = 0;
	for (i = 0; i < len; i ++) {
		uint32_t wa, wb;
		uint64_t za, zb;

		/*
		 * Accumulate the linear combinations one word at a
		 * time; the shift by one word position implements the
		 * division by 2^31.
		 */
		wa = a[i];
		wb = b[i];
		za = wa * (uint64_t)xa + wb * (uint64_t)xb + (uint64_t)cca;
		zb = wa * (uint64_t)ya + wb * (uint64_t)yb + (uint64_t)ccb;
		if (i > 0) {
			a[i - 1] = (uint32_t)za & 0x7FFFFFFF;
			b[i - 1] = (uint32_t)zb & 0x7FFFFFFF;
		}
		/* Signed (arithmetic) shift to propagate the carry. */
		cca = *(int64_t *)&za >> 31;
		ccb = *(int64_t *)&zb >> 31;
	}
	a[len - 1] = (uint32_t)cca;
	b[len - 1] = (uint32_t)ccb;

	/* Negate any result whose final carry came out negative. */
	nega = (uint32_t)((uint64_t)cca >> 63);
	negb = (uint32_t)((uint64_t)ccb >> 63);
	zint_negate(a, len, nega);
	zint_negate(b, len, negb);
	return nega | (negb << 1);
}
Rules on input parameters: - * - * if neg = 1, then -m <= a < 0 - * if neg = 0, then 0 <= a < 2*m - * - * If neg = 0, then the top word of a[] is allowed to use 32 bits. - * - * Modulus m must be odd. - */ -static void -zint_finish_mod(uint32_t *a, size_t len, const uint32_t *m, uint32_t neg) -{ - size_t u; - uint32_t cc, xm, ym; - - /* - * First pass: compare a (assumed nonnegative) with m. Note that - * if the top word uses 32 bits, subtracting m must yield a - * value less than 2^31 since a < 2*m. - */ - cc = 0; - for (u = 0; u < len; u ++) { - cc = (a[u] - m[u] - cc) >> 31; - } - - /* - * If neg = 1 then we must add m (regardless of cc) - * If neg = 0 and cc = 0 then we must subtract m - * If neg = 0 and cc = 1 then we must do nothing - * - * In the loop below, we conditionally subtract either m or -m - * from a. Word xm is a word of m (if neg = 0) or -m (if neg = 1); - * but if neg = 0 and cc = 1, then ym = 0 and it forces mw to 0. - */ - xm = -neg >> 1; - ym = -(neg | (1 - cc)); - cc = neg; - for (u = 0; u < len; u ++) { - uint32_t aw, mw; - - aw = a[u]; - mw = (m[u] ^ xm) & ym; - aw = aw - mw - cc; - a[u] = aw & 0x7FFFFFFF; - cc = aw >> 31; - } -} - -/* - * Replace a with (a*xa+b*xb)/(2^31) mod m, and b with - * (a*ya+b*yb)/(2^31) mod m. Modulus m must be odd; m0i = -1/m[0] mod 2^31. - */ -static void -zint_co_reduce_mod(uint32_t *a, uint32_t *b, const uint32_t *m, size_t len, - uint32_t m0i, int64_t xa, int64_t xb, int64_t ya, int64_t yb) -{ - size_t u; - int64_t cca, ccb; - uint32_t fa, fb; - - /* - * These are actually four combined Montgomery multiplications. 
- */ - cca = 0; - ccb = 0; - fa = ((a[0] * (uint32_t)xa + b[0] * (uint32_t)xb) * m0i) & 0x7FFFFFFF; - fb = ((a[0] * (uint32_t)ya + b[0] * (uint32_t)yb) * m0i) & 0x7FFFFFFF; - for (u = 0; u < len; u ++) { - uint32_t wa, wb; - uint64_t za, zb; - - wa = a[u]; - wb = b[u]; - za = wa * (uint64_t)xa + wb * (uint64_t)xb - + m[u] * (uint64_t)fa + (uint64_t)cca; - zb = wa * (uint64_t)ya + wb * (uint64_t)yb - + m[u] * (uint64_t)fb + (uint64_t)ccb; - if (u > 0) { - a[u - 1] = (uint32_t)za & 0x7FFFFFFF; - b[u - 1] = (uint32_t)zb & 0x7FFFFFFF; - } - cca = *(int64_t *)&za >> 31; - ccb = *(int64_t *)&zb >> 31; - } - a[len - 1] = (uint32_t)cca; - b[len - 1] = (uint32_t)ccb; - - /* - * At this point: - * -m <= a < 2*m - * -m <= b < 2*m - * (this is a case of Montgomery reduction) - * The top words of 'a' and 'b' may have a 32-th bit set. - * We want to add or subtract the modulus, as required. - */ - zint_finish_mod(a, len, m, (uint32_t)((uint64_t)cca >> 63)); - zint_finish_mod(b, len, m, (uint32_t)((uint64_t)ccb >> 63)); -} - -/* - * Compute a GCD between two positive big integers x and y. The two - * integers must be odd. Returned value is 1 if the GCD is 1, 0 - * otherwise. When 1 is returned, arrays u and v are filled with values - * such that: - * 0 <= u <= y - * 0 <= v <= x - * x*u - y*v = 1 - * x[] and y[] are unmodified. Both input values must have the same - * encoded length. Temporary array must be large enough to accommodate 4 - * extra values of that length. Arrays u, v and tmp may not overlap with - * each other, or with either x or y. - */ -static int -zint_bezout(uint32_t *restrict u, uint32_t *restrict v, - const uint32_t *restrict x, const uint32_t *restrict y, - size_t len, uint32_t *restrict tmp) -{ - /* - * Algorithm is an extended binary GCD. 
We maintain 6 values - * a, b, u0, u1, v0 and v1 with the following invariants: - * - * a = x*u0 - y*v0 - * b = x*u1 - y*v1 - * 0 <= a <= x - * 0 <= b <= y - * 0 <= u0 < y - * 0 <= v0 < x - * 0 <= u1 <= y - * 0 <= v1 < x - * - * Initial values are: - * - * a = x u0 = 1 v0 = 0 - * b = y u1 = y v1 = x-1 - * - * Each iteration reduces either a or b, and maintains the - * invariants. Algorithm stops when a = b, at which point their - * common value is GCD(a,b) and (u0,v0) (or (u1,v1)) contains - * the values (u,v) we want to return. - * - * The formal definition of the algorithm is a sequence of steps: - * - * - If a is even, then: - * a <- a/2 - * u0 <- u0/2 mod y - * v0 <- v0/2 mod x - * - * - Otherwise, if b is even, then: - * b <- b/2 - * u1 <- u1/2 mod y - * v1 <- v1/2 mod x - * - * - Otherwise, if a > b, then: - * a <- (a-b)/2 - * u0 <- (u0-u1)/2 mod y - * v0 <- (v0-v1)/2 mod x - * - * - Otherwise: - * b <- (b-a)/2 - * u1 <- (u1-u0)/2 mod y - * v1 <- (v1-v0)/2 mod y - * - * We can show that the operations above preserve the invariants: - * - * - If a is even, then u0 and v0 are either both even or both - * odd (since a = x*u0 - y*v0, and x and y are both odd). - * If u0 and v0 are both even, then (u0,v0) <- (u0/2,v0/2). - * Otherwise, (u0,v0) <- ((u0+y)/2,(v0+x)/2). Either way, - * the a = x*u0 - y*v0 invariant is preserved. - * - * - The same holds for the case where b is even. - * - * - If a and b are odd, and a > b, then: - * - * a-b = x*(u0-u1) - y*(v0-v1) - * - * In that situation, if u0 < u1, then x*(u0-u1) < 0, but - * a-b > 0; therefore, it must be that v0 < v1, and the - * first part of the update is: (u0,v0) <- (u0-u1+y,v0-v1+x), - * which preserves the invariants. Otherwise, if u0 > u1, - * then u0-u1 >= 1, thus x*(u0-u1) >= x. But a <= x and - * b >= 0, hence a-b <= x. It follows that, in that case, - * v0-v1 >= 0. The first part of the update is then: - * (u0,v0) <- (u0-u1,v0-v1), which again preserves the - * invariants. 
- * - * Either way, once the subtraction is done, the new value of - * a, which is the difference of two odd values, is even, - * and the remaining of this step is a subcase of the - * first algorithm case (i.e. when a is even). - * - * - If a and b are odd, and b > a, then the a similar - * argument holds. - * - * The values a and b start at x and y, respectively. Since x - * and y are odd, their GCD is odd, and it is easily seen that - * all steps conserve the GCD (GCD(a-b,b) = GCD(a, b); - * GCD(a/2,b) = GCD(a,b) if GCD(a,b) is odd). Moreover, either a - * or b is reduced by at least one bit at each iteration, so - * the algorithm necessarily converges on the case a = b, at - * which point the common value is the GCD. - * - * In the algorithm expressed above, when a = b, the fourth case - * applies, and sets b = 0. Since a contains the GCD of x and y, - * which are both odd, a must be odd, and subsequent iterations - * (if any) will simply divide b by 2 repeatedly, which has no - * consequence. Thus, the algorithm can run for more iterations - * than necessary; the final GCD will be in a, and the (u,v) - * coefficients will be (u0,v0). - * - * - * The presentation above is bit-by-bit. It can be sped up by - * noticing that all decisions are taken based on the low bits - * and high bits of a and b. We can extract the two top words - * and low word of each of a and b, and compute reduction - * parameters pa, pb, qa and qb such that the new values for - * a and b are: - * a' = (a*pa + b*pb) / (2^31) - * b' = (a*qa + b*qb) / (2^31) - * the two divisions being exact. The coefficients are obtained - * just from the extracted words, and may be slightly off, requiring - * an optional correction: if a' < 0, then we replace pa with -pa - * and pb with -pb. Each such step will reduce the total length - * (sum of lengths of a and b) by at least 30 bits at each - * iteration. 
- */ - uint32_t *u0, *u1, *v0, *v1, *a, *b; - uint32_t x0i, y0i; - uint32_t num, rc; - size_t j; - - if (len == 0) { - return 0; - } - - /* - * u0 and v0 are the u and v result buffers; the four other - * values (u1, v1, a and b) are taken from tmp[]. - */ - u0 = u; - v0 = v; - u1 = tmp; - v1 = u1 + len; - a = v1 + len; - b = a + len; - - /* - * We'll need the Montgomery reduction coefficients. - */ - x0i = modp_ninv31(x[0]); - y0i = modp_ninv31(y[0]); - - /* - * Initialize a, b, u0, u1, v0 and v1. - * a = x u0 = 1 v0 = 0 - * b = y u1 = y v1 = x-1 - * Note that x is odd, so computing x-1 is easy. - */ - memcpy(a, x, len * sizeof *x); - memcpy(b, y, len * sizeof *y); - u0[0] = 1; - memset(u0 + 1, 0, (len - 1) * sizeof *u0); - memset(v0, 0, len * sizeof *v0); - memcpy(u1, y, len * sizeof *u1); - memcpy(v1, x, len * sizeof *v1); - v1[0] --; - - /* - * Each input operand may be as large as 31*len bits, and we - * reduce the total length by at least 30 bits at each iteration. - */ - for (num = 62 * (uint32_t)len + 30; num >= 30; num -= 30) { - uint32_t c0, c1; - uint32_t a0, a1, b0, b1; - uint64_t a_hi, b_hi; - uint32_t a_lo, b_lo; - int64_t pa, pb, qa, qb; - int i; - uint32_t r; - - /* - * Extract the top words of a and b. If j is the highest - * index >= 1 such that a[j] != 0 or b[j] != 0, then we - * want (a[j] << 31) + a[j-1] and (b[j] << 31) + b[j-1]. - * If a and b are down to one word each, then we use - * a[0] and b[0]. - */ - c0 = (uint32_t)-1; - c1 = (uint32_t)-1; - a0 = 0; - a1 = 0; - b0 = 0; - b1 = 0; - j = len; - while (j -- > 0) { - uint32_t aw, bw; - - aw = a[j]; - bw = b[j]; - a0 ^= (a0 ^ aw) & c0; - a1 ^= (a1 ^ aw) & c1; - b0 ^= (b0 ^ bw) & c0; - b1 ^= (b1 ^ bw) & c1; - c1 = c0; - c0 &= (((aw | bw) + 0x7FFFFFFF) >> 31) - (uint32_t)1; - } - - /* - * If c1 = 0, then we grabbed two words for a and b. - * If c1 != 0 but c0 = 0, then we grabbed one word. It - * is not possible that c1 != 0 and c0 != 0, because that - * would mean that both integers are zero. 
- */ - a1 |= a0 & c1; - a0 &= ~c1; - b1 |= b0 & c1; - b0 &= ~c1; - a_hi = ((uint64_t)a0 << 31) + a1; - b_hi = ((uint64_t)b0 << 31) + b1; - a_lo = a[0]; - b_lo = b[0]; - - /* - * Compute reduction factors: - * - * a' = a*pa + b*pb - * b' = a*qa + b*qb - * - * such that a' and b' are both multiple of 2^31, but are - * only marginally larger than a and b. - */ - pa = 1; - pb = 0; - qa = 0; - qb = 1; - for (i = 0; i < 31; i ++) { - /* - * At each iteration: - * - * a <- (a-b)/2 if: a is odd, b is odd, a_hi > b_hi - * b <- (b-a)/2 if: a is odd, b is odd, a_hi <= b_hi - * a <- a/2 if: a is even - * b <- b/2 if: a is odd, b is even - * - * We multiply a_lo and b_lo by 2 at each - * iteration, thus a division by 2 really is a - * non-multiplication by 2. - */ - uint32_t rt, oa, ob, cAB, cBA, cA; - uint64_t rz; - - /* - * rt = 1 if a_hi > b_hi, 0 otherwise. - */ - rz = b_hi - a_hi; - rt = (uint32_t)((rz ^ ((a_hi ^ b_hi) - & (a_hi ^ rz))) >> 63); - - /* - * cAB = 1 if b must be subtracted from a - * cBA = 1 if a must be subtracted from b - * cA = 1 if a must be divided by 2 - * - * Rules: - * - * cAB and cBA cannot both be 1. - * If a is not divided by 2, b is. - */ - oa = (a_lo >> i) & 1; - ob = (b_lo >> i) & 1; - cAB = oa & ob & rt; - cBA = oa & ob & ~rt; - cA = cAB | (oa ^ 1); - - /* - * Conditional subtractions. - */ - a_lo -= b_lo & -cAB; - a_hi -= b_hi & -(uint64_t)cAB; - pa -= qa & -(int64_t)cAB; - pb -= qb & -(int64_t)cAB; - b_lo -= a_lo & -cBA; - b_hi -= a_hi & -(uint64_t)cBA; - qa -= pa & -(int64_t)cBA; - qb -= pb & -(int64_t)cBA; - - /* - * Shifting. - */ - a_lo += a_lo & (cA - 1); - pa += pa & ((int64_t)cA - 1); - pb += pb & ((int64_t)cA - 1); - a_hi ^= (a_hi ^ (a_hi >> 1)) & -(uint64_t)cA; - b_lo += b_lo & -cA; - qa += qa & -(int64_t)cA; - qb += qb & -(int64_t)cA; - b_hi ^= (b_hi ^ (b_hi >> 1)) & ((uint64_t)cA - 1); - } - - /* - * Apply the computed parameters to our values. 
We - * may have to correct pa and pb depending on the - * returned value of zint_co_reduce() (when a and/or b - * had to be negated). - */ - r = zint_co_reduce(a, b, len, pa, pb, qa, qb); - pa -= (pa + pa) & -(int64_t)(r & 1); - pb -= (pb + pb) & -(int64_t)(r & 1); - qa -= (qa + qa) & -(int64_t)(r >> 1); - qb -= (qb + qb) & -(int64_t)(r >> 1); - zint_co_reduce_mod(u0, u1, y, len, y0i, pa, pb, qa, qb); - zint_co_reduce_mod(v0, v1, x, len, x0i, pa, pb, qa, qb); - } - - /* - * At that point, array a[] should contain the GCD, and the - * results (u,v) should already be set. We check that the GCD - * is indeed 1. We also check that the two operands x and y - * are odd. - */ - rc = a[0] ^ 1; - for (j = 1; j < len; j ++) { - rc |= a[j]; - } - return (int)((1 - ((rc | -rc) >> 31)) & x[0] & y[0]); -} - -/* - * Add k*y*2^sc to x. The result is assumed to fit in the array of - * size xlen (truncation is applied if necessary). - * Scale factor 'sc' is provided as sch and scl, such that: - * sch = sc / 31 - * scl = sc % 31 - * xlen MUST NOT be lower than ylen. - * - * x[] and y[] are both signed integers, using two's complement for - * negative values. - */ -static void -zint_add_scaled_mul_small(uint32_t *restrict x, size_t xlen, - const uint32_t *restrict y, size_t ylen, int32_t k, - uint32_t sch, uint32_t scl) -{ - size_t u; - uint32_t ysign, tw; - int32_t cc; - - if (ylen == 0) { - return; - } - - ysign = -(y[ylen - 1] >> 30) >> 1; - tw = 0; - cc = 0; - for (u = sch; u < xlen; u ++) { - size_t v; - uint32_t wy, wys, ccu; - uint64_t z; - - /* - * Get the next word of y (scaled). - */ - v = u - sch; - wy = v < ylen ? y[v] : ysign; - wys = ((wy << scl) & 0x7FFFFFFF) | tw; - tw = wy >> (31 - scl); - - /* - * The expression below does not overflow. 
- */ - z = (uint64_t)((int64_t)wys * (int64_t)k + (int64_t)x[u] + cc); - x[u] = (uint32_t)z & 0x7FFFFFFF; - - /* - * Right-shifting the signed value z would yield - * implementation-defined results (arithmetic shift is - * not guaranteed). However, we can cast to unsigned, - * and get the next carry as an unsigned word. We can - * then convert it back to signed by using the guaranteed - * fact that 'int32_t' uses two's complement with no - * trap representation or padding bit, and with a layout - * compatible with that of 'uint32_t'. - */ - ccu = (uint32_t)(z >> 31); - cc = *(int32_t *)&ccu; - } -} - -/* - * Subtract y*2^sc from x. The result is assumed to fit in the array of - * size xlen (truncation is applied if necessary). - * Scale factor 'sc' is provided as sch and scl, such that: - * sch = sc / 31 - * scl = sc % 31 - * xlen MUST NOT be lower than ylen. - * - * x[] and y[] are both signed integers, using two's complement for - * negative values. - */ -static void -zint_sub_scaled(uint32_t *restrict x, size_t xlen, - const uint32_t *restrict y, size_t ylen, uint32_t sch, uint32_t scl) -{ - size_t u; - uint32_t ysign, tw; - uint32_t cc; - - if (ylen == 0) { - return; - } - - ysign = -(y[ylen - 1] >> 30) >> 1; - tw = 0; - cc = 0; - for (u = sch; u < xlen; u ++) { - size_t v; - uint32_t w, wy, wys; - - /* - * Get the next word of y (scaled). - */ - v = u - sch; - wy = v < ylen ? y[v] : ysign; - wys = ((wy << scl) & 0x7FFFFFFF) | tw; - tw = wy >> (31 - scl); - - w = x[u] - wys - cc; - x[u] = w & 0x7FFFFFFF; - cc = w >> 31; - } -} - -/* - * Convert a one-word signed big integer into a signed value. - */ -static inline int32_t -zint_one_to_plain(const uint32_t *x) -{ - uint32_t w; - - w = x[0]; - w |= (w & 0x40000000) << 1; - return *(int32_t *)&w; -} - -/* ==================================================================== */ - -/* - * Convert a polynomial to floating-point values. 
- * - * Each coefficient has length flen words, and starts fstride words after - * the previous. - * - * IEEE-754 binary64 values can represent values in a finite range, - * roughly 2^(-1023) to 2^(+1023); thus, if coefficients are too large, - * they should be "trimmed" by pointing not to the lowest word of each, - * but upper. - */ -static void -poly_big_to_fp(fpr *d, const uint32_t *f, size_t flen, size_t fstride, - unsigned logn) -{ - size_t n, u; - - n = MKN(logn); - if (flen == 0) { - for (u = 0; u < n; u ++) { - d[u] = fpr_zero; - } - return; - } - for (u = 0; u < n; u ++, f += fstride) { - size_t v; - uint32_t neg, cc, xm; - fpr x, fsc; - - /* - * Get sign of the integer; if it is negative, then we - * will load its absolute value instead, and negate the - * result. - */ - neg = -(f[flen - 1] >> 30); - xm = neg >> 1; - cc = neg & 1; - x = fpr_zero; - fsc = fpr_one; - for (v = 0; v < flen; v ++, fsc = fpr_mul(fsc, fpr_ptwo31)) { - uint32_t w; - - w = (f[v] ^ xm) + cc; - cc = w >> 31; - w &= 0x7FFFFFFF; - w -= (w << 1) & neg; - x = fpr_add(x, fpr_mul(fpr_of(*(int32_t *)&w), fsc)); - } - d[u] = x; - } -} - -/* - * Convert a polynomial to small integers. Source values are supposed - * to be one-word integers, signed over 31 bits. Returned value is 0 - * if any of the coefficients exceeds the provided limit (in absolute - * value), or 1 on success. - * - * This is not constant-time; this is not a problem here, because on - * any failure, the NTRU-solving process will be deemed to have failed - * and the (f,g) polynomials will be discarded. - */ -static int -poly_big_to_small(int8_t *d, const uint32_t *s, int lim, unsigned logn) -{ - size_t n, u; - - n = MKN(logn); - for (u = 0; u < n; u ++) { - int32_t z; - - z = zint_one_to_plain(s + u); - if (z < -lim || z > lim) { - return 0; - } - d[u] = (int8_t)z; - } - return 1; -} - -/* - * Subtract k*f from F, where F, f and k are polynomials modulo X^N+1. 
- * Coefficients of polynomial k are small integers (signed values in the - * -2^31..2^31 range) scaled by 2^sc. Value sc is provided as sch = sc / 31 - * and scl = sc % 31. - * - * This function implements the basic quadratic multiplication algorithm, - * which is efficient in space (no extra buffer needed) but slow at - * high degree. - */ -static void -poly_sub_scaled(uint32_t *restrict F, size_t Flen, size_t Fstride, - const uint32_t *restrict f, size_t flen, size_t fstride, - const int32_t *restrict k, uint32_t sch, uint32_t scl, unsigned logn) -{ - size_t n, u; - - n = MKN(logn); - for (u = 0; u < n; u ++) { - int32_t kf; - size_t v; - uint32_t *x; - const uint32_t *y; - - kf = -k[u]; - x = F + u * Fstride; - y = f; - for (v = 0; v < n; v ++) { - zint_add_scaled_mul_small( - x, Flen, y, flen, kf, sch, scl); - if (u + v == n - 1) { - x = F; - kf = -kf; - } else { - x += Fstride; - } - y += fstride; - } - } -} - -/* - * Subtract k*f from F. Coefficients of polynomial k are small integers - * (signed values in the -2^31..2^31 range) scaled by 2^sc. This function - * assumes that the degree is large, and integers relatively small. - * The value sc is provided as sch = sc / 31 and scl = sc % 31. - */ -static void -poly_sub_scaled_ntt(uint32_t *restrict F, size_t Flen, size_t Fstride, - const uint32_t *restrict f, size_t flen, size_t fstride, - const int32_t *restrict k, uint32_t sch, uint32_t scl, unsigned logn, - uint32_t *restrict tmp) -{ - uint32_t *gm, *igm, *fk, *t1, *x; - const uint32_t *y; - size_t n, u, tlen; - const small_prime *primes; - - n = MKN(logn); - tlen = flen + 1; - gm = tmp; - igm = gm + MKN(logn); - fk = igm + MKN(logn); - t1 = fk + n * tlen; - - primes = PRIMES; - - /* - * Compute k*f in fk[], in RNS notation. 
- */ - for (u = 0; u < tlen; u ++) { - uint32_t p, p0i, R2, Rx; - size_t v; - - p = primes[u].p; - p0i = modp_ninv31(p); - R2 = modp_R2(p, p0i); - Rx = modp_Rx((unsigned)flen, p, p0i, R2); - modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i); - - for (v = 0; v < n; v ++) { - t1[v] = modp_set(k[v], p); - } - modp_NTT2(t1, gm, logn, p, p0i); - for (v = 0, y = f, x = fk + u; - v < n; v ++, y += fstride, x += tlen) - { - *x = zint_mod_small_signed(y, flen, p, p0i, R2, Rx); - } - modp_NTT2_ext(fk + u, tlen, gm, logn, p, p0i); - for (v = 0, x = fk + u; v < n; v ++, x += tlen) { - *x = modp_montymul( - modp_montymul(t1[v], *x, p, p0i), R2, p, p0i); - } - modp_iNTT2_ext(fk + u, tlen, igm, logn, p, p0i); - } - - /* - * Rebuild k*f. - */ - zint_rebuild_CRT(fk, tlen, tlen, n, primes, 1, t1); - - /* - * Subtract k*f, scaled, from F. - */ - for (u = 0, x = F, y = fk; u < n; u ++, x += Fstride, y += tlen) { - zint_sub_scaled(x, Flen, y, tlen, sch, scl); - } -} - -/* ==================================================================== */ - -#if FALCON_KG_CHACHA20 // yyyKG_CHACHA20+1 - -#define RNG_CONTEXT prng -#define get_rng_u64 prng_get_u64 - -#else // yyyKG_CHACHA20+0 - -#define RNG_CONTEXT inner_shake256_context - -/* - * Get a random 8-byte integer from a SHAKE-based RNG. This function - * ensures consistent interpretation of the SHAKE output so that - * the same values will be obtained over different platforms, in case - * a known seed is used. - */ -static inline uint64_t -get_rng_u64(inner_shake256_context *rng) -{ - /* - * We enforce little-endian representation. - */ - -#if FALCON_LE // yyyLE+1 - /* - * On little-endian systems we just interpret the bytes "as is" - * (this is correct because the exact-width types such as - * 'uint64_t' are guaranteed to have no padding and no trap - * representation). 
- */ - uint64_t r; - - inner_shake256_extract(rng, (uint8_t *)&r, sizeof r); - return r; -#else // yyyLE+0 - uint8_t tmp[8]; - - inner_shake256_extract(rng, tmp, sizeof tmp); - return (uint64_t)tmp[0] - | ((uint64_t)tmp[1] << 8) - | ((uint64_t)tmp[2] << 16) - | ((uint64_t)tmp[3] << 24) - | ((uint64_t)tmp[4] << 32) - | ((uint64_t)tmp[5] << 40) - | ((uint64_t)tmp[6] << 48) - | ((uint64_t)tmp[7] << 56); -#endif // yyyLE- -} - -#endif // yyyKG_CHACHA20- - -/* - * Table below incarnates a discrete Gaussian distribution: - * D(x) = exp(-(x^2)/(2*sigma^2)) - * where sigma = 1.17*sqrt(q/(2*N)), q = 12289, and N = 1024. - * Element 0 of the table is P(x = 0). - * For k > 0, element k is P(x >= k+1 | x > 0). - * Probabilities are scaled up by 2^63. - */ -static const uint64_t gauss_1024_12289[] = { - 1283868770400643928u, 6416574995475331444u, 4078260278032692663u, - 2353523259288686585u, 1227179971273316331u, 575931623374121527u, - 242543240509105209u, 91437049221049666u, 30799446349977173u, - 9255276791179340u, 2478152334826140u, 590642893610164u, - 125206034929641u, 23590435911403u, 3948334035941u, - 586753615614u, 77391054539u, 9056793210u, - 940121950u, 86539696u, 7062824u, - 510971u, 32764u, 1862u, - 94u, 4u, 0u -}; - -/* - * Generate a random value with a Gaussian distribution centered on 0. - * The RNG must be ready for extraction (already flipped). - * - * Distribution has standard deviation 1.17*sqrt(q/(2*N)). The - * precomputed table is for N = 1024. Since the sum of two independent - * values of standard deviation sigma has standard deviation - * sigma*sqrt(2), then we can just generate more values and add them - * together for lower dimensions. - */ -static int -mkgauss(RNG_CONTEXT *rng, unsigned logn) -{ - unsigned u, g; - int val; - - g = 1U << (10 - logn); - val = 0; - for (u = 0; u < g; u ++) { - /* - * Each iteration generates one value with the - * Gaussian distribution for N = 1024. - * - * We use two random 64-bit values. 
First value - * decides on whether the generated value is 0, and, - * if not, the sign of the value. Second random 64-bit - * word is used to generate the non-zero value. - * - * For constant-time code we have to read the complete - * table. This has negligible cost, compared with the - * remainder of the keygen process (solving the NTRU - * equation). - */ - uint64_t r; - uint32_t f, v, k, neg; - - /* - * First value: - * - flag 'neg' is randomly selected to be 0 or 1. - * - flag 'f' is set to 1 if the generated value is zero, - * or set to 0 otherwise. - */ - r = get_rng_u64(rng); - neg = (uint32_t)(r >> 63); - r &= ~((uint64_t)1 << 63); - f = (uint32_t)((r - gauss_1024_12289[0]) >> 63); - - /* - * We produce a new random 63-bit integer r, and go over - * the array, starting at index 1. We store in v the - * index of the first array element which is not greater - * than r, unless the flag f was already 1. - */ - v = 0; - r = get_rng_u64(rng); - r &= ~((uint64_t)1 << 63); - for (k = 1; k < (sizeof gauss_1024_12289) - / (sizeof gauss_1024_12289[0]); k ++) - { - uint32_t t; - - t = (uint32_t)((r - gauss_1024_12289[k]) >> 63) ^ 1; - v |= k & -(t & (f ^ 1)); - f |= t; - } - - /* - * We apply the sign ('neg' flag). If the value is zero, - * the sign has no effect. - */ - v = (v ^ -neg) + neg; - - /* - * Generated value is added to val. - */ - val += *(int32_t *)&v; - } - return val; -} - -/* - * The MAX_BL_SMALL[] and MAX_BL_LARGE[] contain the lengths, in 31-bit - * words, of intermediate values in the computation: - * - * MAX_BL_SMALL[depth]: length for the input f and g at that depth - * MAX_BL_LARGE[depth]: length for the unreduced F and G at that depth - * - * Rules: - * - * - Within an array, values grow. - * - * - The 'SMALL' array must have an entry for maximum depth, corresponding - * to the size of values used in the binary GCD. There is no such value - * for the 'LARGE' array (the binary GCD yields already reduced - * coefficients). 
- * - * - MAX_BL_LARGE[depth] >= MAX_BL_SMALL[depth + 1]. - * - * - Values must be large enough to handle the common cases, with some - * margins. - * - * - Values must not be "too large" either because we will convert some - * integers into floating-point values by considering the top 10 words, - * i.e. 310 bits; hence, for values of length more than 10 words, we - * should take care to have the length centered on the expected size. - * - * The following average lengths, in bits, have been measured on thousands - * of random keys (fg = max length of the absolute value of coefficients - * of f and g at that depth; FG = idem for the unreduced F and G; for the - * maximum depth, F and G are the output of binary GCD, multiplied by q; - * for each value, the average and standard deviation are provided). - * - * Binary case: - * depth: 10 fg: 6307.52 (24.48) FG: 6319.66 (24.51) - * depth: 9 fg: 3138.35 (12.25) FG: 9403.29 (27.55) - * depth: 8 fg: 1576.87 ( 7.49) FG: 4703.30 (14.77) - * depth: 7 fg: 794.17 ( 4.98) FG: 2361.84 ( 9.31) - * depth: 6 fg: 400.67 ( 3.10) FG: 1188.68 ( 6.04) - * depth: 5 fg: 202.22 ( 1.87) FG: 599.81 ( 3.87) - * depth: 4 fg: 101.62 ( 1.02) FG: 303.49 ( 2.38) - * depth: 3 fg: 50.37 ( 0.53) FG: 153.65 ( 1.39) - * depth: 2 fg: 24.07 ( 0.25) FG: 78.20 ( 0.73) - * depth: 1 fg: 10.99 ( 0.08) FG: 39.82 ( 0.41) - * depth: 0 fg: 4.00 ( 0.00) FG: 19.61 ( 0.49) - * - * Integers are actually represented either in binary notation over - * 31-bit words (signed, using two's complement), or in RNS, modulo - * many small primes. These small primes are close to, but slightly - * lower than, 2^31. Use of RNS loses less than two bits, even for - * the largest values. - * - * IMPORTANT: if these values are modified, then the temporary buffer - * sizes (FALCON_KEYGEN_TEMP_*, in inner.h) must be recomputed - * accordingly. 
- */ - -static const size_t MAX_BL_SMALL[] = { - 1, 1, 2, 2, 4, 7, 14, 27, 53, 106, 209 -}; - -static const size_t MAX_BL_LARGE[] = { - 2, 2, 5, 7, 12, 21, 40, 78, 157, 308 -}; - -/* - * Average and standard deviation for the maximum size (in bits) of - * coefficients of (f,g), depending on depth. These values are used - * to compute bounds for Babai's reduction. - */ -static const struct { - int avg; - int std; -} BITLENGTH[] = { - { 4, 0 }, - { 11, 1 }, - { 24, 1 }, - { 50, 1 }, - { 102, 1 }, - { 202, 2 }, - { 401, 4 }, - { 794, 5 }, - { 1577, 8 }, - { 3138, 13 }, - { 6308, 25 } -}; - -/* - * Minimal recursion depth at which we rebuild intermediate values - * when reconstructing f and g. - */ -#define DEPTH_INT_FG 4 - -/* - * Compute squared norm of a short vector. Returned value is saturated to - * 2^32-1 if it is not lower than 2^31. - */ -static uint32_t -poly_small_sqnorm(const int8_t *f, unsigned logn) -{ - size_t n, u; - uint32_t s, ng; - - n = MKN(logn); - s = 0; - ng = 0; - for (u = 0; u < n; u ++) { - int32_t z; - - z = f[u]; - s += (uint32_t)(z * z); - ng |= s; - } - return s | -(ng >> 31); -} - -/* - * Align (upwards) the provided 'data' pointer with regards to 'base' - * so that the offset is a multiple of the size of 'fpr'. - */ -static fpr * -align_fpr(void *base, void *data) -{ - uint8_t *cb, *cd; - size_t k, km; - - cb = base; - cd = data; - k = (size_t)(cd - cb); - km = k % sizeof(fpr); - if (km) { - k += (sizeof(fpr)) - km; - } - return (fpr *)(cb + k); -} - -/* - * Align (upwards) the provided 'data' pointer with regards to 'base' - * so that the offset is a multiple of the size of 'uint32_t'. - */ -static uint32_t * -align_u32(void *base, void *data) -{ - uint8_t *cb, *cd; - size_t k, km; - - cb = base; - cd = data; - k = (size_t)(cd - cb); - km = k % sizeof(uint32_t); - if (km) { - k += (sizeof(uint32_t)) - km; - } - return (uint32_t *)(cb + k); -} - -/* - * Convert a small vector to floating point. 
- */ -static void -poly_small_to_fp(fpr *x, const int8_t *f, unsigned logn) -{ - size_t n, u; - - n = MKN(logn); - for (u = 0; u < n; u ++) { - x[u] = fpr_of(f[u]); - } -} - -/* - * Input: f,g of degree N = 2^logn; 'depth' is used only to get their - * individual length. - * - * Output: f',g' of degree N/2, with the length for 'depth+1'. - * - * Values are in RNS; input and/or output may also be in NTT. - */ -static void -make_fg_step(uint32_t *data, unsigned logn, unsigned depth, - int in_ntt, int out_ntt) -{ - size_t n, hn, u; - size_t slen, tlen; - uint32_t *fd, *gd, *fs, *gs, *gm, *igm, *t1; - const small_prime *primes; - - n = (size_t)1 << logn; - hn = n >> 1; - slen = MAX_BL_SMALL[depth]; - tlen = MAX_BL_SMALL[depth + 1]; - primes = PRIMES; - - /* - * Prepare room for the result. - */ - fd = data; - gd = fd + hn * tlen; - fs = gd + hn * tlen; - gs = fs + n * slen; - gm = gs + n * slen; - igm = gm + n; - t1 = igm + n; - memmove(fs, data, 2 * n * slen * sizeof *data); - - /* - * First slen words: we use the input values directly, and apply - * inverse NTT as we go. 
- */ - for (u = 0; u < slen; u ++) { - uint32_t p, p0i, R2; - size_t v; - uint32_t *x; - - p = primes[u].p; - p0i = modp_ninv31(p); - R2 = modp_R2(p, p0i); - modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i); - - for (v = 0, x = fs + u; v < n; v ++, x += slen) { - t1[v] = *x; - } - if (!in_ntt) { - modp_NTT2(t1, gm, logn, p, p0i); - } - for (v = 0, x = fd + u; v < hn; v ++, x += tlen) { - uint32_t w0, w1; - - w0 = t1[(v << 1) + 0]; - w1 = t1[(v << 1) + 1]; - *x = modp_montymul( - modp_montymul(w0, w1, p, p0i), R2, p, p0i); - } - if (in_ntt) { - modp_iNTT2_ext(fs + u, slen, igm, logn, p, p0i); - } - - for (v = 0, x = gs + u; v < n; v ++, x += slen) { - t1[v] = *x; - } - if (!in_ntt) { - modp_NTT2(t1, gm, logn, p, p0i); - } - for (v = 0, x = gd + u; v < hn; v ++, x += tlen) { - uint32_t w0, w1; - - w0 = t1[(v << 1) + 0]; - w1 = t1[(v << 1) + 1]; - *x = modp_montymul( - modp_montymul(w0, w1, p, p0i), R2, p, p0i); - } - if (in_ntt) { - modp_iNTT2_ext(gs + u, slen, igm, logn, p, p0i); - } - - if (!out_ntt) { - modp_iNTT2_ext(fd + u, tlen, igm, logn - 1, p, p0i); - modp_iNTT2_ext(gd + u, tlen, igm, logn - 1, p, p0i); - } - } - - /* - * Since the fs and gs words have been de-NTTized, we can use the - * CRT to rebuild the values. - */ - zint_rebuild_CRT(fs, slen, slen, n, primes, 1, gm); - zint_rebuild_CRT(gs, slen, slen, n, primes, 1, gm); - - /* - * Remaining words: use modular reductions to extract the values. 
- */ - for (u = slen; u < tlen; u ++) { - uint32_t p, p0i, R2, Rx; - size_t v; - uint32_t *x; - - p = primes[u].p; - p0i = modp_ninv31(p); - R2 = modp_R2(p, p0i); - Rx = modp_Rx((unsigned)slen, p, p0i, R2); - modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i); - for (v = 0, x = fs; v < n; v ++, x += slen) { - t1[v] = zint_mod_small_signed(x, slen, p, p0i, R2, Rx); - } - modp_NTT2(t1, gm, logn, p, p0i); - for (v = 0, x = fd + u; v < hn; v ++, x += tlen) { - uint32_t w0, w1; - - w0 = t1[(v << 1) + 0]; - w1 = t1[(v << 1) + 1]; - *x = modp_montymul( - modp_montymul(w0, w1, p, p0i), R2, p, p0i); - } - for (v = 0, x = gs; v < n; v ++, x += slen) { - t1[v] = zint_mod_small_signed(x, slen, p, p0i, R2, Rx); - } - modp_NTT2(t1, gm, logn, p, p0i); - for (v = 0, x = gd + u; v < hn; v ++, x += tlen) { - uint32_t w0, w1; - - w0 = t1[(v << 1) + 0]; - w1 = t1[(v << 1) + 1]; - *x = modp_montymul( - modp_montymul(w0, w1, p, p0i), R2, p, p0i); - } - - if (!out_ntt) { - modp_iNTT2_ext(fd + u, tlen, igm, logn - 1, p, p0i); - modp_iNTT2_ext(gd + u, tlen, igm, logn - 1, p, p0i); - } - } -} - -/* - * Compute f and g at a specific depth, in RNS notation. - * - * Returned values are stored in the data[] array, at slen words per integer. - * - * Conditions: - * 0 <= depth <= logn - * - * Space use in data[]: enough room for any two successive values (f', g', - * f and g). 
- */ -static void -make_fg(uint32_t *data, const int8_t *f, const int8_t *g, - unsigned logn, unsigned depth, int out_ntt) -{ - size_t n, u; - uint32_t *ft, *gt, p0; - unsigned d; - const small_prime *primes; - - n = MKN(logn); - ft = data; - gt = ft + n; - primes = PRIMES; - p0 = primes[0].p; - for (u = 0; u < n; u ++) { - ft[u] = modp_set(f[u], p0); - gt[u] = modp_set(g[u], p0); - } - - if (depth == 0 && out_ntt) { - uint32_t *gm, *igm; - uint32_t p, p0i; - - p = primes[0].p; - p0i = modp_ninv31(p); - gm = gt + n; - igm = gm + MKN(logn); - modp_mkgm2(gm, igm, logn, primes[0].g, p, p0i); - modp_NTT2(ft, gm, logn, p, p0i); - modp_NTT2(gt, gm, logn, p, p0i); - return; - } - - for (d = 0; d < depth; d ++) { - make_fg_step(data, logn - d, d, - d != 0, (d + 1) < depth || out_ntt); - } -} - -/* - * Solving the NTRU equation, deepest level: compute the resultants of - * f and g with X^N+1, and use binary GCD. The F and G values are - * returned in tmp[]. - * - * Returned value: 1 on success, 0 on error. - */ -static int -solve_NTRU_deepest(unsigned logn_top, - const int8_t *f, const int8_t *g, uint32_t *tmp) -{ - size_t len; - uint32_t *Fp, *Gp, *fp, *gp, *t1, q; - const small_prime *primes; - - len = MAX_BL_SMALL[logn_top]; - primes = PRIMES; - - Fp = tmp; - Gp = Fp + len; - fp = Gp + len; - gp = fp + len; - t1 = gp + len; - - make_fg(fp, f, g, logn_top, logn_top, 0); - - /* - * We use the CRT to rebuild the resultants as big integers. - * There are two such big integers. The resultants are always - * nonnegative. - */ - zint_rebuild_CRT(fp, len, len, 2, primes, 0, t1); - - /* - * Apply the binary GCD. The zint_bezout() function works only - * if both inputs are odd. - * - * We can test on the result and return 0 because that would - * imply failure of the NTRU solving equation, and the (f,g) - * values will be abandoned in that case. - */ - if (!zint_bezout(Gp, Fp, fp, gp, len, t1)) { - return 0; - } - - /* - * Multiply the two values by the target value q. 
Values must - * fit in the destination arrays. - * We can again test on the returned words: a non-zero output - * of zint_mul_small() means that we exceeded our array - * capacity, and that implies failure and rejection of (f,g). - */ - q = 12289; - if (zint_mul_small(Fp, len, q) != 0 - || zint_mul_small(Gp, len, q) != 0) - { - return 0; - } - - return 1; -} - -/* - * Solving the NTRU equation, intermediate level. Upon entry, the F and G - * from the previous level should be in the tmp[] array. - * This function MAY be invoked for the top-level (in which case depth = 0). - * - * Returned value: 1 on success, 0 on error. - */ -static int -solve_NTRU_intermediate(unsigned logn_top, - const int8_t *f, const int8_t *g, unsigned depth, uint32_t *tmp) -{ - /* - * In this function, 'logn' is the log2 of the degree for - * this step. If N = 2^logn, then: - * - the F and G values already in fk->tmp (from the deeper - * levels) have degree N/2; - * - this function should return F and G of degree N. - */ - unsigned logn; - size_t n, hn, slen, dlen, llen, rlen, FGlen, u; - uint32_t *Fd, *Gd, *Ft, *Gt, *ft, *gt, *t1; - fpr *rt1, *rt2, *rt3, *rt4, *rt5; - int scale_fg, minbl_fg, maxbl_fg, maxbl_FG, scale_k; - uint32_t *x, *y; - int32_t *k; - const small_prime *primes; - - logn = logn_top - depth; - n = (size_t)1 << logn; - hn = n >> 1; - - /* - * slen = size for our input f and g; also size of the reduced - * F and G we return (degree N) - * - * dlen = size of the F and G obtained from the deeper level - * (degree N/2 or N/3) - * - * llen = size for intermediary F and G before reduction (degree N) - * - * We build our non-reduced F and G as two independent halves each, - * of degree N/2 (F = F0 + X*F1, G = G0 + X*G1). - */ - slen = MAX_BL_SMALL[depth]; - dlen = MAX_BL_SMALL[depth + 1]; - llen = MAX_BL_LARGE[depth]; - primes = PRIMES; - - /* - * Fd and Gd are the F and G from the deeper level. 
- */ - Fd = tmp; - Gd = Fd + dlen * hn; - - /* - * Compute the input f and g for this level. Note that we get f - * and g in RNS + NTT representation. - */ - ft = Gd + dlen * hn; - make_fg(ft, f, g, logn_top, depth, 1); - - /* - * Move the newly computed f and g to make room for our candidate - * F and G (unreduced). - */ - Ft = tmp; - Gt = Ft + n * llen; - t1 = Gt + n * llen; - memmove(t1, ft, 2 * n * slen * sizeof *ft); - ft = t1; - gt = ft + slen * n; - t1 = gt + slen * n; - - /* - * Move Fd and Gd _after_ f and g. - */ - memmove(t1, Fd, 2 * hn * dlen * sizeof *Fd); - Fd = t1; - Gd = Fd + hn * dlen; - - /* - * We reduce Fd and Gd modulo all the small primes we will need, - * and store the values in Ft and Gt (only n/2 values in each). - */ - for (u = 0; u < llen; u ++) { - uint32_t p, p0i, R2, Rx; - size_t v; - uint32_t *xs, *ys, *xd, *yd; - - p = primes[u].p; - p0i = modp_ninv31(p); - R2 = modp_R2(p, p0i); - Rx = modp_Rx((unsigned)dlen, p, p0i, R2); - for (v = 0, xs = Fd, ys = Gd, xd = Ft + u, yd = Gt + u; - v < hn; - v ++, xs += dlen, ys += dlen, xd += llen, yd += llen) - { - *xd = zint_mod_small_signed(xs, dlen, p, p0i, R2, Rx); - *yd = zint_mod_small_signed(ys, dlen, p, p0i, R2, Rx); - } - } - - /* - * We do not need Fd and Gd after that point. - */ - - /* - * Compute our F and G modulo sufficiently many small primes. - */ - for (u = 0; u < llen; u ++) { - uint32_t p, p0i, R2; - uint32_t *gm, *igm, *fx, *gx, *Fp, *Gp; - size_t v; - - /* - * All computations are done modulo p. - */ - p = primes[u].p; - p0i = modp_ninv31(p); - R2 = modp_R2(p, p0i); - - /* - * If we processed slen words, then f and g have been - * de-NTTized, and are in RNS; we can rebuild them. 
- */ - if (u == slen) { - zint_rebuild_CRT(ft, slen, slen, n, primes, 1, t1); - zint_rebuild_CRT(gt, slen, slen, n, primes, 1, t1); - } - - gm = t1; - igm = gm + n; - fx = igm + n; - gx = fx + n; - - modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i); - - if (u < slen) { - for (v = 0, x = ft + u, y = gt + u; - v < n; v ++, x += slen, y += slen) - { - fx[v] = *x; - gx[v] = *y; - } - modp_iNTT2_ext(ft + u, slen, igm, logn, p, p0i); - modp_iNTT2_ext(gt + u, slen, igm, logn, p, p0i); - } else { - uint32_t Rx; - - Rx = modp_Rx((unsigned)slen, p, p0i, R2); - for (v = 0, x = ft, y = gt; - v < n; v ++, x += slen, y += slen) - { - fx[v] = zint_mod_small_signed(x, slen, - p, p0i, R2, Rx); - gx[v] = zint_mod_small_signed(y, slen, - p, p0i, R2, Rx); - } - modp_NTT2(fx, gm, logn, p, p0i); - modp_NTT2(gx, gm, logn, p, p0i); - } - - /* - * Get F' and G' modulo p and in NTT representation - * (they have degree n/2). These values were computed in - * a previous step, and stored in Ft and Gt. - */ - Fp = gx + n; - Gp = Fp + hn; - for (v = 0, x = Ft + u, y = Gt + u; - v < hn; v ++, x += llen, y += llen) - { - Fp[v] = *x; - Gp[v] = *y; - } - modp_NTT2(Fp, gm, logn - 1, p, p0i); - modp_NTT2(Gp, gm, logn - 1, p, p0i); - - /* - * Compute our F and G modulo p. - * - * General case: - * - * we divide degree by d = 2 or 3 - * f'(x^d) = N(f)(x^d) = f * adj(f) - * g'(x^d) = N(g)(x^d) = g * adj(g) - * f'*G' - g'*F' = q - * F = F'(x^d) * adj(g) - * G = G'(x^d) * adj(f) - * - * We compute things in the NTT. We group roots of phi - * such that all roots x in a group share the same x^d. - * If the roots in a group are x_1, x_2... x_d, then: - * - * N(f)(x_1^d) = f(x_1)*f(x_2)*...*f(x_d) - * - * Thus, we have: - * - * G(x_1) = f(x_2)*f(x_3)*...*f(x_d)*G'(x_1^d) - * G(x_2) = f(x_1)*f(x_3)*...*f(x_d)*G'(x_1^d) - * ... - * G(x_d) = f(x_1)*f(x_2)*...*f(x_{d-1})*G'(x_1^d) - * - * In all cases, we can thus compute F and G in NTT - * representation by a few simple multiplications. 
- * Moreover, in our chosen NTT representation, roots - * from the same group are consecutive in RAM. - */ - for (v = 0, x = Ft + u, y = Gt + u; v < hn; - v ++, x += (llen << 1), y += (llen << 1)) - { - uint32_t ftA, ftB, gtA, gtB; - uint32_t mFp, mGp; - - ftA = fx[(v << 1) + 0]; - ftB = fx[(v << 1) + 1]; - gtA = gx[(v << 1) + 0]; - gtB = gx[(v << 1) + 1]; - mFp = modp_montymul(Fp[v], R2, p, p0i); - mGp = modp_montymul(Gp[v], R2, p, p0i); - x[0] = modp_montymul(gtB, mFp, p, p0i); - x[llen] = modp_montymul(gtA, mFp, p, p0i); - y[0] = modp_montymul(ftB, mGp, p, p0i); - y[llen] = modp_montymul(ftA, mGp, p, p0i); - } - modp_iNTT2_ext(Ft + u, llen, igm, logn, p, p0i); - modp_iNTT2_ext(Gt + u, llen, igm, logn, p, p0i); - } - - /* - * Rebuild F and G with the CRT. - */ - zint_rebuild_CRT(Ft, llen, llen, n, primes, 1, t1); - zint_rebuild_CRT(Gt, llen, llen, n, primes, 1, t1); - - /* - * At that point, Ft, Gt, ft and gt are consecutive in RAM (in that - * order). - */ - - /* - * Apply Babai reduction to bring back F and G to size slen. - * - * We use the FFT to compute successive approximations of the - * reduction coefficient. We first isolate the top bits of - * the coefficients of f and g, and convert them to floating - * point; with the FFT, we compute adj(f), adj(g), and - * 1/(f*adj(f)+g*adj(g)). - * - * Then, we repeatedly apply the following: - * - * - Get the top bits of the coefficients of F and G into - * floating point, and use the FFT to compute: - * (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g)) - * - * - Convert back that value into normal representation, and - * round it to the nearest integers, yielding a polynomial k. - * Proper scaling is applied to f, g, F and G so that the - * coefficients fit on 32 bits (signed). - * - * - Subtract k*f from F and k*g from G. - * - * Under normal conditions, this process reduces the size of F - * and G by some bits at each iteration. 
For constant-time - * operation, we do not want to measure the actual length of - * F and G; instead, we do the following: - * - * - f and g are converted to floating-point, with some scaling - * if necessary to keep values in the representable range. - * - * - For each iteration, we _assume_ a maximum size for F and G, - * and use the values at that size. If we overreach, then - * we get zeros, which is harmless: the resulting coefficients - * of k will be 0 and the value won't be reduced. - * - * - We conservatively assume that F and G will be reduced by - * at least 25 bits at each iteration. - * - * Even when reaching the bottom of the reduction, reduction - * coefficient will remain low. If it goes out-of-range, then - * something wrong occurred and the whole NTRU solving fails. - */ - - /* - * Memory layout: - * - We need to compute and keep adj(f), adj(g), and - * 1/(f*adj(f)+g*adj(g)) (sizes N, N and N/2 fp numbers, - * respectively). - * - At each iteration we need two extra fp buffer (N fp values), - * and produce a k (N 32-bit words). k will be shared with one - * of the fp buffers. - * - To compute k*f and k*g efficiently (with the NTT), we need - * some extra room; we reuse the space of the temporary buffers. - * - * Arrays of 'fpr' are obtained from the temporary array itself. - * We ensure that the base is at a properly aligned offset (the - * source array tmp[] is supposed to be already aligned). - */ - - rt3 = align_fpr(tmp, t1); - rt4 = rt3 + n; - rt5 = rt4 + n; - rt1 = rt5 + (n >> 1); - k = (int32_t *)align_u32(tmp, rt1); - rt2 = align_fpr(tmp, k + n); - if (rt2 < (rt1 + n)) { - rt2 = rt1 + n; - } - t1 = (uint32_t *)k + n; - - /* - * Get f and g into rt3 and rt4 as floating-point approximations. - * - * We need to "scale down" the floating-point representation of - * coefficients when they are too big. We want to keep the value - * below 2^310 or so. Thus, when values are larger than 10 words, - * we consider only the top 10 words. 
Array lengths have been - * computed so that average maximum length will fall in the - * middle or the upper half of these top 10 words. - */ - rlen = (slen > 10) ? 10 : slen; - poly_big_to_fp(rt3, ft + slen - rlen, rlen, slen, logn); - poly_big_to_fp(rt4, gt + slen - rlen, rlen, slen, logn); - - /* - * Values in rt3 and rt4 are downscaled by 2^(scale_fg). - */ - scale_fg = 31 * (int)(slen - rlen); - - /* - * Estimated boundaries for the maximum size (in bits) of the - * coefficients of (f,g). We use the measured average, and - * allow for a deviation of at most six times the standard - * deviation. - */ - minbl_fg = BITLENGTH[depth].avg - 6 * BITLENGTH[depth].std; - maxbl_fg = BITLENGTH[depth].avg + 6 * BITLENGTH[depth].std; - - /* - * Compute 1/(f*adj(f)+g*adj(g)) in rt5. We also keep adj(f) - * and adj(g) in rt3 and rt4, respectively. - */ - Zf(FFT)(rt3, logn); - Zf(FFT)(rt4, logn); - Zf(poly_invnorm2_fft)(rt5, rt3, rt4, logn); - Zf(poly_adj_fft)(rt3, logn); - Zf(poly_adj_fft)(rt4, logn); - - /* - * Reduce F and G repeatedly. - * - * The expected maximum bit length of coefficients of F and G - * is kept in maxbl_FG, with the corresponding word length in - * FGlen. - */ - FGlen = llen; - maxbl_FG = 31 * (int)llen; - - /* - * Each reduction operation computes the reduction polynomial - * "k". We need that polynomial to have coefficients that fit - * on 32-bit signed integers, with some scaling; thus, we use - * a descending sequence of scaling values, down to zero. - * - * The size of the coefficients of k is (roughly) the difference - * between the size of the coefficients of (F,G) and the size - * of the coefficients of (f,g). Thus, the maximum size of the - * coefficients of k is, at the start, maxbl_FG - minbl_fg; - * this is our starting scale value for k. - * - * We need to estimate the size of (F,G) during the execution of - * the algorithm; we are allowed some overestimation but not too - * much (poly_big_to_fp() uses a 310-bit window). 
Generally - * speaking, after applying a reduction with k scaled to - * scale_k, the size of (F,G) will be size(f,g) + scale_k + dd, - * where 'dd' is a few bits to account for the fact that the - * reduction is never perfect (intuitively, dd is on the order - * of sqrt(N), so at most 5 bits; we here allow for 10 extra - * bits). - * - * The size of (f,g) is not known exactly, but maxbl_fg is an - * upper bound. - */ - scale_k = maxbl_FG - minbl_fg; - - for (;;) { - int scale_FG, dc, new_maxbl_FG; - uint32_t scl, sch; - fpr pdc, pt; - - /* - * Convert current F and G into floating-point. We apply - * scaling if the current length is more than 10 words. - */ - rlen = (FGlen > 10) ? 10 : FGlen; - scale_FG = 31 * (int)(FGlen - rlen); - poly_big_to_fp(rt1, Ft + FGlen - rlen, rlen, llen, logn); - poly_big_to_fp(rt2, Gt + FGlen - rlen, rlen, llen, logn); - - /* - * Compute (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g)) in rt2. - */ - Zf(FFT)(rt1, logn); - Zf(FFT)(rt2, logn); - Zf(poly_mul_fft)(rt1, rt3, logn); - Zf(poly_mul_fft)(rt2, rt4, logn); - Zf(poly_add)(rt2, rt1, logn); - Zf(poly_mul_autoadj_fft)(rt2, rt5, logn); - Zf(iFFT)(rt2, logn); - - /* - * (f,g) are scaled by 'scale_fg', meaning that the - * numbers in rt3/rt4 should be multiplied by 2^(scale_fg) - * to have their true mathematical value. - * - * (F,G) are similarly scaled by 'scale_FG'. Therefore, - * the value we computed in rt2 is scaled by - * 'scale_FG-scale_fg'. - * - * We want that value to be scaled by 'scale_k', hence we - * apply a corrective scaling. After scaling, the values - * should fit in -2^31-1..+2^31-1. - */ - dc = scale_k - scale_FG + scale_fg; - - /* - * We will need to multiply values by 2^(-dc). The value - * 'dc' is not secret, so we can compute 2^(-dc) with a - * non-constant-time process. - * (We could use ldexp(), but we prefer to avoid any - * dependency on libm. When using FP emulation, we could - * use our fpr_ldexp(), which is constant-time.) 
- */ - if (dc < 0) { - dc = -dc; - pt = fpr_two; - } else { - pt = fpr_onehalf; - } - pdc = fpr_one; - while (dc != 0) { - if ((dc & 1) != 0) { - pdc = fpr_mul(pdc, pt); - } - dc >>= 1; - pt = fpr_sqr(pt); - } - - for (u = 0; u < n; u ++) { - fpr xv; - - xv = fpr_mul(rt2[u], pdc); - - /* - * Sometimes the values can be out-of-bounds if - * the algorithm fails; we must not call - * fpr_rint() (and cast to int32_t) if the value - * is not in-bounds. Note that the test does not - * break constant-time discipline, since any - * failure here implies that we discard the current - * secret key (f,g). - */ - if (!fpr_lt(fpr_mtwo31m1, xv) - || !fpr_lt(xv, fpr_ptwo31m1)) - { - return 0; - } - k[u] = (int32_t)fpr_rint(xv); - } - - /* - * Values in k[] are integers. They really are scaled - * down by maxbl_FG - minbl_fg bits. - * - * If we are at low depth, then we use the NTT to - * compute k*f and k*g. - */ - sch = (uint32_t)(scale_k / 31); - scl = (uint32_t)(scale_k % 31); - if (depth <= DEPTH_INT_FG) { - poly_sub_scaled_ntt(Ft, FGlen, llen, ft, slen, slen, - k, sch, scl, logn, t1); - poly_sub_scaled_ntt(Gt, FGlen, llen, gt, slen, slen, - k, sch, scl, logn, t1); - } else { - poly_sub_scaled(Ft, FGlen, llen, ft, slen, slen, - k, sch, scl, logn); - poly_sub_scaled(Gt, FGlen, llen, gt, slen, slen, - k, sch, scl, logn); - } - - /* - * We compute the new maximum size of (F,G), assuming that - * (f,g) has _maximal_ length (i.e. that reduction is - * "late" instead of "early". We also adjust FGlen - * accordingly. - */ - new_maxbl_FG = scale_k + maxbl_fg + 10; - if (new_maxbl_FG < maxbl_FG) { - maxbl_FG = new_maxbl_FG; - if ((int)FGlen * 31 >= maxbl_FG + 31) { - FGlen --; - } - } - - /* - * We suppose that scaling down achieves a reduction by - * at least 25 bits per iteration. We stop when we have - * done the loop with an unscaled k. 
- */ - if (scale_k <= 0) { - break; - } - scale_k -= 25; - if (scale_k < 0) { - scale_k = 0; - } - } - - /* - * If (F,G) length was lowered below 'slen', then we must take - * care to re-extend the sign. - */ - if (FGlen < slen) { - for (u = 0; u < n; u ++, Ft += llen, Gt += llen) { - size_t v; - uint32_t sw; - - sw = -(Ft[FGlen - 1] >> 30) >> 1; - for (v = FGlen; v < slen; v ++) { - Ft[v] = sw; - } - sw = -(Gt[FGlen - 1] >> 30) >> 1; - for (v = FGlen; v < slen; v ++) { - Gt[v] = sw; - } - } - } - - /* - * Compress encoding of all values to 'slen' words (this is the - * expected output format). - */ - for (u = 0, x = tmp, y = tmp; - u < (n << 1); u ++, x += slen, y += llen) - { - memmove(x, y, slen * sizeof *y); - } - return 1; -} - -/* - * Solving the NTRU equation, binary case, depth = 1. Upon entry, the - * F and G from the previous level should be in the tmp[] array. - * - * Returned value: 1 on success, 0 on error. - */ -static int -solve_NTRU_binary_depth1(unsigned logn_top, - const int8_t *f, const int8_t *g, uint32_t *tmp) -{ - /* - * The first half of this function is a copy of the corresponding - * part in solve_NTRU_intermediate(), for the reconstruction of - * the unreduced F and G. The second half (Babai reduction) is - * done differently, because the unreduced F and G fit in 53 bits - * of precision, allowing a much simpler process with lower RAM - * usage. 
- */ - unsigned depth, logn; - size_t n_top, n, hn, slen, dlen, llen, u; - uint32_t *Fd, *Gd, *Ft, *Gt, *ft, *gt, *t1; - fpr *rt1, *rt2, *rt3, *rt4, *rt5, *rt6; - uint32_t *x, *y; - - depth = 1; - n_top = (size_t)1 << logn_top; - logn = logn_top - depth; - n = (size_t)1 << logn; - hn = n >> 1; - - /* - * Equations are: - * - * f' = f0^2 - X^2*f1^2 - * g' = g0^2 - X^2*g1^2 - * F' and G' are a solution to f'G' - g'F' = q (from deeper levels) - * F = F'*(g0 - X*g1) - * G = G'*(f0 - X*f1) - * - * f0, f1, g0, g1, f', g', F' and G' are all "compressed" to - * degree N/2 (their odd-indexed coefficients are all zero). - */ - - /* - * slen = size for our input f and g; also size of the reduced - * F and G we return (degree N) - * - * dlen = size of the F and G obtained from the deeper level - * (degree N/2) - * - * llen = size for intermediary F and G before reduction (degree N) - * - * We build our non-reduced F and G as two independent halves each, - * of degree N/2 (F = F0 + X*F1, G = G0 + X*G1). - */ - slen = MAX_BL_SMALL[depth]; - dlen = MAX_BL_SMALL[depth + 1]; - llen = MAX_BL_LARGE[depth]; - - /* - * Fd and Gd are the F and G from the deeper level. Ft and Gt - * are the destination arrays for the unreduced F and G. - */ - Fd = tmp; - Gd = Fd + dlen * hn; - Ft = Gd + dlen * hn; - Gt = Ft + llen * n; - - /* - * We reduce Fd and Gd modulo all the small primes we will need, - * and store the values in Ft and Gt. - */ - for (u = 0; u < llen; u ++) { - uint32_t p, p0i, R2, Rx; - size_t v; - uint32_t *xs, *ys, *xd, *yd; - - p = PRIMES[u].p; - p0i = modp_ninv31(p); - R2 = modp_R2(p, p0i); - Rx = modp_Rx((unsigned)dlen, p, p0i, R2); - for (v = 0, xs = Fd, ys = Gd, xd = Ft + u, yd = Gt + u; - v < hn; - v ++, xs += dlen, ys += dlen, xd += llen, yd += llen) - { - *xd = zint_mod_small_signed(xs, dlen, p, p0i, R2, Rx); - *yd = zint_mod_small_signed(ys, dlen, p, p0i, R2, Rx); - } - } - - /* - * Now Fd and Gd are not needed anymore; we can squeeze them out. 
- */ - memmove(tmp, Ft, llen * n * sizeof(uint32_t)); - Ft = tmp; - memmove(Ft + llen * n, Gt, llen * n * sizeof(uint32_t)); - Gt = Ft + llen * n; - ft = Gt + llen * n; - gt = ft + slen * n; - - t1 = gt + slen * n; - - /* - * Compute our F and G modulo sufficiently many small primes. - */ - for (u = 0; u < llen; u ++) { - uint32_t p, p0i, R2; - uint32_t *gm, *igm, *fx, *gx, *Fp, *Gp; - unsigned e; - size_t v; - - /* - * All computations are done modulo p. - */ - p = PRIMES[u].p; - p0i = modp_ninv31(p); - R2 = modp_R2(p, p0i); - - /* - * We recompute things from the source f and g, of full - * degree. However, we will need only the n first elements - * of the inverse NTT table (igm); the call to modp_mkgm() - * below will fill n_top elements in igm[] (thus overflowing - * into fx[]) but later code will overwrite these extra - * elements. - */ - gm = t1; - igm = gm + n_top; - fx = igm + n; - gx = fx + n_top; - modp_mkgm2(gm, igm, logn_top, PRIMES[u].g, p, p0i); - - /* - * Set ft and gt to f and g modulo p, respectively. - */ - for (v = 0; v < n_top; v ++) { - fx[v] = modp_set(f[v], p); - gx[v] = modp_set(g[v], p); - } - - /* - * Convert to NTT and compute our f and g. - */ - modp_NTT2(fx, gm, logn_top, p, p0i); - modp_NTT2(gx, gm, logn_top, p, p0i); - for (e = logn_top; e > logn; e --) { - modp_poly_rec_res(fx, e, p, p0i, R2); - modp_poly_rec_res(gx, e, p, p0i, R2); - } - - /* - * From that point onward, we only need tables for - * degree n, so we can save some space. - */ - if (depth > 0) { /* always true */ - memmove(gm + n, igm, n * sizeof *igm); - igm = gm + n; - memmove(igm + n, fx, n * sizeof *ft); - fx = igm + n; - memmove(fx + n, gx, n * sizeof *gt); - gx = fx + n; - } - - /* - * Get F' and G' modulo p and in NTT representation - * (they have degree n/2). These values were computed - * in a previous step, and stored in Ft and Gt. 
- */ - Fp = gx + n; - Gp = Fp + hn; - for (v = 0, x = Ft + u, y = Gt + u; - v < hn; v ++, x += llen, y += llen) - { - Fp[v] = *x; - Gp[v] = *y; - } - modp_NTT2(Fp, gm, logn - 1, p, p0i); - modp_NTT2(Gp, gm, logn - 1, p, p0i); - - /* - * Compute our F and G modulo p. - * - * Equations are: - * - * f'(x^2) = N(f)(x^2) = f * adj(f) - * g'(x^2) = N(g)(x^2) = g * adj(g) - * - * f'*G' - g'*F' = q - * - * F = F'(x^2) * adj(g) - * G = G'(x^2) * adj(f) - * - * The NTT representation of f is f(w) for all w which - * are roots of phi. In the binary case, as well as in - * the ternary case for all depth except the deepest, - * these roots can be grouped in pairs (w,-w), and we - * then have: - * - * f(w) = adj(f)(-w) - * f(-w) = adj(f)(w) - * - * and w^2 is then a root for phi at the half-degree. - * - * At the deepest level in the ternary case, this still - * holds, in the following sense: the roots of x^2-x+1 - * are (w,-w^2) (for w^3 = -1, and w != -1), and we - * have: - * - * f(w) = adj(f)(-w^2) - * f(-w^2) = adj(f)(w) - * - * In all case, we can thus compute F and G in NTT - * representation by a few simple multiplications. - * Moreover, the two roots for each pair are consecutive - * in our bit-reversal encoding. - */ - for (v = 0, x = Ft + u, y = Gt + u; - v < hn; v ++, x += (llen << 1), y += (llen << 1)) - { - uint32_t ftA, ftB, gtA, gtB; - uint32_t mFp, mGp; - - ftA = fx[(v << 1) + 0]; - ftB = fx[(v << 1) + 1]; - gtA = gx[(v << 1) + 0]; - gtB = gx[(v << 1) + 1]; - mFp = modp_montymul(Fp[v], R2, p, p0i); - mGp = modp_montymul(Gp[v], R2, p, p0i); - x[0] = modp_montymul(gtB, mFp, p, p0i); - x[llen] = modp_montymul(gtA, mFp, p, p0i); - y[0] = modp_montymul(ftB, mGp, p, p0i); - y[llen] = modp_montymul(ftA, mGp, p, p0i); - } - modp_iNTT2_ext(Ft + u, llen, igm, logn, p, p0i); - modp_iNTT2_ext(Gt + u, llen, igm, logn, p, p0i); - - /* - * Also save ft and gt (only up to size slen). 
- */ - if (u < slen) { - modp_iNTT2(fx, igm, logn, p, p0i); - modp_iNTT2(gx, igm, logn, p, p0i); - for (v = 0, x = ft + u, y = gt + u; - v < n; v ++, x += slen, y += slen) - { - *x = fx[v]; - *y = gx[v]; - } - } - } - - /* - * Rebuild f, g, F and G with the CRT. Note that the elements of F - * and G are consecutive, and thus can be rebuilt in a single - * loop; similarly, the elements of f and g are consecutive. - */ - zint_rebuild_CRT(Ft, llen, llen, n << 1, PRIMES, 1, t1); - zint_rebuild_CRT(ft, slen, slen, n << 1, PRIMES, 1, t1); - - /* - * Here starts the Babai reduction, specialized for depth = 1. - * - * Candidates F and G (from Ft and Gt), and base f and g (ft and gt), - * are converted to floating point. There is no scaling, and a - * single pass is sufficient. - */ - - /* - * Convert F and G into floating point (rt1 and rt2). - */ - rt1 = align_fpr(tmp, gt + slen * n); - rt2 = rt1 + n; - poly_big_to_fp(rt1, Ft, llen, llen, logn); - poly_big_to_fp(rt2, Gt, llen, llen, logn); - - /* - * Integer representation of F and G is no longer needed, we - * can remove it. - */ - memmove(tmp, ft, 2 * slen * n * sizeof *ft); - ft = tmp; - gt = ft + slen * n; - rt3 = align_fpr(tmp, gt + slen * n); - memmove(rt3, rt1, 2 * n * sizeof *rt1); - rt1 = rt3; - rt2 = rt1 + n; - rt3 = rt2 + n; - rt4 = rt3 + n; - - /* - * Convert f and g into floating point (rt3 and rt4). - */ - poly_big_to_fp(rt3, ft, slen, slen, logn); - poly_big_to_fp(rt4, gt, slen, slen, logn); - - /* - * Remove unneeded ft and gt. - */ - memmove(tmp, rt1, 4 * n * sizeof *rt1); - rt1 = (fpr *)tmp; - rt2 = rt1 + n; - rt3 = rt2 + n; - rt4 = rt3 + n; - - /* - * We now have: - * rt1 = F - * rt2 = G - * rt3 = f - * rt4 = g - * in that order in RAM. We convert all of them to FFT. - */ - Zf(FFT)(rt1, logn); - Zf(FFT)(rt2, logn); - Zf(FFT)(rt3, logn); - Zf(FFT)(rt4, logn); - - /* - * Compute: - * rt5 = F*adj(f) + G*adj(g) - * rt6 = 1 / (f*adj(f) + g*adj(g)) - * (Note that rt6 is half-length.) 
- */ - rt5 = rt4 + n; - rt6 = rt5 + n; - Zf(poly_add_muladj_fft)(rt5, rt1, rt2, rt3, rt4, logn); - Zf(poly_invnorm2_fft)(rt6, rt3, rt4, logn); - - /* - * Compute: - * rt5 = (F*adj(f)+G*adj(g)) / (f*adj(f)+g*adj(g)) - */ - Zf(poly_mul_autoadj_fft)(rt5, rt6, logn); - - /* - * Compute k as the rounded version of rt5. Check that none of - * the values is larger than 2^63-1 (in absolute value) - * because that would make the fpr_rint() do something undefined; - * note that any out-of-bounds value here implies a failure and - * (f,g) will be discarded, so we can make a simple test. - */ - Zf(iFFT)(rt5, logn); - for (u = 0; u < n; u ++) { - fpr z; - - z = rt5[u]; - if (!fpr_lt(z, fpr_ptwo63m1) || !fpr_lt(fpr_mtwo63m1, z)) { - return 0; - } - rt5[u] = fpr_of(fpr_rint(z)); - } - Zf(FFT)(rt5, logn); - - /* - * Subtract k*f from F, and k*g from G. - */ - Zf(poly_mul_fft)(rt3, rt5, logn); - Zf(poly_mul_fft)(rt4, rt5, logn); - Zf(poly_sub)(rt1, rt3, logn); - Zf(poly_sub)(rt2, rt4, logn); - Zf(iFFT)(rt1, logn); - Zf(iFFT)(rt2, logn); - - /* - * Convert back F and G to integers, and return. - */ - Ft = tmp; - Gt = Ft + n; - rt3 = align_fpr(tmp, Gt + n); - memmove(rt3, rt1, 2 * n * sizeof *rt1); - rt1 = rt3; - rt2 = rt1 + n; - for (u = 0; u < n; u ++) { - Ft[u] = (uint32_t)fpr_rint(rt1[u]); - Gt[u] = (uint32_t)fpr_rint(rt2[u]); - } - - return 1; -} - -/* - * Solving the NTRU equation, top level. Upon entry, the F and G - * from the previous level should be in the tmp[] array. - * - * Returned value: 1 on success, 0 on error. 
- */ -static int -solve_NTRU_binary_depth0(unsigned logn, - const int8_t *f, const int8_t *g, uint32_t *tmp) -{ - size_t n, hn, u; - uint32_t p, p0i, R2; - uint32_t *Fp, *Gp, *t1, *t2, *t3, *t4, *t5; - uint32_t *gm, *igm, *ft, *gt; - fpr *rt2, *rt3; - - n = (size_t)1 << logn; - hn = n >> 1; - - /* - * Equations are: - * - * f' = f0^2 - X^2*f1^2 - * g' = g0^2 - X^2*g1^2 - * F' and G' are a solution to f'G' - g'F' = q (from deeper levels) - * F = F'*(g0 - X*g1) - * G = G'*(f0 - X*f1) - * - * f0, f1, g0, g1, f', g', F' and G' are all "compressed" to - * degree N/2 (their odd-indexed coefficients are all zero). - * - * Everything should fit in 31-bit integers, hence we can just use - * the first small prime p = 2147473409. - */ - p = PRIMES[0].p; - p0i = modp_ninv31(p); - R2 = modp_R2(p, p0i); - - Fp = tmp; - Gp = Fp + hn; - ft = Gp + hn; - gt = ft + n; - gm = gt + n; - igm = gm + n; - - modp_mkgm2(gm, igm, logn, PRIMES[0].g, p, p0i); - - /* - * Convert F' anf G' in NTT representation. - */ - for (u = 0; u < hn; u ++) { - Fp[u] = modp_set(zint_one_to_plain(Fp + u), p); - Gp[u] = modp_set(zint_one_to_plain(Gp + u), p); - } - modp_NTT2(Fp, gm, logn - 1, p, p0i); - modp_NTT2(Gp, gm, logn - 1, p, p0i); - - /* - * Load f and g and convert them to NTT representation. - */ - for (u = 0; u < n; u ++) { - ft[u] = modp_set(f[u], p); - gt[u] = modp_set(g[u], p); - } - modp_NTT2(ft, gm, logn, p, p0i); - modp_NTT2(gt, gm, logn, p, p0i); - - /* - * Build the unreduced F,G in ft and gt. 
- */ - for (u = 0; u < n; u += 2) { - uint32_t ftA, ftB, gtA, gtB; - uint32_t mFp, mGp; - - ftA = ft[u + 0]; - ftB = ft[u + 1]; - gtA = gt[u + 0]; - gtB = gt[u + 1]; - mFp = modp_montymul(Fp[u >> 1], R2, p, p0i); - mGp = modp_montymul(Gp[u >> 1], R2, p, p0i); - ft[u + 0] = modp_montymul(gtB, mFp, p, p0i); - ft[u + 1] = modp_montymul(gtA, mFp, p, p0i); - gt[u + 0] = modp_montymul(ftB, mGp, p, p0i); - gt[u + 1] = modp_montymul(ftA, mGp, p, p0i); - } - modp_iNTT2(ft, igm, logn, p, p0i); - modp_iNTT2(gt, igm, logn, p, p0i); - - Gp = Fp + n; - t1 = Gp + n; - memmove(Fp, ft, 2 * n * sizeof *ft); - - /* - * We now need to apply the Babai reduction. At that point, - * we have F and G in two n-word arrays. - * - * We can compute F*adj(f)+G*adj(g) and f*adj(f)+g*adj(g) - * modulo p, using the NTT. We still move memory around in - * order to save RAM. - */ - t2 = t1 + n; - t3 = t2 + n; - t4 = t3 + n; - t5 = t4 + n; - - /* - * Compute the NTT tables in t1 and t2. We do not keep t2 - * (we'll recompute it later on). - */ - modp_mkgm2(t1, t2, logn, PRIMES[0].g, p, p0i); - - /* - * Convert F and G to NTT. - */ - modp_NTT2(Fp, t1, logn, p, p0i); - modp_NTT2(Gp, t1, logn, p, p0i); - - /* - * Load f and adj(f) in t4 and t5, and convert them to NTT - * representation. - */ - t4[0] = t5[0] = modp_set(f[0], p); - for (u = 1; u < n; u ++) { - t4[u] = modp_set(f[u], p); - t5[n - u] = modp_set(-f[u], p); - } - modp_NTT2(t4, t1, logn, p, p0i); - modp_NTT2(t5, t1, logn, p, p0i); - - /* - * Compute F*adj(f) in t2, and f*adj(f) in t3. - */ - for (u = 0; u < n; u ++) { - uint32_t w; - - w = modp_montymul(t5[u], R2, p, p0i); - t2[u] = modp_montymul(w, Fp[u], p, p0i); - t3[u] = modp_montymul(w, t4[u], p, p0i); - } - - /* - * Load g and adj(g) in t4 and t5, and convert them to NTT - * representation. 
- */ - t4[0] = t5[0] = modp_set(g[0], p); - for (u = 1; u < n; u ++) { - t4[u] = modp_set(g[u], p); - t5[n - u] = modp_set(-g[u], p); - } - modp_NTT2(t4, t1, logn, p, p0i); - modp_NTT2(t5, t1, logn, p, p0i); - - /* - * Add G*adj(g) to t2, and g*adj(g) to t3. - */ - for (u = 0; u < n; u ++) { - uint32_t w; - - w = modp_montymul(t5[u], R2, p, p0i); - t2[u] = modp_add(t2[u], - modp_montymul(w, Gp[u], p, p0i), p); - t3[u] = modp_add(t3[u], - modp_montymul(w, t4[u], p, p0i), p); - } - - /* - * Convert back t2 and t3 to normal representation (normalized - * around 0), and then - * move them to t1 and t2. We first need to recompute the - * inverse table for NTT. - */ - modp_mkgm2(t1, t4, logn, PRIMES[0].g, p, p0i); - modp_iNTT2(t2, t4, logn, p, p0i); - modp_iNTT2(t3, t4, logn, p, p0i); - for (u = 0; u < n; u ++) { - t1[u] = (uint32_t)modp_norm(t2[u], p); - t2[u] = (uint32_t)modp_norm(t3[u], p); - } - - /* - * At that point, array contents are: - * - * F (NTT representation) (Fp) - * G (NTT representation) (Gp) - * F*adj(f)+G*adj(g) (t1) - * f*adj(f)+g*adj(g) (t2) - * - * We want to divide t1 by t2. The result is not integral; it - * must be rounded. We thus need to use the FFT. - */ - - /* - * Get f*adj(f)+g*adj(g) in FFT representation. Since this - * polynomial is auto-adjoint, all its coordinates in FFT - * representation are actually real, so we can truncate off - * the imaginary parts. - */ - rt3 = align_fpr(tmp, t3); - for (u = 0; u < n; u ++) { - rt3[u] = fpr_of(((int32_t *)t2)[u]); - } - Zf(FFT)(rt3, logn); - rt2 = align_fpr(tmp, t2); - memmove(rt2, rt3, hn * sizeof *rt3); - - /* - * Convert F*adj(f)+G*adj(g) in FFT representation. - */ - rt3 = rt2 + hn; - for (u = 0; u < n; u ++) { - rt3[u] = fpr_of(((int32_t *)t1)[u]); - } - Zf(FFT)(rt3, logn); - - /* - * Compute (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g)) and get - * its rounded normal representation in t1. 
- */ - Zf(poly_div_autoadj_fft)(rt3, rt2, logn); - Zf(iFFT)(rt3, logn); - for (u = 0; u < n; u ++) { - t1[u] = modp_set((int32_t)fpr_rint(rt3[u]), p); - } - - /* - * RAM contents are now: - * - * F (NTT representation) (Fp) - * G (NTT representation) (Gp) - * k (t1) - * - * We want to compute F-k*f, and G-k*g. - */ - t2 = t1 + n; - t3 = t2 + n; - t4 = t3 + n; - t5 = t4 + n; - modp_mkgm2(t2, t3, logn, PRIMES[0].g, p, p0i); - for (u = 0; u < n; u ++) { - t4[u] = modp_set(f[u], p); - t5[u] = modp_set(g[u], p); - } - modp_NTT2(t1, t2, logn, p, p0i); - modp_NTT2(t4, t2, logn, p, p0i); - modp_NTT2(t5, t2, logn, p, p0i); - for (u = 0; u < n; u ++) { - uint32_t kw; - - kw = modp_montymul(t1[u], R2, p, p0i); - Fp[u] = modp_sub(Fp[u], - modp_montymul(kw, t4[u], p, p0i), p); - Gp[u] = modp_sub(Gp[u], - modp_montymul(kw, t5[u], p, p0i), p); - } - modp_iNTT2(Fp, t3, logn, p, p0i); - modp_iNTT2(Gp, t3, logn, p, p0i); - for (u = 0; u < n; u ++) { - Fp[u] = (uint32_t)modp_norm(Fp[u], p); - Gp[u] = (uint32_t)modp_norm(Gp[u], p); - } - - return 1; -} - -/* - * Solve the NTRU equation. Returned value is 1 on success, 0 on error. - * G can be NULL, in which case that value is computed but not returned. - * If any of the coefficients of F and G exceeds lim (in absolute value), - * then 0 is returned. - */ -static int -solve_NTRU(unsigned logn, int8_t *F, int8_t *G, - const int8_t *f, const int8_t *g, int lim, uint32_t *tmp) -{ - size_t n, u; - uint32_t *ft, *gt, *Ft, *Gt, *gm; - uint32_t p, p0i, r; - const small_prime *primes; - - n = MKN(logn); - - if (!solve_NTRU_deepest(logn, f, g, tmp)) { - return 0; - } - - /* - * For logn <= 2, we need to use solve_NTRU_intermediate() - * directly, because coefficients are a bit too large and - * do not fit the hypotheses in solve_NTRU_binary_depth0(). 
- */ - if (logn <= 2) { - unsigned depth; - - depth = logn; - while (depth -- > 0) { - if (!solve_NTRU_intermediate(logn, f, g, depth, tmp)) { - return 0; - } - } - } else { - unsigned depth; - - depth = logn; - while (depth -- > 2) { - if (!solve_NTRU_intermediate(logn, f, g, depth, tmp)) { - return 0; - } - } - if (!solve_NTRU_binary_depth1(logn, f, g, tmp)) { - return 0; - } - if (!solve_NTRU_binary_depth0(logn, f, g, tmp)) { - return 0; - } - } - - /* - * If no buffer has been provided for G, use a temporary one. - */ - if (G == NULL) { - G = (int8_t *)(tmp + 2 * n); - } - - /* - * Final F and G are in fk->tmp, one word per coefficient - * (signed value over 31 bits). - */ - if (!poly_big_to_small(F, tmp, lim, logn) - || !poly_big_to_small(G, tmp + n, lim, logn)) - { - return 0; - } - - /* - * Verify that the NTRU equation is fulfilled. Since all elements - * have short lengths, verifying modulo a small prime p works, and - * allows using the NTT. - * - * We put Gt[] first in tmp[], and process it first, so that it does - * not overlap with G[] in case we allocated it ourselves. - */ - Gt = tmp; - ft = Gt + n; - gt = ft + n; - Ft = gt + n; - gm = Ft + n; - - primes = PRIMES; - p = primes[0].p; - p0i = modp_ninv31(p); - modp_mkgm2(gm, tmp, logn, primes[0].g, p, p0i); - for (u = 0; u < n; u ++) { - Gt[u] = modp_set(G[u], p); - } - for (u = 0; u < n; u ++) { - ft[u] = modp_set(f[u], p); - gt[u] = modp_set(g[u], p); - Ft[u] = modp_set(F[u], p); - } - modp_NTT2(ft, gm, logn, p, p0i); - modp_NTT2(gt, gm, logn, p, p0i); - modp_NTT2(Ft, gm, logn, p, p0i); - modp_NTT2(Gt, gm, logn, p, p0i); - r = modp_montymul(12289, 1, p, p0i); - for (u = 0; u < n; u ++) { - uint32_t z; - - z = modp_sub(modp_montymul(ft[u], Gt[u], p, p0i), - modp_montymul(gt[u], Ft[u], p, p0i), p); - if (z != r) { - return 0; - } - } - - return 1; -} - -/* - * Generate a random polynomial with a Gaussian distribution. 
This function - * also makes sure that the resultant of the polynomial with phi is odd. - */ -static void -poly_small_mkgauss(RNG_CONTEXT *rng, int8_t *f, unsigned logn) -{ - size_t n, u; - unsigned mod2; - - n = MKN(logn); - mod2 = 0; - for (u = 0; u < n; u ++) { - int s; - - restart: - s = mkgauss(rng, logn); - - /* - * We need the coefficient to fit within -127..+127; - * realistically, this is always the case except for - * the very low degrees (N = 2 or 4), for which there - * is no real security anyway. - */ - if (s < -127 || s > 127) { - goto restart; - } - - /* - * We need the sum of all coefficients to be 1; otherwise, - * the resultant of the polynomial with X^N+1 will be even, - * and the binary GCD will fail. - */ - if (u == n - 1) { - if ((mod2 ^ (unsigned)(s & 1)) == 0) { - goto restart; - } - } else { - mod2 ^= (unsigned)(s & 1); - } - f[u] = (int8_t)s; - } -} - -/* see falcon.h */ -void -Zf(keygen)(inner_shake256_context *rng, - int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h, - unsigned logn, uint8_t *tmp) -{ - /* - * Algorithm is the following: - * - * - Generate f and g with the Gaussian distribution. - * - * - If either Res(f,phi) or Res(g,phi) is even, try again. - * - * - If ||(f,g)|| is too large, try again. - * - * - If ||B~_{f,g}|| is too large, try again. - * - * - If f is not invertible mod phi mod q, try again. - * - * - Compute h = g/f mod phi mod q. - * - * - Solve the NTRU equation fG - gF = q; if the solving fails, - * try again. Usual failure condition is when Res(f,phi) - * and Res(g,phi) are not prime to each other. 
- */ - size_t n, u; - uint16_t *h2, *tmp2; - RNG_CONTEXT *rc; -#if FALCON_KG_CHACHA20 // yyyKG_CHACHA20+1 - prng p; -#endif // yyyKG_CHACHA20- - - n = MKN(logn); -#if FALCON_KG_CHACHA20 // yyyKG_CHACHA20+1 - Zf(prng_init)(&p, rng); - rc = &p; -#else // yyyKG_CHACHA20+0 - rc = rng; -#endif // yyyKG_CHACHA20- - - /* - * We need to generate f and g randomly, until we find values - * such that the norm of (g,-f), and of the orthogonalized - * vector, are satisfying. The orthogonalized vector is: - * (q*adj(f)/(f*adj(f)+g*adj(g)), q*adj(g)/(f*adj(f)+g*adj(g))) - * (it is actually the (N+1)-th row of the Gram-Schmidt basis). - * - * In the binary case, coefficients of f and g are generated - * independently of each other, with a discrete Gaussian - * distribution of standard deviation 1.17*sqrt(q/(2*N)). Then, - * the two vectors have expected norm 1.17*sqrt(q), which is - * also our acceptance bound: we require both vectors to be no - * larger than that (this will be satisfied about 1/4th of the - * time, thus we expect sampling new (f,g) about 4 times for that - * step). - * - * We require that Res(f,phi) and Res(g,phi) are both odd (the - * NTRU equation solver requires it). - */ - for (;;) { - fpr *rt1, *rt2, *rt3; - fpr bnorm; - uint32_t normf, normg, norm; - int lim; - - /* - * The poly_small_mkgauss() function makes sure - * that the sum of coefficients is 1 modulo 2 - * (i.e. the resultant of the polynomial with phi - * will be odd). - */ - poly_small_mkgauss(rc, f, logn); - poly_small_mkgauss(rc, g, logn); - - /* - * Verify that all coefficients are within the bounds - * defined in max_fg_bits. This is the case with - * overwhelming probability; this guarantees that the - * key will be encodable with FALCON_COMP_TRIM. - */ - lim = 1 << (Zf(max_fg_bits)[logn] - 1); - for (u = 0; u < n; u ++) { - /* - * We can use non-CT tests since on any failure - * we will discard f and g. 
- */ - if (f[u] >= lim || f[u] <= -lim - || g[u] >= lim || g[u] <= -lim) - { - lim = -1; - break; - } - } - if (lim < 0) { - continue; - } - - /* - * Bound is 1.17*sqrt(q). We compute the squared - * norms. With q = 12289, the squared bound is: - * (1.17^2)* 12289 = 16822.4121 - * Since f and g are integral, the squared norm - * of (g,-f) is an integer. - */ - normf = poly_small_sqnorm(f, logn); - normg = poly_small_sqnorm(g, logn); - norm = (normf + normg) | -((normf | normg) >> 31); - if (norm >= 16823) { - continue; - } - - /* - * We compute the orthogonalized vector norm. - */ - rt1 = (fpr *)tmp; - rt2 = rt1 + n; - rt3 = rt2 + n; - poly_small_to_fp(rt1, f, logn); - poly_small_to_fp(rt2, g, logn); - Zf(FFT)(rt1, logn); - Zf(FFT)(rt2, logn); - Zf(poly_invnorm2_fft)(rt3, rt1, rt2, logn); - Zf(poly_adj_fft)(rt1, logn); - Zf(poly_adj_fft)(rt2, logn); - Zf(poly_mulconst)(rt1, fpr_q, logn); - Zf(poly_mulconst)(rt2, fpr_q, logn); - Zf(poly_mul_autoadj_fft)(rt1, rt3, logn); - Zf(poly_mul_autoadj_fft)(rt2, rt3, logn); - Zf(iFFT)(rt1, logn); - Zf(iFFT)(rt2, logn); - bnorm = fpr_zero; - for (u = 0; u < n; u ++) { - bnorm = fpr_add(bnorm, fpr_sqr(rt1[u])); - bnorm = fpr_add(bnorm, fpr_sqr(rt2[u])); - } - if (!fpr_lt(bnorm, fpr_bnorm_max)) { - continue; - } - - /* - * Compute public key h = g/f mod X^N+1 mod q. If this - * fails, we must restart. - */ - if (h == NULL) { - h2 = (uint16_t *)tmp; - tmp2 = h2 + n; - } else { - h2 = h; - tmp2 = (uint16_t *)tmp; - } - if (!Zf(compute_public)(h2, f, g, logn, (uint8_t *)tmp2)) { - continue; - } - - /* - * Solve the NTRU equation to get F and G. - */ - lim = (1 << (Zf(max_FG_bits)[logn] - 1)) - 1; - if (!solve_NTRU(logn, F, G, f, g, lim, (uint32_t *)tmp)) { - continue; - } - - /* - * Key pair is generated. 
- */ - break; - } -} diff --git a/crypto_sign/falcon-1024/m4-ct/pqm4.c b/crypto_sign/falcon-1024/m4-ct/pqm4.c deleted file mode 100644 index 74b83a8b..00000000 --- a/crypto_sign/falcon-1024/m4-ct/pqm4.c +++ /dev/null @@ -1,347 +0,0 @@ -#include -#include - -#include "api.h" -#include "inner.h" -#include "randombytes.h" - -/* ==================================================================== */ - -/* - * Falcon degree is N = 2^LOGN, where LOGN=9 (for Falcon-512) or 10 - * (for Falcon-1024). We use the advertised public key size to know - * which degree is used. - */ -#if CRYPTO_PUBLICKEYBYTES == 897 -#define LOGN 9 -#elif CRYPTO_PUBLICKEYBYTES == 1793 -#define LOGN 10 -#else -#error Unknown Falcon degree (unexpected public key size) -#endif - -#define N ((size_t)1 << LOGN) -#define NONCELEN 40 -#define SEEDLEN 48 - -/* - * If the private key length is larger than 10000, then this is the - * variant with precomputed expanded keys. - */ -#if CRYPTO_SECRETKEYBYTES > 10000 -#define KG_EXPAND 1 -#else -#define KG_EXPAND 0 -#endif - -/* - * Common buffer, to avoid bulky stack allocation. The buffer sizes are - * all expressed in bytes, but the buffer must be suitably aligned for - * 64-bit integers and floating-point values. 
- * - * Required size (in bytes): - * - * With expanded key: - * keygen: 48*N + 6*N = 54*N - * sign: 48*N + 2*N = 50*N - * vrfy: 8*N - * - * Without expanded key: - * keygen: 28*N + 5*N = 33*N - * sign: 72*N + 6*N = 78*N - * vrfy: 8*N - */ -static union { -#if KG_EXPAND - uint8_t b[54 * N]; -#else - uint8_t b[78 * N]; -#endif - uint64_t dummy_u64; - fpr dummy_fp; -} tmp; - -int -crypto_sign_keypair(unsigned char *pk, unsigned char *sk) -{ - int8_t *f, *g, *F, *G; - uint16_t *h; - inner_shake256_context rng; - unsigned char seed[SEEDLEN]; -#if KG_EXPAND - size_t v; -#else - size_t u, v; -#endif - unsigned sav_cw; - -#if KG_EXPAND - f = (int8_t *)&tmp.b[48 * N]; - g = f + N; - F = g + N; - G = F + N; - h = (uint16_t *)(G + N); -#else - f = (int8_t *)&tmp.b[28 * N]; - g = f + N; - F = g + N; - G = NULL; - h = (uint16_t *)(F + N); -#endif - - randombytes(seed, SEEDLEN); - inner_shake256_init(&rng); - inner_shake256_inject(&rng, seed, SEEDLEN); - inner_shake256_flip(&rng); - sav_cw = set_fpu_cw(2); - Zf(keygen)(&rng, f, g, F, G, h, LOGN, tmp.b); - -#if KG_EXPAND - /* - * Expand private key. - */ - Zf(expand_privkey)((fpr *)sk, f, g, F, G, LOGN, tmp.b); - set_fpu_cw(sav_cw); -#else - set_fpu_cw(sav_cw); - - /* - * Encode private key. - */ - sk[0] = 0x50 + LOGN; - u = 1; - v = Zf(trim_i8_encode)(sk + u, CRYPTO_SECRETKEYBYTES - u, - f, LOGN, Zf(max_fg_bits)[LOGN]); - if (v == 0) { - return -1; - } - u += v; - v = Zf(trim_i8_encode)(sk + u, CRYPTO_SECRETKEYBYTES - u, - g, LOGN, Zf(max_fg_bits)[LOGN]); - if (v == 0) { - return -1; - } - u += v; - v = Zf(trim_i8_encode)(sk + u, CRYPTO_SECRETKEYBYTES - u, - F, LOGN, Zf(max_FG_bits)[LOGN]); - if (v == 0) { - return -1; - } - u += v; - if (u != CRYPTO_SECRETKEYBYTES) { - return -1; - } -#endif - - /* - * Encode public key. 
- */ - pk[0] = 0x00 + LOGN; - v = Zf(modq_encode)(pk + 1, CRYPTO_PUBLICKEYBYTES - 1, h, LOGN); - if (v != CRYPTO_PUBLICKEYBYTES - 1) { - return -1; - } - - return 0; -} - -int -crypto_sign(unsigned char *sm, size_t *smlen, - const unsigned char *m, size_t mlen, - const unsigned char *sk) -{ -#if KG_EXPAND - const fpr *expanded_key; -#else - int8_t *f, *g, *F, *G; - size_t u, v; -#endif - int16_t *sig; - uint16_t *hm; - unsigned char seed[SEEDLEN], nonce[NONCELEN]; - unsigned char *esig; - inner_shake256_context sc; - size_t sig_len; - unsigned sav_cw; - -#if KG_EXPAND - sig = (int16_t *)&tmp.b[48 * N]; -#else - f = (int8_t *)&tmp.b[72 * N]; - g = f + N; - F = g + N; - G = F + N; - sig = (int16_t *)(G + N); -#endif - hm = (uint16_t *)sig; /* hm[] is shared with sig[] */ - esig = (unsigned char *)tmp.b; - -#if KG_EXPAND - /* - * Expanded key is provided "as is". - */ - expanded_key = (const fpr *)sk; -#else - /* - * Decode the private key. - */ - if (sk[0] != 0x50 + LOGN) { - return -1; - } - u = 1; - v = Zf(trim_i8_decode)(f, LOGN, Zf(max_fg_bits)[LOGN], - sk + u, CRYPTO_SECRETKEYBYTES - u); - if (v == 0) { - return -1; - } - u += v; - v = Zf(trim_i8_decode)(g, LOGN, Zf(max_fg_bits)[LOGN], - sk + u, CRYPTO_SECRETKEYBYTES - u); - if (v == 0) { - return -1; - } - u += v; - v = Zf(trim_i8_decode)(F, LOGN, Zf(max_FG_bits)[LOGN], - sk + u, CRYPTO_SECRETKEYBYTES - u); - if (v == 0) { - return -1; - } - u += v; - if (u != CRYPTO_SECRETKEYBYTES) { - return -1; - } - if (!Zf(complete_private)(G, f, g, F, LOGN, tmp.b)) { - return -1; - } -#endif - - /* - * Create a random nonce (40 bytes). - */ - randombytes(nonce, NONCELEN); - - /* - * Hash message nonce + message into a vector. - */ - inner_shake256_init(&sc); - inner_shake256_inject(&sc, nonce, NONCELEN); - inner_shake256_inject(&sc, m, mlen); - inner_shake256_flip(&sc); - Zf(hash_to_point_vartime)(&sc, hm, LOGN); - - /* - * Initialize a RNG. 
- */ - randombytes(seed, SEEDLEN); - inner_shake256_init(&sc); - inner_shake256_inject(&sc, seed, SEEDLEN); - inner_shake256_flip(&sc); - - /* - * Compute the signature. - */ - sav_cw = set_fpu_cw(2); -#if KG_EXPAND - Zf(sign_tree)(sig, &sc, expanded_key, hm, LOGN, tmp.b); -#else - Zf(sign_dyn)(sig, &sc, f, g, F, G, hm, LOGN, tmp.b); -#endif - set_fpu_cw(sav_cw); - - /* - * Encode the signature and bundle it with the message. Format is: - * signature length 2 bytes, big-endian - * nonce 40 bytes - * message mlen bytes - * signature slen bytes - */ - esig[0] = 0x20 + LOGN; - sig_len = Zf(comp_encode)(esig + 1, CRYPTO_BYTES - 1, sig, LOGN); - if (sig_len == 0) { - return -1; - } - sig_len ++; - memmove(sm + 2 + NONCELEN, m, mlen); - sm[0] = (unsigned char)(sig_len >> 8); - sm[1] = (unsigned char)sig_len; - memcpy(sm + 2, nonce, NONCELEN); - memcpy(sm + 2 + NONCELEN + mlen, esig, sig_len); - *smlen = 2 + NONCELEN + mlen + sig_len; - return 0; -} - -int -crypto_sign_open(unsigned char *m, size_t *mlen, - const unsigned char *sm, size_t smlen, - const unsigned char *pk) -{ - uint16_t *h, *hm; - int16_t *sig; - const unsigned char *esig; - inner_shake256_context sc; - size_t sig_len, msg_len; - - h = (uint16_t *)&tmp.b[2 * N]; - hm = h + N; - sig = (int16_t *)(hm + N); - - /* - * Decode public key. - */ - if (pk[0] != 0x00 + LOGN) { - return -1; - } - if (Zf(modq_decode)(h, LOGN, pk + 1, CRYPTO_PUBLICKEYBYTES - 1) - != CRYPTO_PUBLICKEYBYTES - 1) - { - return -1; - } - Zf(to_ntt_monty)(h, LOGN); - - /* - * Find nonce, signature, message length. - */ - if (smlen < 2 + NONCELEN) { - return -1; - } - sig_len = ((size_t)sm[0] << 8) | (size_t)sm[1]; - if (sig_len > (smlen - 2 - NONCELEN)) { - return -1; - } - msg_len = smlen - 2 - NONCELEN - sig_len; - - /* - * Decode signature. 
- */ - esig = sm + 2 + NONCELEN + msg_len; - if (sig_len < 1 || esig[0] != 0x20 + LOGN) { - return -1; - } - if (Zf(comp_decode)(sig, LOGN, - esig + 1, sig_len - 1) != sig_len - 1) - { - return -1; - } - - /* - * Hash nonce + message into a vector. - */ - inner_shake256_init(&sc); - inner_shake256_inject(&sc, sm + 2, NONCELEN + msg_len); - inner_shake256_flip(&sc); - Zf(hash_to_point_vartime)(&sc, hm, LOGN); - - /* - * Verify signature. - */ - if (!Zf(verify_raw)(hm, sig, h, LOGN, tmp.b)) { - return -1; - } - - /* - * Return plaintext. - */ - memmove(m, sm + 2 + NONCELEN, msg_len); - *mlen = msg_len; - return 0; -} diff --git a/crypto_sign/falcon-1024/m4-ct/rng.c b/crypto_sign/falcon-1024/m4-ct/rng.c deleted file mode 100644 index d2ecb7af..00000000 --- a/crypto_sign/falcon-1024/m4-ct/rng.c +++ /dev/null @@ -1,379 +0,0 @@ -/* - * PRNG and interface to the system RNG. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include - -#include "inner.h" - -// yyyNIST+0 yyyPQCLEAN+0 -/* - * Include relevant system header files. For Win32, this will also need - * linking with advapi32.dll, which we trigger with an appropriate #pragma. - */ -#if FALCON_RAND_GETENTROPY -#include -#endif -#if FALCON_RAND_URANDOM -#include -#if !FALCON_RAND_GETENTROPY -#include -#endif -#include -#include -#endif -#if FALCON_RAND_WIN32 -#include -#include -#pragma comment(lib, "advapi32") -#endif - -/* see inner.h */ -int -Zf(get_seed)(void *seed, size_t len) -{ - (void)seed; - if (len == 0) { - return 1; - } -#if FALCON_RAND_GETENTROPY - if (getentropy(seed, len) == 0) { - return 1; - } -#endif -#if FALCON_RAND_URANDOM - { - int f; - - f = open("/dev/urandom", O_RDONLY); - if (f >= 0) { - while (len > 0) { - ssize_t rlen; - - rlen = read(f, seed, len); - if (rlen < 0) { - if (errno == EINTR) { - continue; - } - break; - } - seed = (uint8_t *)seed + rlen; - len -= (size_t)rlen; - } - close(f); - if (len == 0) { - return 1; - } - } - } -#endif -#if FALCON_RAND_WIN32 - { - HCRYPTPROV hp; - - if (CryptAcquireContext(&hp, 0, 0, PROV_RSA_FULL, - CRYPT_VERIFYCONTEXT | CRYPT_SILENT)) - { - BOOL r; - - r = CryptGenRandom(hp, (DWORD)len, seed); - CryptReleaseContext(hp, 0); - if (r) { - return 1; - } - } - } -#endif - return 0; -} -// yyyNIST- yyyPQCLEAN- - -/* see inner.h */ -void -Zf(prng_init)(prng *p, inner_shake256_context *src) -{ -#if FALCON_LE // yyyLE+1 - inner_shake256_extract(src, p->state.d, 56); -#else // yyyLE+0 - /* - * To ensure reproducibility for a given seed, we - * must enforce little-endian interpretation of - 
* the state words. - */ - uint8_t tmp[56]; - uint64_t th, tl; - int i; - - inner_shake256_extract(src, tmp, 56); - for (i = 0; i < 14; i ++) { - uint32_t w; - - w = (uint32_t)tmp[(i << 2) + 0] - | ((uint32_t)tmp[(i << 2) + 1] << 8) - | ((uint32_t)tmp[(i << 2) + 2] << 16) - | ((uint32_t)tmp[(i << 2) + 3] << 24); - *(uint32_t *)(p->state.d + (i << 2)) = w; - } - tl = *(uint32_t *)(p->state.d + 48); - th = *(uint32_t *)(p->state.d + 52); - *(uint64_t *)(p->state.d + 48) = tl + (th << 32); -#endif // yyyLE- - Zf(prng_refill)(p); -} - -/* - * PRNG based on ChaCha20. - * - * State consists in key (32 bytes) then IV (16 bytes) and block counter - * (8 bytes). Normally, we should not care about local endianness (this - * is for a PRNG), but for the NIST competition we need reproducible KAT - * vectors that work across architectures, so we enforce little-endian - * interpretation where applicable. Moreover, output words are "spread - * out" over the output buffer with the interleaving pattern that is - * naturally obtained from the AVX2 implementation that runs eight - * ChaCha20 instances in parallel. - * - * The block counter is XORed into the first 8 bytes of the IV. - */ -TARGET_AVX2 -void -Zf(prng_refill)(prng *p) -{ -#if FALCON_AVX2 // yyyAVX2+1 - - static const uint32_t CW[] = { - 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 - }; - - uint64_t cc; - size_t u; - int i; - uint32_t *sw; - union { - uint32_t w[16]; - __m256i y[2]; /* for alignment */ - } t; - __m256i state[16], init[16]; - - sw = (uint32_t *)p->state.d; - - /* - * XOR next counter values into state. - */ - cc = *(uint64_t *)(p->state.d + 48); - for (u = 0; u < 8; u ++) { - t.w[u] = (uint32_t)(cc + u); - t.w[u + 8] = (uint32_t)((cc + u) >> 32); - } - *(uint64_t *)(p->state.d + 48) = cc + 8; - - /* - * Load state. 
- */ - for (u = 0; u < 4; u ++) { - state[u] = init[u] = - _mm256_broadcastd_epi32(_mm_cvtsi32_si128(CW[u])); - } - for (u = 0; u < 10; u ++) { - state[u + 4] = init[u + 4] = - _mm256_broadcastd_epi32(_mm_cvtsi32_si128(sw[u])); - } - state[14] = init[14] = _mm256_xor_si256( - _mm256_broadcastd_epi32(_mm_cvtsi32_si128(sw[10])), - _mm256_loadu_si256((__m256i *)&t.w[0])); - state[15] = init[15] = _mm256_xor_si256( - _mm256_broadcastd_epi32(_mm_cvtsi32_si128(sw[11])), - _mm256_loadu_si256((__m256i *)&t.w[8])); - - /* - * Do all rounds. - */ - for (i = 0; i < 10; i ++) { - -#define QROUND(a, b, c, d) do { \ - state[a] = _mm256_add_epi32(state[a], state[b]); \ - state[d] = _mm256_xor_si256(state[d], state[a]); \ - state[d] = _mm256_or_si256( \ - _mm256_slli_epi32(state[d], 16), \ - _mm256_srli_epi32(state[d], 16)); \ - state[c] = _mm256_add_epi32(state[c], state[d]); \ - state[b] = _mm256_xor_si256(state[b], state[c]); \ - state[b] = _mm256_or_si256( \ - _mm256_slli_epi32(state[b], 12), \ - _mm256_srli_epi32(state[b], 20)); \ - state[a] = _mm256_add_epi32(state[a], state[b]); \ - state[d] = _mm256_xor_si256(state[d], state[a]); \ - state[d] = _mm256_or_si256( \ - _mm256_slli_epi32(state[d], 8), \ - _mm256_srli_epi32(state[d], 24)); \ - state[c] = _mm256_add_epi32(state[c], state[d]); \ - state[b] = _mm256_xor_si256(state[b], state[c]); \ - state[b] = _mm256_or_si256( \ - _mm256_slli_epi32(state[b], 7), \ - _mm256_srli_epi32(state[b], 25)); \ - } while (0) - - QROUND( 0, 4, 8, 12); - QROUND( 1, 5, 9, 13); - QROUND( 2, 6, 10, 14); - QROUND( 3, 7, 11, 15); - QROUND( 0, 5, 10, 15); - QROUND( 1, 6, 11, 12); - QROUND( 2, 7, 8, 13); - QROUND( 3, 4, 9, 14); - -#undef QROUND - - } - - /* - * Add initial state back and encode the result in the destination - * buffer. We can dump the AVX2 values "as is" because the non-AVX2 - * code uses a compatible order of values. 
- */ - for (u = 0; u < 16; u ++) { - _mm256_storeu_si256((__m256i *)&p->buf.d[u << 5], - _mm256_add_epi32(state[u], init[u])); - } - -#else // yyyAVX2+0 - - static const uint32_t CW[] = { - 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 - }; - - uint64_t cc; - size_t u; - - /* - * State uses local endianness. Only the output bytes must be - * converted to little endian (if used on a big-endian machine). - */ - cc = *(uint64_t *)(p->state.d + 48); - for (u = 0; u < 8; u ++) { - uint32_t state[16]; - size_t v; - int i; - - memcpy(&state[0], CW, sizeof CW); - memcpy(&state[4], p->state.d, 48); - state[14] ^= (uint32_t)cc; - state[15] ^= (uint32_t)(cc >> 32); - for (i = 0; i < 10; i ++) { - -#define QROUND(a, b, c, d) do { \ - state[a] += state[b]; \ - state[d] ^= state[a]; \ - state[d] = (state[d] << 16) | (state[d] >> 16); \ - state[c] += state[d]; \ - state[b] ^= state[c]; \ - state[b] = (state[b] << 12) | (state[b] >> 20); \ - state[a] += state[b]; \ - state[d] ^= state[a]; \ - state[d] = (state[d] << 8) | (state[d] >> 24); \ - state[c] += state[d]; \ - state[b] ^= state[c]; \ - state[b] = (state[b] << 7) | (state[b] >> 25); \ - } while (0) - - QROUND( 0, 4, 8, 12); - QROUND( 1, 5, 9, 13); - QROUND( 2, 6, 10, 14); - QROUND( 3, 7, 11, 15); - QROUND( 0, 5, 10, 15); - QROUND( 1, 6, 11, 12); - QROUND( 2, 7, 8, 13); - QROUND( 3, 4, 9, 14); - -#undef QROUND - - } - - for (v = 0; v < 4; v ++) { - state[v] += CW[v]; - } - for (v = 4; v < 14; v ++) { - state[v] += ((uint32_t *)p->state.d)[v - 4]; - } - state[14] += ((uint32_t *)p->state.d)[10] - ^ (uint32_t)cc; - state[15] += ((uint32_t *)p->state.d)[11] - ^ (uint32_t)(cc >> 32); - cc ++; - - /* - * We mimic the interleaving that is used in the AVX2 - * implementation. 
- */ - for (v = 0; v < 16; v ++) { -#if FALCON_LE // yyyLE+1 - ((uint32_t *)p->buf.d)[u + (v << 3)] = state[v]; -#else // yyyLE+0 - p->buf.d[(u << 2) + (v << 5) + 0] = - (uint8_t)state[v]; - p->buf.d[(u << 2) + (v << 5) + 1] = - (uint8_t)(state[v] >> 8); - p->buf.d[(u << 2) + (v << 5) + 2] = - (uint8_t)(state[v] >> 16); - p->buf.d[(u << 2) + (v << 5) + 3] = - (uint8_t)(state[v] >> 24); -#endif // yyyLE- - } - } - *(uint64_t *)(p->state.d + 48) = cc; - -#endif // yyyAVX2- - - p->ptr = 0; -} - -/* see inner.h */ -void -Zf(prng_get_bytes)(prng *p, void *dst, size_t len) -{ - uint8_t *buf; - - buf = dst; - while (len > 0) { - size_t clen; - - clen = (sizeof p->buf.d) - p->ptr; - if (clen > len) { - clen = len; - } - memcpy(buf, p->buf.d, clen); - buf += clen; - len -= clen; - p->ptr += clen; - if (p->ptr == sizeof p->buf.d) { - Zf(prng_refill)(p); - } - } -} diff --git a/crypto_sign/falcon-1024/m4-ct/sign.c b/crypto_sign/falcon-1024/m4-ct/sign.c deleted file mode 100644 index 752fb8ba..00000000 --- a/crypto_sign/falcon-1024/m4-ct/sign.c +++ /dev/null @@ -1,1532 +0,0 @@ -/* - * Falcon signature generation. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include "inner.h" - -/* =================================================================== */ - -/* - * Compute degree N from logarithm 'logn'. - */ -#define MKN(logn) ((size_t)1 << (logn)) - -/* =================================================================== */ -/* - * Binary case: - * N = 2^logn - * phi = X^N+1 - */ - -/* - * Get the size of the LDL tree for an input with polynomials of size - * 2^logn. The size is expressed in the number of elements. - */ -static inline unsigned -ffLDL_treesize(unsigned logn) -{ - /* - * For logn = 0 (polynomials are constant), the "tree" is a - * single element. Otherwise, the tree node has size 2^logn, and - * has two child trees for size logn-1 each. Thus, treesize s() - * must fulfill these two relations: - * - * s(0) = 1 - * s(logn) = (2^logn) + 2*s(logn-1) - */ - return (logn + 1) << logn; -} - -/* - * Inner function for ffLDL_fft(). It expects the matrix to be both - * auto-adjoint and quasicyclic; also, it uses the source operands - * as modifiable temporaries. - * - * tmp[] must have room for at least one polynomial. 
- */ -static void -ffLDL_fft_inner(fpr *restrict tree, - fpr *restrict g0, fpr *restrict g1, unsigned logn, fpr *restrict tmp) -{ - size_t n, hn; - - n = MKN(logn); - if (n == 1) { - tree[0] = g0[0]; - return; - } - hn = n >> 1; - - /* - * The LDL decomposition yields L (which is written in the tree) - * and the diagonal of D. Since d00 = g0, we just write d11 - * into tmp. - */ - Zf(poly_LDLmv_fft)(tmp, tree, g0, g1, g0, logn); - - /* - * Split d00 (currently in g0) and d11 (currently in tmp). We - * reuse g0 and g1 as temporary storage spaces: - * d00 splits into g1, g1+hn - * d11 splits into g0, g0+hn - */ - Zf(poly_split_fft)(g1, g1 + hn, g0, logn); - Zf(poly_split_fft)(g0, g0 + hn, tmp, logn); - - /* - * Each split result is the first row of a new auto-adjoint - * quasicyclic matrix for the next recursive step. - */ - ffLDL_fft_inner(tree + n, - g1, g1 + hn, logn - 1, tmp); - ffLDL_fft_inner(tree + n + ffLDL_treesize(logn - 1), - g0, g0 + hn, logn - 1, tmp); -} - -/* - * Compute the ffLDL tree of an auto-adjoint matrix G. The matrix - * is provided as three polynomials (FFT representation). - * - * The "tree" array is filled with the computed tree, of size - * (logn+1)*(2^logn) elements (see ffLDL_treesize()). - * - * Input arrays MUST NOT overlap, except possibly the three unmodified - * arrays g00, g01 and g11. tmp[] should have room for at least three - * polynomials of 2^logn elements each. 
- */ -static void -ffLDL_fft(fpr *restrict tree, const fpr *restrict g00, - const fpr *restrict g01, const fpr *restrict g11, - unsigned logn, fpr *restrict tmp) -{ - size_t n, hn; - fpr *d00, *d11; - - n = MKN(logn); - if (n == 1) { - tree[0] = g00[0]; - return; - } - hn = n >> 1; - d00 = tmp; - d11 = tmp + n; - tmp += n << 1; - - memcpy(d00, g00, n * sizeof *g00); - Zf(poly_LDLmv_fft)(d11, tree, g00, g01, g11, logn); - - Zf(poly_split_fft)(tmp, tmp + hn, d00, logn); - Zf(poly_split_fft)(d00, d00 + hn, d11, logn); - memcpy(d11, tmp, n * sizeof *tmp); - ffLDL_fft_inner(tree + n, - d11, d11 + hn, logn - 1, tmp); - ffLDL_fft_inner(tree + n + ffLDL_treesize(logn - 1), - d00, d00 + hn, logn - 1, tmp); -} - -/* - * Normalize an ffLDL tree: each leaf of value x is replaced with - * sigma / sqrt(x). - */ -static void -ffLDL_binary_normalize(fpr *tree, unsigned logn) -{ - /* - * TODO: make an iterative version. - */ - size_t n; - - n = MKN(logn); - if (n == 1) { - /* - * We actually store in the tree leaf the inverse of - * the value mandated by the specification: this - * saves a division both here and in the sampler. - */ - tree[0] = fpr_mul(fpr_sqrt(tree[0]), fpr_inv_sigma); - } else { - ffLDL_binary_normalize(tree + n, logn - 1); - ffLDL_binary_normalize(tree + n + ffLDL_treesize(logn - 1), - logn - 1); - } -} - -/* =================================================================== */ - -/* - * Convert an integer polynomial (with small values) into the - * representation with complex numbers. 
- */ -static void -smallints_to_fpr(fpr *r, const int8_t *t, unsigned logn) -{ - size_t n, u; - - n = MKN(logn); - for (u = 0; u < n; u ++) { - r[u] = fpr_of(t[u]); - } -} - -/* - * The expanded private key contains: - * - The B0 matrix (four elements) - * - The ffLDL tree - */ - -static inline size_t -skoff_b00(unsigned logn) -{ - (void)logn; - return 0; -} - -static inline size_t -skoff_b01(unsigned logn) -{ - return MKN(logn); -} - -static inline size_t -skoff_b10(unsigned logn) -{ - return 2 * MKN(logn); -} - -static inline size_t -skoff_b11(unsigned logn) -{ - return 3 * MKN(logn); -} - -static inline size_t -skoff_tree(unsigned logn) -{ - return 4 * MKN(logn); -} - -/* see inner.h */ -void -Zf(expand_privkey)(fpr *restrict expanded_key, - const int8_t *f, const int8_t *g, - const int8_t *F, const int8_t *G, - unsigned logn, uint8_t *restrict tmp) -{ - size_t n; - fpr *rf, *rg, *rF, *rG; - fpr *b00, *b01, *b10, *b11; - fpr *g00, *g01, *g11, *gxx; - fpr *tree; - - n = MKN(logn); - b00 = expanded_key + skoff_b00(logn); - b01 = expanded_key + skoff_b01(logn); - b10 = expanded_key + skoff_b10(logn); - b11 = expanded_key + skoff_b11(logn); - tree = expanded_key + skoff_tree(logn); - - /* - * We load the private key elements directly into the B0 matrix, - * since B0 = [[g, -f], [G, -F]]. - */ - rf = b01; - rg = b00; - rF = b11; - rG = b10; - - smallints_to_fpr(rf, f, logn); - smallints_to_fpr(rg, g, logn); - smallints_to_fpr(rF, F, logn); - smallints_to_fpr(rG, G, logn); - - /* - * Compute the FFT for the key elements, and negate f and F. - */ - Zf(FFT)(rf, logn); - Zf(FFT)(rg, logn); - Zf(FFT)(rF, logn); - Zf(FFT)(rG, logn); - Zf(poly_neg)(rf, logn); - Zf(poly_neg)(rF, logn); - - /* - * The Gram matrix is G = B·B*. 
Formulas are: - * g00 = b00*adj(b00) + b01*adj(b01) - * g01 = b00*adj(b10) + b01*adj(b11) - * g10 = b10*adj(b00) + b11*adj(b01) - * g11 = b10*adj(b10) + b11*adj(b11) - * - * For historical reasons, this implementation uses - * g00, g01 and g11 (upper triangle). - */ - g00 = (fpr *)tmp; - g01 = g00 + n; - g11 = g01 + n; - gxx = g11 + n; - - memcpy(g00, b00, n * sizeof *b00); - Zf(poly_mulselfadj_fft)(g00, logn); - memcpy(gxx, b01, n * sizeof *b01); - Zf(poly_mulselfadj_fft)(gxx, logn); - Zf(poly_add)(g00, gxx, logn); - - memcpy(g01, b00, n * sizeof *b00); - Zf(poly_muladj_fft)(g01, b10, logn); - memcpy(gxx, b01, n * sizeof *b01); - Zf(poly_muladj_fft)(gxx, b11, logn); - Zf(poly_add)(g01, gxx, logn); - - memcpy(g11, b10, n * sizeof *b10); - Zf(poly_mulselfadj_fft)(g11, logn); - memcpy(gxx, b11, n * sizeof *b11); - Zf(poly_mulselfadj_fft)(gxx, logn); - Zf(poly_add)(g11, gxx, logn); - - /* - * Compute the Falcon tree. - */ - ffLDL_fft(tree, g00, g01, g11, logn, gxx); - - /* - * Normalize tree. - */ - ffLDL_binary_normalize(tree, logn); -} - -typedef int (*samplerZ)(void *ctx, fpr mu, fpr sigma); - -/* - * Perform Fast Fourier Sampling for target vector t. The Gram matrix - * is provided (G = [[g00, g01], [adj(g01), g11]]). The sampled vector - * is written over (t0,t1). The Gram matrix is modified as well. The - * tmp[] buffer must have room for four polynomials. - */ -TARGET_AVX2 -static void -ffSampling_fft_dyntree(samplerZ samp, void *samp_ctx, - fpr *restrict t0, fpr *restrict t1, - fpr *restrict g00, fpr *restrict g01, fpr *restrict g11, - unsigned logn, fpr *restrict tmp) -{ - size_t n, hn; - fpr *z0, *z1; - - /* - * Deepest level: the LDL tree leaf value is just g00 (the - * array has length only 1 at this point); we normalize it - * with regards to sigma, then use it for sampling. 
- */ - if (logn == 0) { - fpr leaf; - - leaf = g00[0]; - leaf = fpr_mul(fpr_sqrt(leaf), fpr_inv_sigma); - t0[0] = fpr_of(samp(samp_ctx, t0[0], leaf)); - t1[0] = fpr_of(samp(samp_ctx, t1[0], leaf)); - return; - } - - n = (size_t)1 << logn; - hn = n >> 1; - - /* - * Decompose G into LDL. We only need d00 (identical to g00), - * d11, and l10; we do that in place. - */ - Zf(poly_LDL_fft)(g00, g01, g11, logn); - - /* - * Split d00 and d11 and expand them into half-size quasi-cyclic - * Gram matrices. We also save l10 in tmp[]. - */ - Zf(poly_split_fft)(tmp, tmp + hn, g00, logn); - memcpy(g00, tmp, n * sizeof *tmp); - Zf(poly_split_fft)(tmp, tmp + hn, g11, logn); - memcpy(g11, tmp, n * sizeof *tmp); - memcpy(tmp, g01, n * sizeof *g01); - memcpy(g01, g00, hn * sizeof *g00); - memcpy(g01 + hn, g11, hn * sizeof *g00); - - /* - * The half-size Gram matrices for the recursive LDL tree - * building are now: - * - left sub-tree: g00, g00+hn, g01 - * - right sub-tree: g11, g11+hn, g01+hn - * l10 is in tmp[]. - */ - - /* - * We split t1 and use the first recursive call on the two - * halves, using the right sub-tree. The result is merged - * back into tmp + 2*n. - */ - z1 = tmp + n; - Zf(poly_split_fft)(z1, z1 + hn, t1, logn); - ffSampling_fft_dyntree(samp, samp_ctx, z1, z1 + hn, - g11, g11 + hn, g01 + hn, logn - 1, z1 + n); - Zf(poly_merge_fft)(tmp + (n << 1), z1, z1 + hn, logn); - - /* - * Compute tb0 = t0 + (t1 - z1) * l10. - * At that point, l10 is in tmp, t1 is unmodified, and z1 is - * in tmp + (n << 1). The buffer in z1 is free. - * - * In the end, z1 is written over t1, and tb0 is in t0. - */ - memcpy(z1, t1, n * sizeof *t1); - Zf(poly_sub)(z1, tmp + (n << 1), logn); - memcpy(t1, tmp + (n << 1), n * sizeof *tmp); - Zf(poly_mul_fft)(tmp, z1, logn); - Zf(poly_add)(t0, tmp, logn); - - /* - * Second recursive invocation, on the split tb0 (currently in t0) - * and the left sub-tree. 
- */ - z0 = tmp; - Zf(poly_split_fft)(z0, z0 + hn, t0, logn); - ffSampling_fft_dyntree(samp, samp_ctx, z0, z0 + hn, - g00, g00 + hn, g01, logn - 1, z0 + n); - Zf(poly_merge_fft)(t0, z0, z0 + hn, logn); -} - -/* - * Perform Fast Fourier Sampling for target vector t and LDL tree T. - * tmp[] must have size for at least two polynomials of size 2^logn. - */ -TARGET_AVX2 -static void -ffSampling_fft(samplerZ samp, void *samp_ctx, - fpr *restrict z0, fpr *restrict z1, - const fpr *restrict tree, - const fpr *restrict t0, const fpr *restrict t1, unsigned logn, - fpr *restrict tmp) -{ - size_t n, hn; - const fpr *tree0, *tree1; - - /* - * When logn == 2, we inline the last two recursion levels. - */ - if (logn == 2) { -#if FALCON_AVX2 // yyyAVX2+1 - fpr w0, w1, w2, w3, sigma; - __m128d ww0, ww1, wa, wb, wc, wd; - __m128d wy0, wy1, wz0, wz1; - __m128d half, invsqrt8, invsqrt2, neghi, neglo; - int si0, si1, si2, si3; - - tree0 = tree + 4; - tree1 = tree + 8; - - half = _mm_set1_pd(0.5); - invsqrt8 = _mm_set1_pd(0.353553390593273762200422181052); - invsqrt2 = _mm_set1_pd(0.707106781186547524400844362105); - neghi = _mm_set_pd(-0.0, 0.0); - neglo = _mm_set_pd(0.0, -0.0); - - /* - * We split t1 into w*, then do the recursive invocation, - * with output in w*. We finally merge back into z1. 
- */ - ww0 = _mm_loadu_pd(&t1[0].v); - ww1 = _mm_loadu_pd(&t1[2].v); - wa = _mm_unpacklo_pd(ww0, ww1); - wb = _mm_unpackhi_pd(ww0, ww1); - wc = _mm_add_pd(wa, wb); - ww0 = _mm_mul_pd(wc, half); - wc = _mm_sub_pd(wa, wb); - wd = _mm_xor_pd(_mm_permute_pd(wc, 1), neghi); - ww1 = _mm_mul_pd(_mm_add_pd(wc, wd), invsqrt8); - - w2.v = _mm_cvtsd_f64(ww1); - w3.v = _mm_cvtsd_f64(_mm_permute_pd(ww1, 1)); - wa = ww1; - sigma = tree1[3]; - si2 = samp(samp_ctx, w2, sigma); - si3 = samp(samp_ctx, w3, sigma); - ww1 = _mm_set_pd((double)si3, (double)si2); - wa = _mm_sub_pd(wa, ww1); - wb = _mm_loadu_pd(&tree1[0].v); - wc = _mm_mul_pd(wa, wb); - wd = _mm_mul_pd(wa, _mm_permute_pd(wb, 1)); - wa = _mm_unpacklo_pd(wc, wd); - wb = _mm_unpackhi_pd(wc, wd); - ww0 = _mm_add_pd(ww0, _mm_add_pd(wa, _mm_xor_pd(wb, neglo))); - w0.v = _mm_cvtsd_f64(ww0); - w1.v = _mm_cvtsd_f64(_mm_permute_pd(ww0, 1)); - sigma = tree1[2]; - si0 = samp(samp_ctx, w0, sigma); - si1 = samp(samp_ctx, w1, sigma); - ww0 = _mm_set_pd((double)si1, (double)si0); - - wc = _mm_mul_pd( - _mm_set_pd((double)(si2 + si3), (double)(si2 - si3)), - invsqrt2); - wa = _mm_add_pd(ww0, wc); - wb = _mm_sub_pd(ww0, wc); - ww0 = _mm_unpacklo_pd(wa, wb); - ww1 = _mm_unpackhi_pd(wa, wb); - _mm_storeu_pd(&z1[0].v, ww0); - _mm_storeu_pd(&z1[2].v, ww1); - - /* - * Compute tb0 = t0 + (t1 - z1) * L. Value tb0 ends up in w*. - */ - wy0 = _mm_sub_pd(_mm_loadu_pd(&t1[0].v), ww0); - wy1 = _mm_sub_pd(_mm_loadu_pd(&t1[2].v), ww1); - wz0 = _mm_loadu_pd(&tree[0].v); - wz1 = _mm_loadu_pd(&tree[2].v); - ww0 = _mm_sub_pd(_mm_mul_pd(wy0, wz0), _mm_mul_pd(wy1, wz1)); - ww1 = _mm_add_pd(_mm_mul_pd(wy0, wz1), _mm_mul_pd(wy1, wz0)); - ww0 = _mm_add_pd(ww0, _mm_loadu_pd(&t0[0].v)); - ww1 = _mm_add_pd(ww1, _mm_loadu_pd(&t0[2].v)); - - /* - * Second recursive invocation. 
- */ - wa = _mm_unpacklo_pd(ww0, ww1); - wb = _mm_unpackhi_pd(ww0, ww1); - wc = _mm_add_pd(wa, wb); - ww0 = _mm_mul_pd(wc, half); - wc = _mm_sub_pd(wa, wb); - wd = _mm_xor_pd(_mm_permute_pd(wc, 1), neghi); - ww1 = _mm_mul_pd(_mm_add_pd(wc, wd), invsqrt8); - - w2.v = _mm_cvtsd_f64(ww1); - w3.v = _mm_cvtsd_f64(_mm_permute_pd(ww1, 1)); - wa = ww1; - sigma = tree0[3]; - si2 = samp(samp_ctx, w2, sigma); - si3 = samp(samp_ctx, w3, sigma); - ww1 = _mm_set_pd((double)si3, (double)si2); - wa = _mm_sub_pd(wa, ww1); - wb = _mm_loadu_pd(&tree0[0].v); - wc = _mm_mul_pd(wa, wb); - wd = _mm_mul_pd(wa, _mm_permute_pd(wb, 1)); - wa = _mm_unpacklo_pd(wc, wd); - wb = _mm_unpackhi_pd(wc, wd); - ww0 = _mm_add_pd(ww0, _mm_add_pd(wa, _mm_xor_pd(wb, neglo))); - w0.v = _mm_cvtsd_f64(ww0); - w1.v = _mm_cvtsd_f64(_mm_permute_pd(ww0, 1)); - sigma = tree0[2]; - si0 = samp(samp_ctx, w0, sigma); - si1 = samp(samp_ctx, w1, sigma); - ww0 = _mm_set_pd((double)si1, (double)si0); - - wc = _mm_mul_pd( - _mm_set_pd((double)(si2 + si3), (double)(si2 - si3)), - invsqrt2); - wa = _mm_add_pd(ww0, wc); - wb = _mm_sub_pd(ww0, wc); - ww0 = _mm_unpacklo_pd(wa, wb); - ww1 = _mm_unpackhi_pd(wa, wb); - _mm_storeu_pd(&z0[0].v, ww0); - _mm_storeu_pd(&z0[2].v, ww1); - - return; -#else // yyyAVX2+0 - fpr x0, x1, y0, y1, w0, w1, w2, w3, sigma; - fpr a_re, a_im, b_re, b_im, c_re, c_im; - - tree0 = tree + 4; - tree1 = tree + 8; - - /* - * We split t1 into w*, then do the recursive invocation, - * with output in w*. We finally merge back into z1. 
- */ - a_re = t1[0]; - a_im = t1[2]; - b_re = t1[1]; - b_im = t1[3]; - c_re = fpr_add(a_re, b_re); - c_im = fpr_add(a_im, b_im); - w0 = fpr_half(c_re); - w1 = fpr_half(c_im); - c_re = fpr_sub(a_re, b_re); - c_im = fpr_sub(a_im, b_im); - w2 = fpr_mul(fpr_add(c_re, c_im), fpr_invsqrt8); - w3 = fpr_mul(fpr_sub(c_im, c_re), fpr_invsqrt8); - - x0 = w2; - x1 = w3; - sigma = tree1[3]; - w2 = fpr_of(samp(samp_ctx, x0, sigma)); - w3 = fpr_of(samp(samp_ctx, x1, sigma)); - a_re = fpr_sub(x0, w2); - a_im = fpr_sub(x1, w3); - b_re = tree1[0]; - b_im = tree1[1]; - c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im)); - c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re)); - x0 = fpr_add(c_re, w0); - x1 = fpr_add(c_im, w1); - sigma = tree1[2]; - w0 = fpr_of(samp(samp_ctx, x0, sigma)); - w1 = fpr_of(samp(samp_ctx, x1, sigma)); - - a_re = w0; - a_im = w1; - b_re = w2; - b_im = w3; - c_re = fpr_mul(fpr_sub(b_re, b_im), fpr_invsqrt2); - c_im = fpr_mul(fpr_add(b_re, b_im), fpr_invsqrt2); - z1[0] = w0 = fpr_add(a_re, c_re); - z1[2] = w2 = fpr_add(a_im, c_im); - z1[1] = w1 = fpr_sub(a_re, c_re); - z1[3] = w3 = fpr_sub(a_im, c_im); - - /* - * Compute tb0 = t0 + (t1 - z1) * L. Value tb0 ends up in w*. - */ - w0 = fpr_sub(t1[0], w0); - w1 = fpr_sub(t1[1], w1); - w2 = fpr_sub(t1[2], w2); - w3 = fpr_sub(t1[3], w3); - - a_re = w0; - a_im = w2; - b_re = tree[0]; - b_im = tree[2]; - w0 = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im)); - w2 = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re)); - a_re = w1; - a_im = w3; - b_re = tree[1]; - b_im = tree[3]; - w1 = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im)); - w3 = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re)); - - w0 = fpr_add(w0, t0[0]); - w1 = fpr_add(w1, t0[1]); - w2 = fpr_add(w2, t0[2]); - w3 = fpr_add(w3, t0[3]); - - /* - * Second recursive invocation. 
- */ - a_re = w0; - a_im = w2; - b_re = w1; - b_im = w3; - c_re = fpr_add(a_re, b_re); - c_im = fpr_add(a_im, b_im); - w0 = fpr_half(c_re); - w1 = fpr_half(c_im); - c_re = fpr_sub(a_re, b_re); - c_im = fpr_sub(a_im, b_im); - w2 = fpr_mul(fpr_add(c_re, c_im), fpr_invsqrt8); - w3 = fpr_mul(fpr_sub(c_im, c_re), fpr_invsqrt8); - - x0 = w2; - x1 = w3; - sigma = tree0[3]; - w2 = y0 = fpr_of(samp(samp_ctx, x0, sigma)); - w3 = y1 = fpr_of(samp(samp_ctx, x1, sigma)); - a_re = fpr_sub(x0, y0); - a_im = fpr_sub(x1, y1); - b_re = tree0[0]; - b_im = tree0[1]; - c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im)); - c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re)); - x0 = fpr_add(c_re, w0); - x1 = fpr_add(c_im, w1); - sigma = tree0[2]; - w0 = fpr_of(samp(samp_ctx, x0, sigma)); - w1 = fpr_of(samp(samp_ctx, x1, sigma)); - - a_re = w0; - a_im = w1; - b_re = w2; - b_im = w3; - c_re = fpr_mul(fpr_sub(b_re, b_im), fpr_invsqrt2); - c_im = fpr_mul(fpr_add(b_re, b_im), fpr_invsqrt2); - z0[0] = fpr_add(a_re, c_re); - z0[2] = fpr_add(a_im, c_im); - z0[1] = fpr_sub(a_re, c_re); - z0[3] = fpr_sub(a_im, c_im); - - return; -#endif // yyyAVX2- - } - - /* - * Case logn == 1 is reachable only when using Falcon-2 (the - * smallest size for which Falcon is mathematically defined, but - * of course way too insecure to be of any use). 
- */ - if (logn == 1) { - fpr x0, x1, y0, y1, sigma; - fpr a_re, a_im, b_re, b_im, c_re, c_im; - - x0 = t1[0]; - x1 = t1[1]; - sigma = tree[3]; - z1[0] = y0 = fpr_of(samp(samp_ctx, x0, sigma)); - z1[1] = y1 = fpr_of(samp(samp_ctx, x1, sigma)); - a_re = fpr_sub(x0, y0); - a_im = fpr_sub(x1, y1); - b_re = tree[0]; - b_im = tree[1]; - c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im)); - c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re)); - x0 = fpr_add(c_re, t0[0]); - x1 = fpr_add(c_im, t0[1]); - sigma = tree[2]; - z0[0] = fpr_of(samp(samp_ctx, x0, sigma)); - z0[1] = fpr_of(samp(samp_ctx, x1, sigma)); - - return; - } - - /* - * Normal end of recursion is for logn == 0. Since the last - * steps of the recursions were inlined in the blocks above - * (when logn == 1 or 2), this case is not reachable, and is - * retained here only for documentation purposes. - - if (logn == 0) { - fpr x0, x1, sigma; - - x0 = t0[0]; - x1 = t1[0]; - sigma = tree[0]; - z0[0] = fpr_of(samp(samp_ctx, x0, sigma)); - z1[0] = fpr_of(samp(samp_ctx, x1, sigma)); - return; - } - - */ - - /* - * General recursive case (logn >= 3). - */ - - n = (size_t)1 << logn; - hn = n >> 1; - tree0 = tree + n; - tree1 = tree + n + ffLDL_treesize(logn - 1); - - /* - * We split t1 into z1 (reused as temporary storage), then do - * the recursive invocation, with output in tmp. We finally - * merge back into z1. - */ - Zf(poly_split_fft)(z1, z1 + hn, t1, logn); - ffSampling_fft(samp, samp_ctx, tmp, tmp + hn, - tree1, z1, z1 + hn, logn - 1, tmp + n); - Zf(poly_merge_fft)(z1, tmp, tmp + hn, logn); - - /* - * Compute tb0 = t0 + (t1 - z1) * L. Value tb0 ends up in tmp[]. - */ - memcpy(tmp, t1, n * sizeof *t1); - Zf(poly_sub)(tmp, z1, logn); - Zf(poly_mul_fft)(tmp, tree, logn); - Zf(poly_add)(tmp, t0, logn); - - /* - * Second recursive invocation. 
- */ - Zf(poly_split_fft)(z0, z0 + hn, tmp, logn); - ffSampling_fft(samp, samp_ctx, tmp, tmp + hn, - tree0, z0, z0 + hn, logn - 1, tmp + n); - Zf(poly_merge_fft)(z0, tmp, tmp + hn, logn); -} - -/* - * Compute a signature: the signature contains two vectors, s1 and s2. - * The s1 vector is not returned. The squared norm of (s1,s2) is - * computed, and if it is short enough, then s2 is returned into the - * s2[] buffer, and 1 is returned; otherwise, s2[] is untouched and 0 is - * returned; the caller should then try again. This function uses an - * expanded key. - * - * tmp[] must have room for at least six polynomials. - */ -static int -do_sign_tree(samplerZ samp, void *samp_ctx, int16_t *s2, - const fpr *restrict expanded_key, - const uint16_t *hm, - unsigned logn, fpr *restrict tmp) -{ - size_t n, u; - fpr *t0, *t1, *tx, *ty; - const fpr *b00, *b01, *b10, *b11, *tree; - fpr ni; - uint32_t sqn, ng; - int16_t *s1tmp, *s2tmp; - - n = MKN(logn); - t0 = tmp; - t1 = t0 + n; - b00 = expanded_key + skoff_b00(logn); - b01 = expanded_key + skoff_b01(logn); - b10 = expanded_key + skoff_b10(logn); - b11 = expanded_key + skoff_b11(logn); - tree = expanded_key + skoff_tree(logn); - - /* - * Set the target vector to [hm, 0] (hm is the hashed message). - */ - for (u = 0; u < n; u ++) { - t0[u] = fpr_of(hm[u]); - /* This is implicit. - t1[u] = fpr_zero; - */ - } - - /* - * Apply the lattice basis to obtain the real target - * vector (after normalization with regards to modulus). - */ - Zf(FFT)(t0, logn); - ni = fpr_inverse_of_q; - memcpy(t1, t0, n * sizeof *t0); - Zf(poly_mul_fft)(t1, b01, logn); - Zf(poly_mulconst)(t1, fpr_neg(ni), logn); - Zf(poly_mul_fft)(t0, b11, logn); - Zf(poly_mulconst)(t0, ni, logn); - - tx = t1 + n; - ty = tx + n; - - /* - * Apply sampling. Output is written back in [tx, ty]. - */ - ffSampling_fft(samp, samp_ctx, tx, ty, tree, t0, t1, logn, ty + n); - - /* - * Get the lattice point corresponding to that tiny vector. 
- */ - memcpy(t0, tx, n * sizeof *tx); - memcpy(t1, ty, n * sizeof *ty); - Zf(poly_mul_fft)(tx, b00, logn); - Zf(poly_mul_fft)(ty, b10, logn); - Zf(poly_add)(tx, ty, logn); - memcpy(ty, t0, n * sizeof *t0); - Zf(poly_mul_fft)(ty, b01, logn); - - memcpy(t0, tx, n * sizeof *tx); - Zf(poly_mul_fft)(t1, b11, logn); - Zf(poly_add)(t1, ty, logn); - - Zf(iFFT)(t0, logn); - Zf(iFFT)(t1, logn); - - /* - * Compute the signature. - */ - s1tmp = (int16_t *)tx; - sqn = 0; - ng = 0; - for (u = 0; u < n; u ++) { - int32_t z; - - z = (int32_t)hm[u] - (int32_t)fpr_rint(t0[u]); - sqn += (uint32_t)(z * z); - ng |= sqn; - s1tmp[u] = (int16_t)z; - } - sqn |= -(ng >> 31); - - /* - * With "normal" degrees (e.g. 512 or 1024), it is very - * improbable that the computed vector is not short enough; - * however, it may happen in practice for the very reduced - * versions (e.g. degree 16 or below). In that case, the caller - * will loop, and we must not write anything into s2[] because - * s2[] may overlap with the hashed message hm[] and we need - * hm[] for the next iteration. - */ - s2tmp = (int16_t *)tmp; - for (u = 0; u < n; u ++) { - s2tmp[u] = (int16_t)-fpr_rint(t1[u]); - } - if (Zf(is_short_half)(sqn, s2tmp, logn)) { - memcpy(s2, s2tmp, n * sizeof *s2); - memcpy(tmp, s1tmp, n * sizeof *s1tmp); - return 1; - } - return 0; -} - -/* - * Compute a signature: the signature contains two vectors, s1 and s2. - * The s1 vector is not returned. The squared norm of (s1,s2) is - * computed, and if it is short enough, then s2 is returned into the - * s2[] buffer, and 1 is returned; otherwise, s2[] is untouched and 0 is - * returned; the caller should then try again. - * - * tmp[] must have room for at least nine polynomials. 
- */ -static int -do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2, - const int8_t *restrict f, const int8_t *restrict g, - const int8_t *restrict F, const int8_t *restrict G, - const uint16_t *hm, unsigned logn, fpr *restrict tmp) -{ - size_t n, u; - fpr *t0, *t1, *tx, *ty; - fpr *b00, *b01, *b10, *b11, *g00, *g01, *g11; - fpr ni; - uint32_t sqn, ng; - int16_t *s1tmp, *s2tmp; - - n = MKN(logn); - - /* - * Lattice basis is B = [[g, -f], [G, -F]]. We convert it to FFT. - */ - b00 = tmp; - b01 = b00 + n; - b10 = b01 + n; - b11 = b10 + n; - smallints_to_fpr(b01, f, logn); - smallints_to_fpr(b00, g, logn); - smallints_to_fpr(b11, F, logn); - smallints_to_fpr(b10, G, logn); - Zf(FFT)(b01, logn); - Zf(FFT)(b00, logn); - Zf(FFT)(b11, logn); - Zf(FFT)(b10, logn); - Zf(poly_neg)(b01, logn); - Zf(poly_neg)(b11, logn); - - /* - * Compute the Gram matrix G = B·B*. Formulas are: - * g00 = b00*adj(b00) + b01*adj(b01) - * g01 = b00*adj(b10) + b01*adj(b11) - * g10 = b10*adj(b00) + b11*adj(b01) - * g11 = b10*adj(b10) + b11*adj(b11) - * - * For historical reasons, this implementation uses - * g00, g01 and g11 (upper triangle). g10 is not kept - * since it is equal to adj(g01). - * - * We _replace_ the matrix B with the Gram matrix, but we - * must keep b01 and b11 for computing the target vector. 
- */ - t0 = b11 + n; - t1 = t0 + n; - - memcpy(t0, b01, n * sizeof *b01); - Zf(poly_mulselfadj_fft)(t0, logn); // t0 <- b01*adj(b01) - - memcpy(t1, b00, n * sizeof *b00); - Zf(poly_muladj_fft)(t1, b10, logn); // t1 <- b00*adj(b10) - Zf(poly_mulselfadj_fft)(b00, logn); // b00 <- b00*adj(b00) - Zf(poly_add)(b00, t0, logn); // b00 <- g00 - memcpy(t0, b01, n * sizeof *b01); - Zf(poly_muladj_fft)(b01, b11, logn); // b01 <- b01*adj(b11) - Zf(poly_add)(b01, t1, logn); // b01 <- g01 - - Zf(poly_mulselfadj_fft)(b10, logn); // b10 <- b10*adj(b10) - memcpy(t1, b11, n * sizeof *b11); - Zf(poly_mulselfadj_fft)(t1, logn); // t1 <- b11*adj(b11) - Zf(poly_add)(b10, t1, logn); // b10 <- g11 - - /* - * We rename variables to make things clearer. The three elements - * of the Gram matrix uses the first 3*n slots of tmp[], followed - * by b11 and b01 (in that order). - */ - g00 = b00; - g01 = b01; - g11 = b10; - b01 = t0; - t0 = b01 + n; - t1 = t0 + n; - - /* - * Memory layout at that point: - * g00 g01 g11 b11 b01 t0 t1 - */ - - /* - * Set the target vector to [hm, 0] (hm is the hashed message). - */ - for (u = 0; u < n; u ++) { - t0[u] = fpr_of(hm[u]); - /* This is implicit. - t1[u] = fpr_zero; - */ - } - - /* - * Apply the lattice basis to obtain the real target - * vector (after normalization with regards to modulus). - */ - Zf(FFT)(t0, logn); - ni = fpr_inverse_of_q; - memcpy(t1, t0, n * sizeof *t0); - Zf(poly_mul_fft)(t1, b01, logn); - Zf(poly_mulconst)(t1, fpr_neg(ni), logn); - Zf(poly_mul_fft)(t0, b11, logn); - Zf(poly_mulconst)(t0, ni, logn); - - /* - * b01 and b11 can be discarded, so we move back (t0,t1). - * Memory layout is now: - * g00 g01 g11 t0 t1 - */ - memcpy(b11, t0, n * 2 * sizeof *t0); - t0 = g11 + n; - t1 = t0 + n; - - /* - * Apply sampling; result is written over (t0,t1). 
- */ - ffSampling_fft_dyntree(samp, samp_ctx, - t0, t1, g00, g01, g11, logn, t1 + n); - - /* - * We arrange the layout back to: - * b00 b01 b10 b11 t0 t1 - * - * We did not conserve the matrix basis, so we must recompute - * it now. - */ - b00 = tmp; - b01 = b00 + n; - b10 = b01 + n; - b11 = b10 + n; - memmove(b11 + n, t0, n * 2 * sizeof *t0); - t0 = b11 + n; - t1 = t0 + n; - smallints_to_fpr(b01, f, logn); - smallints_to_fpr(b00, g, logn); - smallints_to_fpr(b11, F, logn); - smallints_to_fpr(b10, G, logn); - Zf(FFT)(b01, logn); - Zf(FFT)(b00, logn); - Zf(FFT)(b11, logn); - Zf(FFT)(b10, logn); - Zf(poly_neg)(b01, logn); - Zf(poly_neg)(b11, logn); - tx = t1 + n; - ty = tx + n; - - /* - * Get the lattice point corresponding to that tiny vector. - */ - memcpy(tx, t0, n * sizeof *t0); - memcpy(ty, t1, n * sizeof *t1); - Zf(poly_mul_fft)(tx, b00, logn); - Zf(poly_mul_fft)(ty, b10, logn); - Zf(poly_add)(tx, ty, logn); - memcpy(ty, t0, n * sizeof *t0); - Zf(poly_mul_fft)(ty, b01, logn); - - memcpy(t0, tx, n * sizeof *tx); - Zf(poly_mul_fft)(t1, b11, logn); - Zf(poly_add)(t1, ty, logn); - Zf(iFFT)(t0, logn); - Zf(iFFT)(t1, logn); - - s1tmp = (int16_t *)tx; - sqn = 0; - ng = 0; - for (u = 0; u < n; u ++) { - int32_t z; - - z = (int32_t)hm[u] - (int32_t)fpr_rint(t0[u]); - sqn += (uint32_t)(z * z); - ng |= sqn; - s1tmp[u] = (int16_t)z; - } - sqn |= -(ng >> 31); - - /* - * With "normal" degrees (e.g. 512 or 1024), it is very - * improbable that the computed vector is not short enough; - * however, it may happen in practice for the very reduced - * versions (e.g. degree 16 or below). In that case, the caller - * will loop, and we must not write anything into s2[] because - * s2[] may overlap with the hashed message hm[] and we need - * hm[] for the next iteration. 
- */ - s2tmp = (int16_t *)tmp; - for (u = 0; u < n; u ++) { - s2tmp[u] = (int16_t)-fpr_rint(t1[u]); - } - if (Zf(is_short_half)(sqn, s2tmp, logn)) { - memcpy(s2, s2tmp, n * sizeof *s2); - memcpy(tmp, s1tmp, n * sizeof *s1tmp); - return 1; - } - return 0; -} - -/* - * Sample an integer value along a half-gaussian distribution centered - * on zero and standard deviation 1.8205, with a precision of 72 bits. - */ -TARGET_AVX2 -int -Zf(gaussian0_sampler)(prng *p) -{ -#if FALCON_AVX2 // yyyAVX2+1 - - /* - * High words. - */ - static const union { - uint16_t u16[16]; - __m256i ymm[1]; - } rhi15 = { - { - 0x51FB, 0x2A69, 0x113E, 0x0568, - 0x014A, 0x003B, 0x0008, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000 - } - }; - - static const union { - uint64_t u64[20]; - __m256i ymm[5]; - } rlo57 = { - { - 0x1F42ED3AC391802, 0x12B181F3F7DDB82, - 0x1CDD0934829C1FF, 0x1754377C7994AE4, - 0x1846CAEF33F1F6F, 0x14AC754ED74BD5F, - 0x024DD542B776AE4, 0x1A1FFDC65AD63DA, - 0x01F80D88A7B6428, 0x001C3FDB2040C69, - 0x00012CF24D031FB, 0x00000949F8B091F, - 0x0000003665DA998, 0x00000000EBF6EBB, - 0x0000000002F5D7E, 0x000000000007098, - 0x0000000000000C6, 0x000000000000001, - 0x000000000000000, 0x000000000000000 - } - }; - - uint64_t lo; - unsigned hi; - __m256i xhi, rhi, gthi, eqhi, eqm; - __m256i xlo, gtlo0, gtlo1, gtlo2, gtlo3, gtlo4; - __m128i t, zt; - int r; - - /* - * Get a 72-bit random value and split it into a low part - * (57 bits) and a high part (15 bits) - */ - lo = prng_get_u64(p); - hi = prng_get_u8(p); - hi = (hi << 7) | (unsigned)(lo >> 57); - lo &= 0x1FFFFFFFFFFFFFF; - - /* - * Broadcast the high part and compare it with the relevant - * values. We need both a "greater than" and an "equal" - * comparisons. 
- */ - xhi = _mm256_broadcastw_epi16(_mm_cvtsi32_si128(hi)); - rhi = _mm256_loadu_si256(&rhi15.ymm[0]); - gthi = _mm256_cmpgt_epi16(rhi, xhi); - eqhi = _mm256_cmpeq_epi16(rhi, xhi); - - /* - * The result is the number of 72-bit values (among the list of 19) - * which are greater than the 72-bit random value. We first count - * all non-zero 16-bit elements in the first eight of gthi. Such - * elements have value -1 or 0, so we first negate them. - */ - t = _mm_srli_epi16(_mm256_castsi256_si128(gthi), 15); - zt = _mm_setzero_si128(); - t = _mm_hadd_epi16(t, zt); - t = _mm_hadd_epi16(t, zt); - t = _mm_hadd_epi16(t, zt); - r = _mm_cvtsi128_si32(t); - - /* - * We must look at the low bits for all values for which the - * high bits are an "equal" match; values 8-18 all have the - * same high bits (0). - * On 32-bit systems, 'lo' really is two registers, requiring - * some extra code. - */ -#if defined(__x86_64__) || defined(_M_X64) - xlo = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(*(int64_t *)&lo)); -#else - { - uint32_t e0, e1; - int32_t f0, f1; - - e0 = (uint32_t)lo; - e1 = (uint32_t)(lo >> 32); - f0 = *(int32_t *)&e0; - f1 = *(int32_t *)&e1; - xlo = _mm256_set_epi32(f1, f0, f1, f0, f1, f0, f1, f0); - } -#endif - gtlo0 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[0]), xlo); - gtlo1 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[1]), xlo); - gtlo2 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[2]), xlo); - gtlo3 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[3]), xlo); - gtlo4 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[4]), xlo); - - /* - * Keep only comparison results that correspond to the non-zero - * elements in eqhi. 
- */ - gtlo0 = _mm256_and_si256(gtlo0, _mm256_cvtepi16_epi64( - _mm256_castsi256_si128(eqhi))); - gtlo1 = _mm256_and_si256(gtlo1, _mm256_cvtepi16_epi64( - _mm256_castsi256_si128(_mm256_bsrli_epi128(eqhi, 8)))); - eqm = _mm256_permute4x64_epi64(eqhi, 0xFF); - gtlo2 = _mm256_and_si256(gtlo2, eqm); - gtlo3 = _mm256_and_si256(gtlo3, eqm); - gtlo4 = _mm256_and_si256(gtlo4, eqm); - - /* - * Add all values to count the total number of "-1" elements. - * Since the first eight "high" words are all different, only - * one element (at most) in gtlo0:gtlo1 can be non-zero; however, - * if the high word of the random value is zero, then many - * elements of gtlo2:gtlo3:gtlo4 can be non-zero. - */ - gtlo0 = _mm256_or_si256(gtlo0, gtlo1); - gtlo0 = _mm256_add_epi64( - _mm256_add_epi64(gtlo0, gtlo2), - _mm256_add_epi64(gtlo3, gtlo4)); - t = _mm_add_epi64( - _mm256_castsi256_si128(gtlo0), - _mm256_extracti128_si256(gtlo0, 1)); - t = _mm_add_epi64(t, _mm_srli_si128(t, 8)); - r -= _mm_cvtsi128_si32(t); - - return r; - -#else // yyyAVX2+0 - - static const uint32_t dist[] = { - 10745844u, 3068844u, 3741698u, - 5559083u, 1580863u, 8248194u, - 2260429u, 13669192u, 2736639u, - 708981u, 4421575u, 10046180u, - 169348u, 7122675u, 4136815u, - 30538u, 13063405u, 7650655u, - 4132u, 14505003u, 7826148u, - 417u, 16768101u, 11363290u, - 31u, 8444042u, 8086568u, - 1u, 12844466u, 265321u, - 0u, 1232676u, 13644283u, - 0u, 38047u, 9111839u, - 0u, 870u, 6138264u, - 0u, 14u, 12545723u, - 0u, 0u, 3104126u, - 0u, 0u, 28824u, - 0u, 0u, 198u, - 0u, 0u, 1u - }; - - uint32_t v0, v1, v2, hi; - uint64_t lo; - size_t u; - int z; - - /* - * Get a random 72-bit value, into three 24-bit limbs v0..v2. - */ - lo = prng_get_u64(p); - hi = prng_get_u8(p); - v0 = (uint32_t)lo & 0xFFFFFF; - v1 = (uint32_t)(lo >> 24) & 0xFFFFFF; - v2 = (uint32_t)(lo >> 48) | (hi << 16); - - /* - * Sampled value is z, such that v0..v2 is lower than the first - * z elements of the table. 
- */ - z = 0; - for (u = 0; u < (sizeof dist) / sizeof(dist[0]); u += 3) { - uint32_t w0, w1, w2, cc; - - w0 = dist[u + 2]; - w1 = dist[u + 1]; - w2 = dist[u + 0]; - cc = (v0 - w0) >> 31; - cc = (v1 - w1 - cc) >> 31; - cc = (v2 - w2 - cc) >> 31; - z += (int)cc; - } - return z; - -#endif // yyyAVX2- -} - -/* - * Sample a bit with probability exp(-x) for some x >= 0. - */ -TARGET_AVX2 -static int -BerExp(prng *p, fpr x, fpr ccs) -{ - int s, i; - fpr r; - uint32_t sw, w; - uint64_t z; - - /* - * Reduce x modulo log(2): x = s*log(2) + r, with s an integer, - * and 0 <= r < log(2). Since x >= 0, we can use fpr_trunc(). - */ - s = (int)fpr_trunc(fpr_mul(x, fpr_inv_log2)); - r = fpr_sub(x, fpr_mul(fpr_of(s), fpr_log2)); - - /* - * It may happen (quite rarely) that s >= 64; if sigma = 1.2 - * (the minimum value for sigma), r = 0 and b = 1, then we get - * s >= 64 if the half-Gaussian produced a z >= 13, which happens - * with probability about 0.000000000230383991, which is - * approximatively equal to 2^(-32). In any case, if s >= 64, - * then BerExp will be non-zero with probability less than - * 2^(-64), so we can simply saturate s at 63. - */ - sw = (uint32_t)s; - sw ^= (sw ^ 63) & -((63 - sw) >> 31); - s = (int)sw; - - /* - * Compute exp(-r); we know that 0 <= r < log(2) at this point, so - * we can use fpr_expm_p63(), which yields a result scaled to 2^63. - * We scale it up to 2^64, then right-shift it by s bits because - * we really want exp(-x) = 2^(-s)*exp(-r). - * - * The "-1" operation makes sure that the value fits on 64 bits - * (i.e. if r = 0, we may get 2^64, and we prefer 2^64-1 in that - * case). The bias is negligible since fpr_expm_p63() only computes - * with 51 bits of precision or so. - */ - z = ((fpr_expm_p63(r, ccs) << 1) - 1) >> s; - - /* - * Sample a bit with probability exp(-x). 
Since x = s*log(2) + r, - * exp(-x) = 2^-s * exp(-r), we compare lazily exp(-x) with the - * PRNG output to limit its consumption, the sign of the difference - * yields the expected result. - */ - i = 64; - do { - i -= 8; - w = prng_get_u8(p) - ((uint32_t)(z >> i) & 0xFF); - } while (!w && i > 0); - return (int)(w >> 31); -} - -/* - * The sampler produces a random integer that follows a discrete Gaussian - * distribution, centered on mu, and with standard deviation sigma. The - * provided parameter isigma is equal to 1/sigma. - * - * The value of sigma MUST lie between 1 and 2 (i.e. isigma lies between - * 0.5 and 1); in Falcon, sigma should always be between 1.2 and 1.9. - */ -TARGET_AVX2 -int -Zf(sampler)(void *ctx, fpr mu, fpr isigma) -{ - sampler_context *spc; - int s; - fpr r, dss, ccs; - - spc = ctx; - - /* - * Center is mu. We compute mu = s + r where s is an integer - * and 0 <= r < 1. - */ - s = (int)fpr_floor(mu); - r = fpr_sub(mu, fpr_of(s)); - - /* - * dss = 1/(2*sigma^2) = 0.5*(isigma^2). - */ - dss = fpr_half(fpr_sqr(isigma)); - - /* - * ccs = sigma_min / sigma = sigma_min * isigma. - */ - ccs = fpr_mul(isigma, spc->sigma_min); - - /* - * We now need to sample on center r. - */ - for (;;) { - int z0, z, b; - fpr x; - - /* - * Sample z for a Gaussian distribution. Then get a - * random bit b to turn the sampling into a bimodal - * distribution: if b = 1, we use z+1, otherwise we - * use -z. We thus have two situations: - * - * - b = 1: z >= 1 and sampled against a Gaussian - * centered on 1. - * - b = 0: z <= 0 and sampled against a Gaussian - * centered on 0. - */ - z0 = Zf(gaussian0_sampler)(&spc->p); - b = prng_get_u8(&spc->p) & 1; - z = b + ((b << 1) - 1) * z0; - - /* - * Rejection sampling. We want a Gaussian centered on r; - * but we sampled against a Gaussian centered on b (0 or - * 1). But we know that z is always in the range where - * our sampling distribution is greater than the Gaussian - * distribution, so rejection works. 
- * - * We got z with distribution: - * G(z) = exp(-((z-b)^2)/(2*sigma0^2)) - * We target distribution: - * S(z) = exp(-((z-r)^2)/(2*sigma^2)) - * Rejection sampling works by keeping the value z with - * probability S(z)/G(z), and starting again otherwise. - * This requires S(z) <= G(z), which is the case here. - * Thus, we simply need to keep our z with probability: - * P = exp(-x) - * where: - * x = ((z-r)^2)/(2*sigma^2) - ((z-b)^2)/(2*sigma0^2) - * - * Here, we scale up the Bernouilli distribution, which - * makes rejection more probable, but makes rejection - * rate sufficiently decorrelated from the Gaussian - * center and standard deviation that the whole sampler - * can be said to be constant-time. - */ - x = fpr_mul(fpr_sqr(fpr_sub(fpr_of(z), r)), dss); - x = fpr_sub(x, fpr_mul(fpr_of(z0 * z0), fpr_inv_2sqrsigma0)); - if (BerExp(&spc->p, x, ccs)) { - /* - * Rejection sampling was centered on r, but the - * actual center is mu = s + r. - */ - return s + z; - } - } -} - -/* see inner.h */ -void -Zf(sign_tree)(int16_t *sig, inner_shake256_context *rng, - const fpr *restrict expanded_key, - const uint16_t *hm, unsigned logn, uint8_t *tmp) -{ - fpr *ftmp; - - ftmp = (fpr *)tmp; - for (;;) { - /* - * Signature produces short vectors s1 and s2. The - * signature is acceptable only if the aggregate vector - * s1,s2 is short; we must use the same bound as the - * verifier. - * - * If the signature is acceptable, then we return only s2 - * (the verifier recomputes s1 from s2, the hashed message, - * and the public key). - */ - sampler_context spc; - samplerZ samp; - void *samp_ctx; - - /* - * Normal sampling. We use a fast PRNG seeded from our - * SHAKE context ('rng'). - */ - spc.sigma_min = (logn == 10) - ? fpr_sigma_min_10 - : fpr_sigma_min_9; - Zf(prng_init)(&spc.p, rng); - samp = Zf(sampler); - samp_ctx = &spc; - - /* - * Do the actual signature. 
- */ - if (do_sign_tree(samp, samp_ctx, sig, - expanded_key, hm, logn, ftmp)) - { - break; - } - } -} - -/* see inner.h */ -void -Zf(sign_dyn)(int16_t *sig, inner_shake256_context *rng, - const int8_t *restrict f, const int8_t *restrict g, - const int8_t *restrict F, const int8_t *restrict G, - const uint16_t *hm, unsigned logn, uint8_t *tmp) -{ - fpr *ftmp; - - ftmp = (fpr *)tmp; - for (;;) { - /* - * Signature produces short vectors s1 and s2. The - * signature is acceptable only if the aggregate vector - * s1,s2 is short; we must use the same bound as the - * verifier. - * - * If the signature is acceptable, then we return only s2 - * (the verifier recomputes s1 from s2, the hashed message, - * and the public key). - */ - sampler_context spc; - samplerZ samp; - void *samp_ctx; - - /* - * Normal sampling. We use a fast PRNG seeded from our - * SHAKE context ('rng'). - */ - spc.sigma_min = (logn == 10) - ? fpr_sigma_min_10 - : fpr_sigma_min_9; - Zf(prng_init)(&spc.p, rng); - samp = Zf(sampler); - samp_ctx = &spc; - - /* - * Do the actual signature. - */ - if (do_sign_dyn(samp, samp_ctx, sig, - f, g, F, G, hm, logn, ftmp)) - { - break; - } - } -} diff --git a/crypto_sign/falcon-1024/m4-ct/vrfy.c b/crypto_sign/falcon-1024/m4-ct/vrfy.c deleted file mode 100644 index c74a3dd3..00000000 --- a/crypto_sign/falcon-1024/m4-ct/vrfy.c +++ /dev/null @@ -1,871 +0,0 @@ -/* - * Falcon signature verification. 
- * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include "inner.h" - -/* ===================================================================== */ -/* - * Constants for NTT. - * - * n = 2^logn (2 <= n <= 1024) - * phi = X^n + 1 - * q = 12289 - * q0i = -1/q mod 2^16 - * R = 2^16 mod q - * R2 = 2^32 mod q - */ - -#define Q 12289 -#define Q0I 12287 -#define R 4091 -#define R2 10952 - -/* - * Table for NTT, binary case: - * GMb[x] = R*(g^rev(x)) mod q - * where g = 7 (it is a 2048-th primitive root of 1 modulo q) - * and rev() is the bit-reversal function over 10 bits. 
- */ -static const uint16_t GMb[] = { - 4091, 7888, 11060, 11208, 6960, 4342, 6275, 9759, - 1591, 6399, 9477, 5266, 586, 5825, 7538, 9710, - 1134, 6407, 1711, 965, 7099, 7674, 3743, 6442, - 10414, 8100, 1885, 1688, 1364, 10329, 10164, 9180, - 12210, 6240, 997, 117, 4783, 4407, 1549, 7072, - 2829, 6458, 4431, 8877, 7144, 2564, 5664, 4042, - 12189, 432, 10751, 1237, 7610, 1534, 3983, 7863, - 2181, 6308, 8720, 6570, 4843, 1690, 14, 3872, - 5569, 9368, 12163, 2019, 7543, 2315, 4673, 7340, - 1553, 1156, 8401, 11389, 1020, 2967, 10772, 7045, - 3316, 11236, 5285, 11578, 10637, 10086, 9493, 6180, - 9277, 6130, 3323, 883, 10469, 489, 1502, 2851, - 11061, 9729, 2742, 12241, 4970, 10481, 10078, 1195, - 730, 1762, 3854, 2030, 5892, 10922, 9020, 5274, - 9179, 3604, 3782, 10206, 3180, 3467, 4668, 2446, - 7613, 9386, 834, 7703, 6836, 3403, 5351, 12276, - 3580, 1739, 10820, 9787, 10209, 4070, 12250, 8525, - 10401, 2749, 7338, 10574, 6040, 943, 9330, 1477, - 6865, 9668, 3585, 6633, 12145, 4063, 3684, 7680, - 8188, 6902, 3533, 9807, 6090, 727, 10099, 7003, - 6945, 1949, 9731, 10559, 6057, 378, 7871, 8763, - 8901, 9229, 8846, 4551, 9589, 11664, 7630, 8821, - 5680, 4956, 6251, 8388, 10156, 8723, 2341, 3159, - 1467, 5460, 8553, 7783, 2649, 2320, 9036, 6188, - 737, 3698, 4699, 5753, 9046, 3687, 16, 914, - 5186, 10531, 4552, 1964, 3509, 8436, 7516, 5381, - 10733, 3281, 7037, 1060, 2895, 7156, 8887, 5357, - 6409, 8197, 2962, 6375, 5064, 6634, 5625, 278, - 932, 10229, 8927, 7642, 351, 9298, 237, 5858, - 7692, 3146, 12126, 7586, 2053, 11285, 3802, 5204, - 4602, 1748, 11300, 340, 3711, 4614, 300, 10993, - 5070, 10049, 11616, 12247, 7421, 10707, 5746, 5654, - 3835, 5553, 1224, 8476, 9237, 3845, 250, 11209, - 4225, 6326, 9680, 12254, 4136, 2778, 692, 8808, - 6410, 6718, 10105, 10418, 3759, 7356, 11361, 8433, - 6437, 3652, 6342, 8978, 5391, 2272, 6476, 7416, - 8418, 10824, 11986, 5733, 876, 7030, 2167, 2436, - 3442, 9217, 8206, 4858, 5964, 2746, 7178, 1434, - 7389, 8879, 10661, 11457, 4220, 
1432, 10832, 4328, - 8557, 1867, 9454, 2416, 3816, 9076, 686, 5393, - 2523, 4339, 6115, 619, 937, 2834, 7775, 3279, - 2363, 7488, 6112, 5056, 824, 10204, 11690, 1113, - 2727, 9848, 896, 2028, 5075, 2654, 10464, 7884, - 12169, 5434, 3070, 6400, 9132, 11672, 12153, 4520, - 1273, 9739, 11468, 9937, 10039, 9720, 2262, 9399, - 11192, 315, 4511, 1158, 6061, 6751, 11865, 357, - 7367, 4550, 983, 8534, 8352, 10126, 7530, 9253, - 4367, 5221, 3999, 8777, 3161, 6990, 4130, 11652, - 3374, 11477, 1753, 292, 8681, 2806, 10378, 12188, - 5800, 11811, 3181, 1988, 1024, 9340, 2477, 10928, - 4582, 6750, 3619, 5503, 5233, 2463, 8470, 7650, - 7964, 6395, 1071, 1272, 3474, 11045, 3291, 11344, - 8502, 9478, 9837, 1253, 1857, 6233, 4720, 11561, - 6034, 9817, 3339, 1797, 2879, 6242, 5200, 2114, - 7962, 9353, 11363, 5475, 6084, 9601, 4108, 7323, - 10438, 9471, 1271, 408, 6911, 3079, 360, 8276, - 11535, 9156, 9049, 11539, 850, 8617, 784, 7919, - 8334, 12170, 1846, 10213, 12184, 7827, 11903, 5600, - 9779, 1012, 721, 2784, 6676, 6552, 5348, 4424, - 6816, 8405, 9959, 5150, 2356, 5552, 5267, 1333, - 8801, 9661, 7308, 5788, 4910, 909, 11613, 4395, - 8238, 6686, 4302, 3044, 2285, 12249, 1963, 9216, - 4296, 11918, 695, 4371, 9793, 4884, 2411, 10230, - 2650, 841, 3890, 10231, 7248, 8505, 11196, 6688, - 4059, 6060, 3686, 4722, 11853, 5816, 7058, 6868, - 11137, 7926, 4894, 12284, 4102, 3908, 3610, 6525, - 7938, 7982, 11977, 6755, 537, 4562, 1623, 8227, - 11453, 7544, 906, 11816, 9548, 10858, 9703, 2815, - 11736, 6813, 6979, 819, 8903, 6271, 10843, 348, - 7514, 8339, 6439, 694, 852, 5659, 2781, 3716, - 11589, 3024, 1523, 8659, 4114, 10738, 3303, 5885, - 2978, 7289, 11884, 9123, 9323, 11830, 98, 2526, - 2116, 4131, 11407, 1844, 3645, 3916, 8133, 2224, - 10871, 8092, 9651, 5989, 7140, 8480, 1670, 159, - 10923, 4918, 128, 7312, 725, 9157, 5006, 6393, - 3494, 6043, 10972, 6181, 11838, 3423, 10514, 7668, - 3693, 6658, 6905, 11953, 10212, 11922, 9101, 8365, - 5110, 45, 2400, 1921, 4377, 2720, 1695, 51, - 
2808, 650, 1896, 9997, 9971, 11980, 8098, 4833, - 4135, 4257, 5838, 4765, 10985, 11532, 590, 12198, - 482, 12173, 2006, 7064, 10018, 3912, 12016, 10519, - 11362, 6954, 2210, 284, 5413, 6601, 3865, 10339, - 11188, 6231, 517, 9564, 11281, 3863, 1210, 4604, - 8160, 11447, 153, 7204, 5763, 5089, 9248, 12154, - 11748, 1354, 6672, 179, 5532, 2646, 5941, 12185, - 862, 3158, 477, 7279, 5678, 7914, 4254, 302, - 2893, 10114, 6890, 9560, 9647, 11905, 4098, 9824, - 10269, 1353, 10715, 5325, 6254, 3951, 1807, 6449, - 5159, 1308, 8315, 3404, 1877, 1231, 112, 6398, - 11724, 12272, 7286, 1459, 12274, 9896, 3456, 800, - 1397, 10678, 103, 7420, 7976, 936, 764, 632, - 7996, 8223, 8445, 7758, 10870, 9571, 2508, 1946, - 6524, 10158, 1044, 4338, 2457, 3641, 1659, 4139, - 4688, 9733, 11148, 3946, 2082, 5261, 2036, 11850, - 7636, 12236, 5366, 2380, 1399, 7720, 2100, 3217, - 10912, 8898, 7578, 11995, 2791, 1215, 3355, 2711, - 2267, 2004, 8568, 10176, 3214, 2337, 1750, 4729, - 4997, 7415, 6315, 12044, 4374, 7157, 4844, 211, - 8003, 10159, 9290, 11481, 1735, 2336, 5793, 9875, - 8192, 986, 7527, 1401, 870, 3615, 8465, 2756, - 9770, 2034, 10168, 3264, 6132, 54, 2880, 4763, - 11805, 3074, 8286, 9428, 4881, 6933, 1090, 10038, - 2567, 708, 893, 6465, 4962, 10024, 2090, 5718, - 10743, 780, 4733, 4623, 2134, 2087, 4802, 884, - 5372, 5795, 5938, 4333, 6559, 7549, 5269, 10664, - 4252, 3260, 5917, 10814, 5768, 9983, 8096, 7791, - 6800, 7491, 6272, 1907, 10947, 6289, 11803, 6032, - 11449, 1171, 9201, 7933, 2479, 7970, 11337, 7062, - 8911, 6728, 6542, 8114, 8828, 6595, 3545, 4348, - 4610, 2205, 6999, 8106, 5560, 10390, 9321, 2499, - 2413, 7272, 6881, 10582, 9308, 9437, 3554, 3326, - 5991, 11969, 3415, 12283, 9838, 12063, 4332, 7830, - 11329, 6605, 12271, 2044, 11611, 7353, 11201, 11582, - 3733, 8943, 9978, 1627, 7168, 3935, 5050, 2762, - 7496, 10383, 755, 1654, 12053, 4952, 10134, 4394, - 6592, 7898, 7497, 8904, 12029, 3581, 10748, 5674, - 10358, 4901, 7414, 8771, 710, 6764, 8462, 7193, - 5371, 7274, 
11084, 290, 7864, 6827, 11822, 2509, - 6578, 4026, 5807, 1458, 5721, 5762, 4178, 2105, - 11621, 4852, 8897, 2856, 11510, 9264, 2520, 8776, - 7011, 2647, 1898, 7039, 5950, 11163, 5488, 6277, - 9182, 11456, 633, 10046, 11554, 5633, 9587, 2333, - 7008, 7084, 5047, 7199, 9865, 8997, 569, 6390, - 10845, 9679, 8268, 11472, 4203, 1997, 2, 9331, - 162, 6182, 2000, 3649, 9792, 6363, 7557, 6187, - 8510, 9935, 5536, 9019, 3706, 12009, 1452, 3067, - 5494, 9692, 4865, 6019, 7106, 9610, 4588, 10165, - 6261, 5887, 2652, 10172, 1580, 10379, 4638, 9949 -}; - -/* - * Table for inverse NTT, binary case: - * iGMb[x] = R*((1/g)^rev(x)) mod q - * Since g = 7, 1/g = 8778 mod 12289. - */ -static const uint16_t iGMb[] = { - 4091, 4401, 1081, 1229, 2530, 6014, 7947, 5329, - 2579, 4751, 6464, 11703, 7023, 2812, 5890, 10698, - 3109, 2125, 1960, 10925, 10601, 10404, 4189, 1875, - 5847, 8546, 4615, 5190, 11324, 10578, 5882, 11155, - 8417, 12275, 10599, 7446, 5719, 3569, 5981, 10108, - 4426, 8306, 10755, 4679, 11052, 1538, 11857, 100, - 8247, 6625, 9725, 5145, 3412, 7858, 5831, 9460, - 5217, 10740, 7882, 7506, 12172, 11292, 6049, 79, - 13, 6938, 8886, 5453, 4586, 11455, 2903, 4676, - 9843, 7621, 8822, 9109, 2083, 8507, 8685, 3110, - 7015, 3269, 1367, 6397, 10259, 8435, 10527, 11559, - 11094, 2211, 1808, 7319, 48, 9547, 2560, 1228, - 9438, 10787, 11800, 1820, 11406, 8966, 6159, 3012, - 6109, 2796, 2203, 1652, 711, 7004, 1053, 8973, - 5244, 1517, 9322, 11269, 900, 3888, 11133, 10736, - 4949, 7616, 9974, 4746, 10270, 126, 2921, 6720, - 6635, 6543, 1582, 4868, 42, 673, 2240, 7219, - 1296, 11989, 7675, 8578, 11949, 989, 10541, 7687, - 7085, 8487, 1004, 10236, 4703, 163, 9143, 4597, - 6431, 12052, 2991, 11938, 4647, 3362, 2060, 11357, - 12011, 6664, 5655, 7225, 5914, 9327, 4092, 5880, - 6932, 3402, 5133, 9394, 11229, 5252, 9008, 1556, - 6908, 4773, 3853, 8780, 10325, 7737, 1758, 7103, - 11375, 12273, 8602, 3243, 6536, 7590, 8591, 11552, - 6101, 3253, 9969, 9640, 4506, 3736, 6829, 10822, - 9130, 9948, 
3566, 2133, 3901, 6038, 7333, 6609, - 3468, 4659, 625, 2700, 7738, 3443, 3060, 3388, - 3526, 4418, 11911, 6232, 1730, 2558, 10340, 5344, - 5286, 2190, 11562, 6199, 2482, 8756, 5387, 4101, - 4609, 8605, 8226, 144, 5656, 8704, 2621, 5424, - 10812, 2959, 11346, 6249, 1715, 4951, 9540, 1888, - 3764, 39, 8219, 2080, 2502, 1469, 10550, 8709, - 5601, 1093, 3784, 5041, 2058, 8399, 11448, 9639, - 2059, 9878, 7405, 2496, 7918, 11594, 371, 7993, - 3073, 10326, 40, 10004, 9245, 7987, 5603, 4051, - 7894, 676, 11380, 7379, 6501, 4981, 2628, 3488, - 10956, 7022, 6737, 9933, 7139, 2330, 3884, 5473, - 7865, 6941, 5737, 5613, 9505, 11568, 11277, 2510, - 6689, 386, 4462, 105, 2076, 10443, 119, 3955, - 4370, 11505, 3672, 11439, 750, 3240, 3133, 754, - 4013, 11929, 9210, 5378, 11881, 11018, 2818, 1851, - 4966, 8181, 2688, 6205, 6814, 926, 2936, 4327, - 10175, 7089, 6047, 9410, 10492, 8950, 2472, 6255, - 728, 7569, 6056, 10432, 11036, 2452, 2811, 3787, - 945, 8998, 1244, 8815, 11017, 11218, 5894, 4325, - 4639, 3819, 9826, 7056, 6786, 8670, 5539, 7707, - 1361, 9812, 2949, 11265, 10301, 9108, 478, 6489, - 101, 1911, 9483, 3608, 11997, 10536, 812, 8915, - 637, 8159, 5299, 9128, 3512, 8290, 7068, 7922, - 3036, 4759, 2163, 3937, 3755, 11306, 7739, 4922, - 11932, 424, 5538, 6228, 11131, 7778, 11974, 1097, - 2890, 10027, 2569, 2250, 2352, 821, 2550, 11016, - 7769, 136, 617, 3157, 5889, 9219, 6855, 120, - 4405, 1825, 9635, 7214, 10261, 11393, 2441, 9562, - 11176, 599, 2085, 11465, 7233, 6177, 4801, 9926, - 9010, 4514, 9455, 11352, 11670, 6174, 7950, 9766, - 6896, 11603, 3213, 8473, 9873, 2835, 10422, 3732, - 7961, 1457, 10857, 8069, 832, 1628, 3410, 4900, - 10855, 5111, 9543, 6325, 7431, 4083, 3072, 8847, - 9853, 10122, 5259, 11413, 6556, 303, 1465, 3871, - 4873, 5813, 10017, 6898, 3311, 5947, 8637, 5852, - 3856, 928, 4933, 8530, 1871, 2184, 5571, 5879, - 3481, 11597, 9511, 8153, 35, 2609, 5963, 8064, - 1080, 12039, 8444, 3052, 3813, 11065, 6736, 8454, - 2340, 7651, 1910, 10709, 2117, 9637, 
6402, 6028, - 2124, 7701, 2679, 5183, 6270, 7424, 2597, 6795, - 9222, 10837, 280, 8583, 3270, 6753, 2354, 3779, - 6102, 4732, 5926, 2497, 8640, 10289, 6107, 12127, - 2958, 12287, 10292, 8086, 817, 4021, 2610, 1444, - 5899, 11720, 3292, 2424, 5090, 7242, 5205, 5281, - 9956, 2702, 6656, 735, 2243, 11656, 833, 3107, - 6012, 6801, 1126, 6339, 5250, 10391, 9642, 5278, - 3513, 9769, 3025, 779, 9433, 3392, 7437, 668, - 10184, 8111, 6527, 6568, 10831, 6482, 8263, 5711, - 9780, 467, 5462, 4425, 11999, 1205, 5015, 6918, - 5096, 3827, 5525, 11579, 3518, 4875, 7388, 1931, - 6615, 1541, 8708, 260, 3385, 4792, 4391, 5697, - 7895, 2155, 7337, 236, 10635, 11534, 1906, 4793, - 9527, 7239, 8354, 5121, 10662, 2311, 3346, 8556, - 707, 1088, 4936, 678, 10245, 18, 5684, 960, - 4459, 7957, 226, 2451, 6, 8874, 320, 6298, - 8963, 8735, 2852, 2981, 1707, 5408, 5017, 9876, - 9790, 2968, 1899, 6729, 4183, 5290, 10084, 7679, - 7941, 8744, 5694, 3461, 4175, 5747, 5561, 3378, - 5227, 952, 4319, 9810, 4356, 3088, 11118, 840, - 6257, 486, 6000, 1342, 10382, 6017, 4798, 5489, - 4498, 4193, 2306, 6521, 1475, 6372, 9029, 8037, - 1625, 7020, 4740, 5730, 7956, 6351, 6494, 6917, - 11405, 7487, 10202, 10155, 7666, 7556, 11509, 1546, - 6571, 10199, 2265, 7327, 5824, 11396, 11581, 9722, - 2251, 11199, 5356, 7408, 2861, 4003, 9215, 484, - 7526, 9409, 12235, 6157, 9025, 2121, 10255, 2519, - 9533, 3824, 8674, 11419, 10888, 4762, 11303, 4097, - 2414, 6496, 9953, 10554, 808, 2999, 2130, 4286, - 12078, 7445, 5132, 7915, 245, 5974, 4874, 7292, - 7560, 10539, 9952, 9075, 2113, 3721, 10285, 10022, - 9578, 8934, 11074, 9498, 294, 4711, 3391, 1377, - 9072, 10189, 4569, 10890, 9909, 6923, 53, 4653, - 439, 10253, 7028, 10207, 8343, 1141, 2556, 7601, - 8150, 10630, 8648, 9832, 7951, 11245, 2131, 5765, - 10343, 9781, 2718, 1419, 4531, 3844, 4066, 4293, - 11657, 11525, 11353, 4313, 4869, 12186, 1611, 10892, - 11489, 8833, 2393, 15, 10830, 5003, 17, 565, - 5891, 12177, 11058, 10412, 8885, 3974, 10981, 7130, - 5840, 10482, 
8338, 6035, 6964, 1574, 10936, 2020, - 2465, 8191, 384, 2642, 2729, 5399, 2175, 9396, - 11987, 8035, 4375, 6611, 5010, 11812, 9131, 11427, - 104, 6348, 9643, 6757, 12110, 5617, 10935, 541, - 135, 3041, 7200, 6526, 5085, 12136, 842, 4129, - 7685, 11079, 8426, 1008, 2725, 11772, 6058, 1101, - 1950, 8424, 5688, 6876, 12005, 10079, 5335, 927, - 1770, 273, 8377, 2271, 5225, 10283, 116, 11807, - 91, 11699, 757, 1304, 7524, 6451, 8032, 8154, - 7456, 4191, 309, 2318, 2292, 10393, 11639, 9481, - 12238, 10594, 9569, 7912, 10368, 9889, 12244, 7179, - 3924, 3188, 367, 2077, 336, 5384, 5631, 8596, - 4621, 1775, 8866, 451, 6108, 1317, 6246, 8795, - 5896, 7283, 3132, 11564, 4977, 12161, 7371, 1366, - 12130, 10619, 3809, 5149, 6300, 2638, 4197, 1418, - 10065, 4156, 8373, 8644, 10445, 882, 8158, 10173, - 9763, 12191, 459, 2966, 3166, 405, 5000, 9311, - 6404, 8986, 1551, 8175, 3630, 10766, 9265, 700, - 8573, 9508, 6630, 11437, 11595, 5850, 3950, 4775, - 11941, 1446, 6018, 3386, 11470, 5310, 5476, 553, - 9474, 2586, 1431, 2741, 473, 11383, 4745, 836, - 4062, 10666, 7727, 11752, 5534, 312, 4307, 4351, - 5764, 8679, 8381, 8187, 5, 7395, 4363, 1152, - 5421, 5231, 6473, 436, 7567, 8603, 6229, 8230 -}; - -/* - * Reduce a small signed integer modulo q. The source integer MUST - * be between -q/2 and +q/2. - */ -static inline uint32_t -mq_conv_small(int x) -{ - /* - * If x < 0, the cast to uint32_t will set the high bit to 1. - */ - uint32_t y; - - y = (uint32_t)x; - y += Q & -(y >> 31); - return y; -} - -/* - * Addition modulo q. Operands must be in the 0..q-1 range. - */ -static inline uint32_t -mq_add(uint32_t x, uint32_t y) -{ - /* - * We compute x + y - q. If the result is negative, then the - * high bit will be set, and 'd >> 31' will be equal to 1; - * thus '-(d >> 31)' will be an all-one pattern. Otherwise, - * it will be an all-zero pattern. In other words, this - * implements a conditional addition of q. 
- */ - uint32_t d; - - d = x + y - Q; - d += Q & -(d >> 31); - return d; -} - -/* - * Subtraction modulo q. Operands must be in the 0..q-1 range. - */ -static inline uint32_t -mq_sub(uint32_t x, uint32_t y) -{ - /* - * As in mq_add(), we use a conditional addition to ensure the - * result is in the 0..q-1 range. - */ - uint32_t d; - - d = x - y; - d += Q & -(d >> 31); - return d; -} - -/* - * Division by 2 modulo q. Operand must be in the 0..q-1 range. - */ -static inline uint32_t -mq_rshift1(uint32_t x) -{ - x += Q & -(x & 1); - return (x >> 1); -} - -/* - * Montgomery multiplication modulo q. If we set R = 2^16 mod q, then - * this function computes: x * y / R mod q - * Operands must be in the 0..q-1 range. - */ -static inline uint32_t -mq_montymul(uint32_t x, uint32_t y) -{ - uint32_t z, w; - - /* - * We compute x*y + k*q with a value of k chosen so that the 16 - * low bits of the result are 0. We can then shift the value. - * After the shift, result may still be larger than q, but it - * will be lower than 2*q, so a conditional subtraction works. - */ - - z = x * y; - w = ((z * Q0I) & 0xFFFF) * Q; - - /* - * When adding z and w, the result will have its low 16 bits - * equal to 0. Since x, y and z are lower than q, the sum will - * be no more than (2^15 - 1) * q + (q - 1)^2, which will - * fit on 29 bits. - */ - z = (z + w) >> 16; - - /* - * After the shift, analysis shows that the value will be less - * than 2q. We do a subtraction then conditional subtraction to - * ensure the result is in the expected range. - */ - z -= Q; - z += Q & -(z >> 31); - return z; -} - -/* - * Montgomery squaring (computes (x^2)/R). - */ -static inline uint32_t -mq_montysqr(uint32_t x) -{ - return mq_montymul(x, x); -} - -/* - * Divide x by y modulo q = 12289. - */ -static inline uint32_t -mq_div_12289(uint32_t x, uint32_t y) -{ - /* - * We invert y by computing y^(q-2) mod q. 
- * - * We use the following addition chain for exponent e = 12287: - * - * e0 = 1 - * e1 = 2 * e0 = 2 - * e2 = e1 + e0 = 3 - * e3 = e2 + e1 = 5 - * e4 = 2 * e3 = 10 - * e5 = 2 * e4 = 20 - * e6 = 2 * e5 = 40 - * e7 = 2 * e6 = 80 - * e8 = 2 * e7 = 160 - * e9 = e8 + e2 = 163 - * e10 = e9 + e8 = 323 - * e11 = 2 * e10 = 646 - * e12 = 2 * e11 = 1292 - * e13 = e12 + e9 = 1455 - * e14 = 2 * e13 = 2910 - * e15 = 2 * e14 = 5820 - * e16 = e15 + e10 = 6143 - * e17 = 2 * e16 = 12286 - * e18 = e17 + e0 = 12287 - * - * Additions on exponents are converted to Montgomery - * multiplications. We define all intermediate results as so - * many local variables, and let the C compiler work out which - * must be kept around. - */ - uint32_t y0, y1, y2, y3, y4, y5, y6, y7, y8, y9; - uint32_t y10, y11, y12, y13, y14, y15, y16, y17, y18; - - y0 = mq_montymul(y, R2); - y1 = mq_montysqr(y0); - y2 = mq_montymul(y1, y0); - y3 = mq_montymul(y2, y1); - y4 = mq_montysqr(y3); - y5 = mq_montysqr(y4); - y6 = mq_montysqr(y5); - y7 = mq_montysqr(y6); - y8 = mq_montysqr(y7); - y9 = mq_montymul(y8, y2); - y10 = mq_montymul(y9, y8); - y11 = mq_montysqr(y10); - y12 = mq_montysqr(y11); - y13 = mq_montymul(y12, y9); - y14 = mq_montysqr(y13); - y15 = mq_montysqr(y14); - y16 = mq_montymul(y15, y10); - y17 = mq_montysqr(y16); - y18 = mq_montymul(y17, y0); - - /* - * Final multiplication with x, which is not in Montgomery - * representation, computes the correct division result. - */ - return mq_montymul(y18, x); -} - -/* - * Compute NTT on a ring element. 
- */ -static void -mq_NTT(uint16_t *a, unsigned logn) -{ - size_t n, t, m; - - n = (size_t)1 << logn; - t = n; - for (m = 1; m < n; m <<= 1) { - size_t ht, i, j1; - - ht = t >> 1; - for (i = 0, j1 = 0; i < m; i ++, j1 += t) { - size_t j, j2; - uint32_t s; - - s = GMb[m + i]; - j2 = j1 + ht; - for (j = j1; j < j2; j ++) { - uint32_t u, v; - - u = a[j]; - v = mq_montymul(a[j + ht], s); - a[j] = (uint16_t)mq_add(u, v); - a[j + ht] = (uint16_t)mq_sub(u, v); - } - } - t = ht; - } -} - -/* - * Compute the inverse NTT on a ring element, binary case. - */ -static void -mq_iNTT(uint16_t *a, unsigned logn) -{ - size_t n, t, m; - uint32_t ni; - - n = (size_t)1 << logn; - t = 1; - m = n; - while (m > 1) { - size_t hm, dt, i, j1; - - hm = m >> 1; - dt = t << 1; - for (i = 0, j1 = 0; i < hm; i ++, j1 += dt) { - size_t j, j2; - uint32_t s; - - j2 = j1 + t; - s = iGMb[hm + i]; - for (j = j1; j < j2; j ++) { - uint32_t u, v, w; - - u = a[j]; - v = a[j + t]; - a[j] = (uint16_t)mq_add(u, v); - w = mq_sub(u, v); - a[j + t] = (uint16_t) - mq_montymul(w, s); - } - } - t = dt; - m = hm; - } - - /* - * To complete the inverse NTT, we must now divide all values by - * n (the vector size). We thus need the inverse of n, i.e. we - * need to divide 1 by 2 logn times. But we also want it in - * Montgomery representation, i.e. we also want to multiply it - * by R = 2^16. In the common case, this should be a simple right - * shift. The loop below is generic and works also in corner cases; - * its computation time is negligible. - */ - ni = R; - for (m = n; m > 1; m >>= 1) { - ni = mq_rshift1(ni); - } - for (m = 0; m < n; m ++) { - a[m] = (uint16_t)mq_montymul(a[m], ni); - } -} - -/* - * Convert a polynomial (mod q) to Montgomery representation. 
- */ -static void -mq_poly_tomonty(uint16_t *f, unsigned logn) -{ - size_t u, n; - - n = (size_t)1 << logn; - for (u = 0; u < n; u ++) { - f[u] = (uint16_t)mq_montymul(f[u], R2); - } -} - -/* - * Multiply two polynomials together (NTT representation, and using - * a Montgomery multiplication). Result f*g is written over f. - */ -static void -mq_poly_montymul_ntt(uint16_t *f, const uint16_t *g, unsigned logn) -{ - size_t u, n; - - n = (size_t)1 << logn; - for (u = 0; u < n; u ++) { - f[u] = (uint16_t)mq_montymul(f[u], g[u]); - } -} - -/* - * Subtract polynomial g from polynomial f. - */ -static void -mq_poly_sub(uint16_t *f, const uint16_t *g, unsigned logn) -{ - size_t u, n; - - n = (size_t)1 << logn; - for (u = 0; u < n; u ++) { - f[u] = (uint16_t)mq_sub(f[u], g[u]); - } -} - -/* ===================================================================== */ - -/* see inner.h */ -void -Zf(to_ntt_monty)(uint16_t *h, unsigned logn) -{ - mq_NTT(h, logn); - mq_poly_tomonty(h, logn); -} - -/* see inner.h */ -int -Zf(verify_raw)(const uint16_t *c0, const int16_t *s2, - const uint16_t *h, unsigned logn, uint8_t *tmp) -{ - size_t u, n; - uint16_t *tt; - - n = (size_t)1 << logn; - tt = (uint16_t *)tmp; - - /* - * Reduce s2 elements modulo q ([0..q-1] range). - */ - for (u = 0; u < n; u ++) { - uint32_t w; - - w = (uint32_t)s2[u]; - w += Q & -(w >> 31); - tt[u] = (uint16_t)w; - } - - /* - * Compute -s1 = s2*h - c0 mod phi mod q (in tt[]). - */ - mq_NTT(tt, logn); - mq_poly_montymul_ntt(tt, h, logn); - mq_iNTT(tt, logn); - mq_poly_sub(tt, c0, logn); - - /* - * Normalize -s1 elements into the [-q/2..q/2] range. - */ - for (u = 0; u < n; u ++) { - int32_t w; - - w = (int32_t)tt[u]; - w -= (int32_t)(Q & -(((Q >> 1) - (uint32_t)w) >> 31)); - ((int16_t *)tt)[u] = (int16_t)w; - } - - /* - * Signature is valid if and only if the aggregate (-s1,s2) vector - * is short enough. 
- */ - return Zf(is_short)((int16_t *)tt, s2, logn); -} - -/* see inner.h */ -int -Zf(compute_public)(uint16_t *h, - const int8_t *f, const int8_t *g, unsigned logn, uint8_t *tmp) -{ - size_t u, n; - uint16_t *tt; - - n = (size_t)1 << logn; - tt = (uint16_t *)tmp; - for (u = 0; u < n; u ++) { - tt[u] = (uint16_t)mq_conv_small(f[u]); - h[u] = (uint16_t)mq_conv_small(g[u]); - } - mq_NTT(h, logn); - mq_NTT(tt, logn); - for (u = 0; u < n; u ++) { - if (tt[u] == 0) { - return 0; - } - h[u] = (uint16_t)mq_div_12289(h[u], tt[u]); - } - mq_iNTT(h, logn); - return 1; -} - -/* see inner.h */ -int -Zf(complete_private)(int8_t *G, - const int8_t *f, const int8_t *g, const int8_t *F, - unsigned logn, uint8_t *tmp) -{ - size_t u, n; - uint16_t *t1, *t2; - - n = (size_t)1 << logn; - t1 = (uint16_t *)tmp; - t2 = t1 + n; - for (u = 0; u < n; u ++) { - t1[u] = (uint16_t)mq_conv_small(g[u]); - t2[u] = (uint16_t)mq_conv_small(F[u]); - } - mq_NTT(t1, logn); - mq_NTT(t2, logn); - mq_poly_tomonty(t1, logn); - mq_poly_montymul_ntt(t1, t2, logn); - for (u = 0; u < n; u ++) { - t2[u] = (uint16_t)mq_conv_small(f[u]); - } - mq_NTT(t2, logn); - for (u = 0; u < n; u ++) { - if (t2[u] == 0) { - return 0; - } - t1[u] = (uint16_t)mq_div_12289(t1[u], t2[u]); - } - mq_iNTT(t1, logn); - for (u = 0; u < n; u ++) { - uint32_t w; - int32_t gi; - - w = t1[u]; - w -= (Q & ~-((w - (Q >> 1)) >> 31)); - gi = *(int32_t *)&w; - if (gi < -127 || gi > +127) { - return 0; - } - G[u] = (int8_t)gi; - } - return 1; -} - -/* see inner.h */ -int -Zf(is_invertible)( - const int16_t *s2, unsigned logn, uint8_t *tmp) -{ - size_t u, n; - uint16_t *tt; - uint32_t r; - - n = (size_t)1 << logn; - tt = (uint16_t *)tmp; - for (u = 0; u < n; u ++) { - uint32_t w; - - w = (uint32_t)s2[u]; - w += Q & -(w >> 31); - tt[u] = (uint16_t)w; - } - mq_NTT(tt, logn); - r = 0; - for (u = 0; u < n; u ++) { - r |= (uint32_t)(tt[u] - 1); - } - return (int)(1u - (r >> 31)); -} - -/* see inner.h */ -int -Zf(verify_recover)(uint16_t *h, - const 
uint16_t *c0, const int16_t *s1, const int16_t *s2, - unsigned logn, uint8_t *tmp) -{ - size_t u, n; - uint16_t *tt; - uint32_t r; - - n = (size_t)1 << logn; - - /* - * Reduce elements of s1 and s2 modulo q; then write s2 into tt[] - * and c0 - s1 into h[]. - */ - tt = (uint16_t *)tmp; - for (u = 0; u < n; u ++) { - uint32_t w; - - w = (uint32_t)s2[u]; - w += Q & -(w >> 31); - tt[u] = (uint16_t)w; - - w = (uint32_t)s1[u]; - w += Q & -(w >> 31); - w = mq_sub(c0[u], w); - h[u] = (uint16_t)w; - } - - /* - * Compute h = (c0 - s1) / s2. If one of the coefficients of s2 - * is zero (in NTT representation) then the operation fails. We - * keep that information into a flag so that we do not deviate - * from strict constant-time processing; if all coefficients of - * s2 are non-zero, then the high bit of r will be zero. - */ - mq_NTT(tt, logn); - mq_NTT(h, logn); - r = 0; - for (u = 0; u < n; u ++) { - r |= (uint32_t)(tt[u] - 1); - h[u] = (uint16_t)mq_div_12289(h[u], tt[u]); - } - mq_iNTT(h, logn); - - /* - * Signature is acceptable if and only if it is short enough, - * and s2 was invertible mod phi mod q. The caller must still - * check that the rebuilt public key matches the expected - * value (e.g. through a hash). 
- */ - r = ~r & (uint32_t)-Zf(is_short)(s1, s2, logn); - return (int)(r >> 31); -} - -/* see inner.h */ -int -Zf(count_nttzero)(const int16_t *sig, unsigned logn, uint8_t *tmp) -{ - uint16_t *s2; - size_t u, n; - uint32_t r; - - n = (size_t)1 << logn; - s2 = (uint16_t *)tmp; - for (u = 0; u < n; u ++) { - uint32_t w; - - w = (uint32_t)sig[u]; - w += Q & -(w >> 31); - s2[u] = (uint16_t)w; - } - mq_NTT(s2, logn); - r = 0; - for (u = 0; u < n; u ++) { - uint32_t w; - - w = (uint32_t)s2[u] - 1u; - r += (w >> 31); - } - return (int)r; -} diff --git a/crypto_sign/falcon-512-tree/m4-ct/README.txt b/crypto_sign/falcon-512-tree/m4-ct/README.txt deleted file mode 100644 index 7bedf7f1..00000000 --- a/crypto_sign/falcon-512-tree/m4-ct/README.txt +++ /dev/null @@ -1,137 +0,0 @@ -Falcon implementation for PQM4 (or even mupq in general). - - -There are multiple variants. Each variant is selected with the choice of -api.h (four choices: api512dyn.h, api512tree.h, api1024dyn.h, -api1024tree.h), and additional compile-time macro that are documented in -config.h and can be set either in config.h, or through command-line -flags passed to the C compiler. - -Choice of api.h: - - api512dyn.h - "Normal" Falcon-512. Private key is reasonably compact. The - Falcon LDL tree is internally recomputed for each signature. - - api512tree.h - Falcon-512 is key expansion. The Falcon LDL tree is computed - as part of the keygen, and returned as private key. This - speeds up signature generation, but also greatly enlarges - the private key size. - - api1024dyn.h - "Normal" Falcon-1024. - - api1024tree.h - Falcon-1024 with key expansion. - -Compile-time options (config.h): - - FALCON_FPEMU - Set to 1 to enable use of the internal constant-time emulation - of floating-point operations. - - FALCON_FPNATIVE - Set to 1 to use the native 'double' type and floating-point - operations. 
On architectures that lack a FPU, this will use the - compiler-provided floating-point emulation routines, which are - usually not constant-time (and sometimes return values which - do not follow IEEE-754 rounding rules). - - FALCON_ASM_CORTEXM4 - Set to 1 to use the M4 assembly routine for the constant-time - emulation of floating-point operations. These are faster than - the generic routines in C activated by FALCON_FPEMU. - -There is some internal autodetection that tries to select the right -values automatically, but it's safer to explicitly select things: - - To use the native 'double' type: - -DFALCON_FPNATIVE=1 - - To use the generic FP emulation code: - -DFALCON_FPEMU=1 -DFALCON_ASM_CORTEXM4=0 - - To use the M4 assembly code for FP emulation: - -DFALCON_FPEMU=1 -DFALCON_ASM_CORTEXM4=1 - -The code relying on the native 'double' type requires an implementation -that follows IEEE-754 rules with a 64-bit type. It works on 64-bit x86 -and PowerPC / POWER systems. On 32-bit x86, it tends to fail because the -80387 FPU is used with more precision; on such a system, use -'-msse2 -mfpmath=sse' to force use of the SSE2 unit (this might be the -default on some systems, e.g. Darwin / macOS). - - -IMPORTANT NOTES -=============== - - * The PQM4 API is implemented in pqm4.c. Since the M4 stack is usually - small (usual default is 4 kB), temporary buffers are statically - allocated. This implies that the crypto_sign_keypair(), crypto_sign() - and crypto_sign_open() functions are not thread-safe or reentrant. - Also, the static allocation is "forever". - - See the comments for the 'tmp' variable in pqm4.c; this gives the - relevant sizes. - - * When using expanded keys, the private key contains 64-bit values - (floating-point, i.e. 'double' or 'uint64_t' depending on the kind - of floating-point emulation that is used). On many systems, this - implies some alignment requirements. I.e. 
crypto_sign_keypair() and - crypto_sign() then require the 'sk' pointer to be suitably aligned. - On an ARM Cortex M4, 32-bit alignment is required (while the basic - RAM access opcodes tolerate unaligned accesses, the 'ldm' and 'stm' - opcodes need 32-bit aligned pointers). - - * When using the native 'double' type, the code has a dependency on - the sqrt() function. On x86, the relevant SSE2 opcode is inlined, - but the library function is still (potentially) invoked in case the - operand is negative, so that proper error management is performed. - This case does not happen in Falcon, but the library function is - still referenced, and explicitly linking with '-lm' may be - necessary. - - * When using the native 'double' type, do _NOT_ enable -ffast-math. - The internal rounding function relies on the usual trick: - when x >= 0, round(x) = (x + 2**52) - 2**52 - - This trick works only as long as each addition is rounded as per - the IEEE-754 rules to the exact precision of the 64-bit type. - When -ffast-math is enabled, the compiler may assume commutativity - and "optimize" that expression into 'round(x) = x', which does not - work at all. - - -TESTS -===== - -In the 'tests/' directory is a generator for known-answer tests, and the -expected file. The code comes from the NIST, but was modified to avoid a -dependency on OpenSSL. When compiling the C source file against the -selected Falcon implementation, an executable is produced, that, when -executed, generates an '*.req' and an '*.rsp' files. The .req file is -redundant (the .rsp file contains all the information, and some more). - -The expected .rsp files are provided as: - KAT512dyn.rsp Falcon-512, no expanded key - KAT512tree.rsp Falcon-512, with expanded key - KAT1024dyn.rsp Falcon-1024, no expanded key - KAT1024tree.rsp Falcon-1024, with expanded key - - -Normally, all computations are exact and the files are exactly -reproducible. 
However, some discrepancies may occur with the '*tree' -files in the following cases: - - - On big-endian architectures, the bytes in sk[] will be in a - different order. This is a side effect of putting the raw bytes - of the expanded key in sk[] (this could be fixed with some - reencoding pass, but this was not implemented yet). - - - If a non-exact IEEE-754 implementation is used, some of the - low bits of the values may be changed. This may happen if the - underlying implementation is not strictly faithful to rounding. - -As long as only the 'sk' lines are changed, then the public keys -and signature values are unimpacted. diff --git a/crypto_sign/falcon-512-tree/m4-ct/api.h b/crypto_sign/falcon-512-tree/m4-ct/api.h deleted file mode 100644 index 81082b45..00000000 --- a/crypto_sign/falcon-512-tree/m4-ct/api.h +++ /dev/null @@ -1,17 +0,0 @@ -#include - -#define CRYPTO_SECRETKEYBYTES 57344 -#define CRYPTO_PUBLICKEYBYTES 897 -#define CRYPTO_BYTES 690 - -#define CRYPTO_ALGNAME "Falcon-512-tree" - -int crypto_sign_keypair(unsigned char *pk, unsigned char *sk); - -int crypto_sign(unsigned char *sm, size_t *smlen, - const unsigned char *m, size_t mlen, - const unsigned char *sk); - -int crypto_sign_open(unsigned char *m, size_t *mlen, - const unsigned char *sm, size_t smlen, - const unsigned char *pk); diff --git a/crypto_sign/falcon-512-tree/m4-ct/codec.c b/crypto_sign/falcon-512-tree/m4-ct/codec.c deleted file mode 100644 index 5bd61424..00000000 --- a/crypto_sign/falcon-512-tree/m4-ct/codec.c +++ /dev/null @@ -1,559 +0,0 @@ -/* - * Encoding/decoding of keys and signatures. 
- * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include "inner.h" - -/* see inner.h */ -size_t -Zf(modq_encode)( - void *out, size_t max_out_len, - const uint16_t *x, unsigned logn) -{ - size_t n, out_len, u; - uint8_t *buf; - uint32_t acc; - int acc_len; - - n = (size_t)1 << logn; - for (u = 0; u < n; u ++) { - if (x[u] >= 12289) { - return 0; - } - } - out_len = ((n * 14) + 7) >> 3; - if (out == NULL) { - return out_len; - } - if (out_len > max_out_len) { - return 0; - } - buf = out; - acc = 0; - acc_len = 0; - for (u = 0; u < n; u ++) { - acc = (acc << 14) | x[u]; - acc_len += 14; - while (acc_len >= 8) { - acc_len -= 8; - *buf ++ = (uint8_t)(acc >> acc_len); - } - } - if (acc_len > 0) { - *buf = (uint8_t)(acc << (8 - acc_len)); - } - return out_len; -} - -/* see inner.h */ -size_t -Zf(modq_decode)( - uint16_t *x, unsigned logn, - const void *in, size_t max_in_len) -{ - size_t n, in_len, u; - const uint8_t *buf; - uint32_t acc; - int acc_len; - - n = (size_t)1 << logn; - in_len = ((n * 14) + 7) >> 3; - if (in_len > max_in_len) { - return 0; - } - buf = in; - acc = 0; - acc_len = 0; - u = 0; - while (u < n) { - acc = (acc << 8) | (*buf ++); - acc_len += 8; - if (acc_len >= 14) { - unsigned w; - - acc_len -= 14; - w = (acc >> acc_len) & 0x3FFF; - if (w >= 12289) { - return 0; - } - x[u ++] = (uint16_t)w; - } - } - if ((acc & (((uint32_t)1 << acc_len) - 1)) != 0) { - return 0; - } - return in_len; -} - -/* see inner.h */ -size_t -Zf(trim_i16_encode)( - void *out, size_t max_out_len, - const int16_t *x, unsigned logn, unsigned bits) -{ - size_t n, u, out_len; - int minv, maxv; - uint8_t *buf; - uint32_t acc, mask; - unsigned acc_len; - - n = (size_t)1 << logn; - maxv = (1 << (bits - 1)) - 1; - minv = -maxv; - for (u = 0; u < n; u ++) { - if (x[u] < minv || x[u] > maxv) { - return 0; - } - } - out_len = ((n * bits) + 7) >> 3; - if (out == NULL) { - return out_len; - } - if (out_len > max_out_len) { 
- return 0; - } - buf = out; - acc = 0; - acc_len = 0; - mask = ((uint32_t)1 << bits) - 1; - for (u = 0; u < n; u ++) { - acc = (acc << bits) | ((uint16_t)x[u] & mask); - acc_len += bits; - while (acc_len >= 8) { - acc_len -= 8; - *buf ++ = (uint8_t)(acc >> acc_len); - } - } - if (acc_len > 0) { - *buf ++ = (uint8_t)(acc << (8 - acc_len)); - } - return out_len; -} - -/* see inner.h */ -size_t -Zf(trim_i16_decode)( - int16_t *x, unsigned logn, unsigned bits, - const void *in, size_t max_in_len) -{ - size_t n, in_len; - const uint8_t *buf; - size_t u; - uint32_t acc, mask1, mask2; - unsigned acc_len; - - n = (size_t)1 << logn; - in_len = ((n * bits) + 7) >> 3; - if (in_len > max_in_len) { - return 0; - } - buf = in; - u = 0; - acc = 0; - acc_len = 0; - mask1 = ((uint32_t)1 << bits) - 1; - mask2 = (uint32_t)1 << (bits - 1); - while (u < n) { - acc = (acc << 8) | *buf ++; - acc_len += 8; - while (acc_len >= bits && u < n) { - uint32_t w; - - acc_len -= bits; - w = (acc >> acc_len) & mask1; - w |= -(w & mask2); - if (w == -mask2) { - /* - * The -2^(bits-1) value is forbidden. - */ - return 0; - } - w |= -(w & mask2); - x[u ++] = (int16_t)*(int32_t *)&w; - } - } - if ((acc & (((uint32_t)1 << acc_len) - 1)) != 0) { - /* - * Extra bits in the last byte must be zero. 
- */ - return 0; - } - return in_len; -} - -/* see inner.h */ -size_t -Zf(trim_i8_encode)( - void *out, size_t max_out_len, - const int8_t *x, unsigned logn, unsigned bits) -{ - size_t n, u, out_len; - int minv, maxv; - uint8_t *buf; - uint32_t acc, mask; - unsigned acc_len; - - n = (size_t)1 << logn; - maxv = (1 << (bits - 1)) - 1; - minv = -maxv; - for (u = 0; u < n; u ++) { - if (x[u] < minv || x[u] > maxv) { - return 0; - } - } - out_len = ((n * bits) + 7) >> 3; - if (out == NULL) { - return out_len; - } - if (out_len > max_out_len) { - return 0; - } - buf = out; - acc = 0; - acc_len = 0; - mask = ((uint32_t)1 << bits) - 1; - for (u = 0; u < n; u ++) { - acc = (acc << bits) | ((uint8_t)x[u] & mask); - acc_len += bits; - while (acc_len >= 8) { - acc_len -= 8; - *buf ++ = (uint8_t)(acc >> acc_len); - } - } - if (acc_len > 0) { - *buf ++ = (uint8_t)(acc << (8 - acc_len)); - } - return out_len; -} - -/* see inner.h */ -size_t -Zf(trim_i8_decode)( - int8_t *x, unsigned logn, unsigned bits, - const void *in, size_t max_in_len) -{ - size_t n, in_len; - const uint8_t *buf; - size_t u; - uint32_t acc, mask1, mask2; - unsigned acc_len; - - n = (size_t)1 << logn; - in_len = ((n * bits) + 7) >> 3; - if (in_len > max_in_len) { - return 0; - } - buf = in; - u = 0; - acc = 0; - acc_len = 0; - mask1 = ((uint32_t)1 << bits) - 1; - mask2 = (uint32_t)1 << (bits - 1); - while (u < n) { - acc = (acc << 8) | *buf ++; - acc_len += 8; - while (acc_len >= bits && u < n) { - uint32_t w; - - acc_len -= bits; - w = (acc >> acc_len) & mask1; - w |= -(w & mask2); - if (w == -mask2) { - /* - * The -2^(bits-1) value is forbidden. - */ - return 0; - } - x[u ++] = (int8_t)*(int32_t *)&w; - } - } - if ((acc & (((uint32_t)1 << acc_len) - 1)) != 0) { - /* - * Extra bits in the last byte must be zero. 
- */ - return 0; - } - return in_len; -} - -/* see inner.h */ -size_t -Zf(comp_encode)( - void *out, size_t max_out_len, - const int16_t *x, unsigned logn) -{ - uint8_t *buf; - size_t n, u, v; - uint32_t acc; - unsigned acc_len; - - n = (size_t)1 << logn; - buf = out; - - /* - * Make sure that all values are within the -2047..+2047 range. - */ - for (u = 0; u < n; u ++) { - if (x[u] < -2047 || x[u] > +2047) { - return 0; - } - } - - acc = 0; - acc_len = 0; - v = 0; - for (u = 0; u < n; u ++) { - int t; - unsigned w; - - /* - * Get sign and absolute value of next integer; push the - * sign bit. - */ - acc <<= 1; - t = x[u]; - if (t < 0) { - t = -t; - acc |= 1; - } - w = (unsigned)t; - - /* - * Push the low 7 bits of the absolute value. - */ - acc <<= 7; - acc |= w & 127u; - w >>= 7; - - /* - * We pushed exactly 8 bits. - */ - acc_len += 8; - - /* - * Push as many zeros as necessary, then a one. Since the - * absolute value is at most 2047, w can only range up to - * 15 at this point, thus we will add at most 16 bits - * here. With the 8 bits above and possibly up to 7 bits - * from previous iterations, we may go up to 31 bits, which - * will fit in the accumulator, which is an uint32_t. - */ - acc <<= (w + 1); - acc |= 1; - acc_len += w + 1; - - /* - * Produce all full bytes. - */ - while (acc_len >= 8) { - acc_len -= 8; - if (buf != NULL) { - if (v >= max_out_len) { - return 0; - } - buf[v] = (uint8_t)(acc >> acc_len); - } - v ++; - } - } - - /* - * Flush remaining bits (if any). 
- */ - if (acc_len > 0) { - if (buf != NULL) { - if (v >= max_out_len) { - return 0; - } - buf[v] = (uint8_t)(acc << (8 - acc_len)); - } - v ++; - } - - return v; -} - -/* see inner.h */ -size_t -Zf(comp_decode)( - int16_t *x, unsigned logn, - const void *in, size_t max_in_len) -{ - const uint8_t *buf; - size_t n, u, v; - uint32_t acc; - unsigned acc_len; - - n = (size_t)1 << logn; - buf = in; - acc = 0; - acc_len = 0; - v = 0; - for (u = 0; u < n; u ++) { - unsigned b, s, m; - - /* - * Get next eight bits: sign and low seven bits of the - * absolute value. - */ - if (v >= max_in_len) { - return 0; - } - acc = (acc << 8) | (uint32_t)buf[v ++]; - b = acc >> acc_len; - s = b & 128; - m = b & 127; - - /* - * Get next bits until a 1 is reached. - */ - for (;;) { - if (acc_len == 0) { - if (v >= max_in_len) { - return 0; - } - acc = (acc << 8) | (uint32_t)buf[v ++]; - acc_len = 8; - } - acc_len --; - if (((acc >> acc_len) & 1) != 0) { - break; - } - m += 128; - if (m > 2047) { - return 0; - } - } - x[u] = (int16_t)(s ? -(int)m : (int)m); - } - return v; -} - -/* - * Key elements and signatures are polynomials with small integer - * coefficients. Here are some statistics gathered over many - * generated key pairs (10000 or more for each degree): - * - * log(n) n max(f,g) std(f,g) max(F,G) std(F,G) - * 1 2 129 56.31 143 60.02 - * 2 4 123 40.93 160 46.52 - * 3 8 97 28.97 159 38.01 - * 4 16 100 21.48 154 32.50 - * 5 32 71 15.41 151 29.36 - * 6 64 59 11.07 138 27.77 - * 7 128 39 7.91 144 27.00 - * 8 256 32 5.63 148 26.61 - * 9 512 22 4.00 137 26.46 - * 10 1024 15 2.84 146 26.41 - * - * We want a compact storage format for private key, and, as part of - * key generation, we are allowed to reject some keys which would - * otherwise be fine (this does not induce any noticeable vulnerability - * as long as we reject only a small proportion of possible keys). 
- * Hence, we enforce at key generation time maximum values for the - * elements of f, g, F and G, so that their encoding can be expressed - * in fixed-width values. Limits have been chosen so that generated - * keys are almost always within bounds, thus not impacting neither - * security or performance. - * - * IMPORTANT: the code assumes that all coefficients of f, g, F and G - * ultimately fit in the -127..+127 range. Thus, none of the elements - * of max_fg_bits[] and max_FG_bits[] shall be greater than 8. - */ - -const uint8_t Zf(max_fg_bits)[] = { - 0, /* unused */ - 8, - 8, - 8, - 8, - 8, - 7, - 7, - 6, - 6, - 5 -}; - -const uint8_t Zf(max_FG_bits)[] = { - 0, /* unused */ - 8, - 8, - 8, - 8, - 8, - 8, - 8, - 8, - 8, - 8 -}; - -/* - * When generating a new key pair, we can always reject keys which - * feature an abnormally large coefficient. This can also be done for - * signatures, albeit with some care: in case the signature process is - * used in a derandomized setup (explicitly seeded with the message and - * private key), we have to follow the specification faithfully, and the - * specification only enforces a limit on the L2 norm of the signature - * vector. The limit on the L2 norm implies that the absolute value of - * a coefficient of the signature cannot be more than the following: - * - * log(n) n max sig coeff (theoretical) - * 1 2 412 - * 2 4 583 - * 3 8 824 - * 4 16 1166 - * 5 32 1649 - * 6 64 2332 - * 7 128 3299 - * 8 256 4665 - * 9 512 6598 - * 10 1024 9331 - * - * However, the largest observed signature coefficients during our - * experiments was 1077 (in absolute value), hence we can assume that, - * with overwhelming probability, signature coefficients will fit - * in -2047..2047, i.e. 12 bits. 
- */ - -const uint8_t Zf(max_sig_bits)[] = { - 0, /* unused */ - 10, - 11, - 11, - 12, - 12, - 12, - 12, - 12, - 12, - 12 -}; diff --git a/crypto_sign/falcon-512-tree/m4-ct/common.c b/crypto_sign/falcon-512-tree/m4-ct/common.c deleted file mode 100644 index ef30028b..00000000 --- a/crypto_sign/falcon-512-tree/m4-ct/common.c +++ /dev/null @@ -1,297 +0,0 @@ -/* - * Support functions for signatures (hash-to-point, norm). - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include "inner.h" - -/* see inner.h */ -void -Zf(hash_to_point_vartime)( - inner_shake256_context *sc, - uint16_t *x, unsigned logn) -{ - /* - * This is the straightforward per-the-spec implementation. 
It - * is not constant-time, thus it might reveal information on the - * plaintext (at least, enough to check the plaintext against a - * list of potential plaintexts) in a scenario where the - * attacker does not have access to the signature value or to - * the public key, but knows the nonce (without knowledge of the - * nonce, the hashed output cannot be matched against potential - * plaintexts). - */ - size_t n; - - n = (size_t)1 << logn; - while (n > 0) { - uint8_t buf[2]; - uint32_t w; - - inner_shake256_extract(sc, (void *)buf, sizeof buf); - w = ((unsigned)buf[0] << 8) | (unsigned)buf[1]; - if (w < 61445) { - while (w >= 12289) { - w -= 12289; - } - *x ++ = (uint16_t)w; - n --; - } - } -} - -/* see inner.h */ -void -Zf(hash_to_point_ct)( - inner_shake256_context *sc, - uint16_t *x, unsigned logn, uint8_t *tmp) -{ - /* - * Each 16-bit sample is a value in 0..65535. The value is - * kept if it falls in 0..61444 (because 61445 = 5*12289) - * and rejected otherwise; thus, each sample has probability - * about 0.93758 of being selected. - * - * We want to oversample enough to be sure that we will - * have enough values with probability at least 1 - 2^(-256). - * Depending on degree N, this leads to the following - * required oversampling: - * - * logn n oversampling - * 1 2 65 - * 2 4 67 - * 3 8 71 - * 4 16 77 - * 5 32 86 - * 6 64 100 - * 7 128 122 - * 8 256 154 - * 9 512 205 - * 10 1024 287 - * - * If logn >= 7, then the provided temporary buffer is large - * enough. Otherwise, we use a stack buffer of 63 entries - * (i.e. 126 bytes) for the values that do not fit in tmp[]. - */ - - static const uint16_t overtab[] = { - 0, /* unused */ - 65, - 67, - 71, - 77, - 86, - 100, - 122, - 154, - 205, - 287 - }; - - unsigned n, n2, u, m, p, over; - uint16_t *tt1, tt2[63]; - - /* - * We first generate m 16-bit value. Values 0..n-1 go to x[]. - * Values n..2*n-1 go to tt1[]. Values 2*n and later go to tt2[]. 
- * We also reduce modulo q the values; rejected values are set - * to 0xFFFF. - */ - n = 1U << logn; - n2 = n << 1; - over = overtab[logn]; - m = n + over; - tt1 = (uint16_t *)tmp; - for (u = 0; u < m; u ++) { - uint8_t buf[2]; - uint32_t w, wr; - - inner_shake256_extract(sc, buf, sizeof buf); - w = ((uint32_t)buf[0] << 8) | (uint32_t)buf[1]; - wr = w - ((uint32_t)24578 & (((w - 24578) >> 31) - 1)); - wr = wr - ((uint32_t)24578 & (((wr - 24578) >> 31) - 1)); - wr = wr - ((uint32_t)12289 & (((wr - 12289) >> 31) - 1)); - wr |= ((w - 61445) >> 31) - 1; - if (u < n) { - x[u] = (uint16_t)wr; - } else if (u < n2) { - tt1[u - n] = (uint16_t)wr; - } else { - tt2[u - n2] = (uint16_t)wr; - } - } - - /* - * Now we must "squeeze out" the invalid values. We do this in - * a logarithmic sequence of passes; each pass computes where a - * value should go, and moves it down by 'p' slots if necessary, - * where 'p' uses an increasing powers-of-two scale. It can be - * shown that in all cases where the loop decides that a value - * has to be moved down by p slots, the destination slot is - * "free" (i.e. contains an invalid value). - */ - for (p = 1; p <= over; p <<= 1) { - unsigned v; - - /* - * In the loop below: - * - * - v contains the index of the final destination of - * the value; it is recomputed dynamically based on - * whether values are valid or not. - * - * - u is the index of the value we consider ("source"); - * its address is s. - * - * - The loop may swap the value with the one at index - * u-p. The address of the swap destination is d. - */ - v = 0; - for (u = 0; u < m; u ++) { - uint16_t *s, *d; - unsigned j, sv, dv, mk; - - if (u < n) { - s = &x[u]; - } else if (u < n2) { - s = &tt1[u - n]; - } else { - s = &tt2[u - n2]; - } - sv = *s; - - /* - * The value in sv should ultimately go to - * address v, i.e. jump back by u-v slots. - */ - j = u - v; - - /* - * We increment v for the next iteration, but - * only if the source value is valid. 
The mask - * 'mk' is -1 if the value is valid, 0 otherwise, - * so we _subtract_ mk. - */ - mk = (sv >> 15) - 1U; - v -= mk; - - /* - * In this loop we consider jumps by p slots; if - * u < p then there is nothing more to do. - */ - if (u < p) { - continue; - } - - /* - * Destination for the swap: value at address u-p. - */ - if ((u - p) < n) { - d = &x[u - p]; - } else if ((u - p) < n2) { - d = &tt1[(u - p) - n]; - } else { - d = &tt2[(u - p) - n2]; - } - dv = *d; - - /* - * The swap should be performed only if the source - * is valid AND the jump j has its 'p' bit set. - */ - mk &= -(((j & p) + 0x1FF) >> 9); - - *s = (uint16_t)(sv ^ (mk & (sv ^ dv))); - *d = (uint16_t)(dv ^ (mk & (sv ^ dv))); - } - } -} - -/* see inner.h */ -int -Zf(is_short)( - const int16_t *s1, const int16_t *s2, unsigned logn) -{ - /* - * We use the l2-norm. Code below uses only 32-bit operations to - * compute the square of the norm with saturation to 2^32-1 if - * the value exceeds 2^31-1. - */ - size_t n, u; - uint32_t s, ng; - - n = (size_t)1 << logn; - s = 0; - ng = 0; - for (u = 0; u < n; u ++) { - int32_t z; - - z = s1[u]; - s += (uint32_t)(z * z); - ng |= s; - z = s2[u]; - s += (uint32_t)(z * z); - ng |= s; - } - s |= -(ng >> 31); - - /* - * Acceptance bound on the l2-norm is: - * 1.2*1.55*sqrt(q)*sqrt(2*N) - * Value 7085 is floor((1.2^2)*(1.55^2)*2*1024). - */ - return s < (((uint32_t)7085 * (uint32_t)12289) >> (10 - logn)); -} - -/* see inner.h */ -int -Zf(is_short_half)( - uint32_t sqn, const int16_t *s2, unsigned logn) -{ - size_t n, u; - uint32_t ng; - - n = (size_t)1 << logn; - ng = -(sqn >> 31); - for (u = 0; u < n; u ++) { - int32_t z; - - z = s2[u]; - sqn += (uint32_t)(z * z); - ng |= sqn; - } - sqn |= -(ng >> 31); - - /* - * Acceptance bound on the l2-norm is: - * 1.2*1.55*sqrt(q)*sqrt(2*N) - * Value 7085 is floor((1.2^2)*(1.55^2)*2*1024). 
- */ - return sqn < (((uint32_t)7085 * (uint32_t)12289) >> (10 - logn)); -} diff --git a/crypto_sign/falcon-512-tree/m4-ct/config.h b/crypto_sign/falcon-512-tree/m4-ct/config.h deleted file mode 100644 index cd78727e..00000000 --- a/crypto_sign/falcon-512-tree/m4-ct/config.h +++ /dev/null @@ -1,205 +0,0 @@ -/* - * Manual configuration file for the Falcon implementation. Here can - * be set some compilation-time options. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#ifndef FALCON_CONFIG_H__ -#define FALCON_CONFIG_H__ - -/* - * Each option is a macro which should be defined to either 1 or 0. 
- * If any of the options below is left undefined, then a default value - * will be used by the code, possibly using compile-time autodetection - * from compiler-defined macros. - * - * Explicitly setting a parameter can be done by uncommenting/modifying - * its definition below, in this file, or equivalently by setting it as - * a compiler flag. - */ - -/* - * Use the native 'double' C type for floating-point computations. Exact - * reproducibility of all tests requires that type to faithfully follow - * IEEE-754 "round-to-nearest" rules. - * - * Native double support will use the CPU hardware and/or - * compiler-provided functions; the latter is typically NOT - * constant-time, while the former MAY be constant-time, or not. On - * recent x86 CPU in 64-bit mode, SSE2 opcodes are used and they provide - * constant-time operations for all the operations used in Falcon, - * except for some special cases of divisions and square roots, but it - * can be shown that theses cases imply only negligible leak of - * information that cannot be leveraged into a full attack. - * - * If neither FALCON_FPNATIVE nor FALCON_FPEMU is defined, then use of - * the native 'double' C type is the default behaviour unless - * FALCON_ASM_CORTEXM4 is defined to 1, in which case the emulated code - * will be used. - * -#define FALCON_FPNATIVE 1 - */ - -/* - * Use emulated floating-point implementation. - * - * Emulation uses only integer operations with uint32_t and uint64_t - * types. This is constant-time, provided that the underlying platform - * offers constant-time opcodes for the following operations: - * - * - Multiplication of two 32-bit unsigned integers into a 64-bit result. - * - Left-shift or right-shift of a 32-bit unsigned integer by a - * potentially secret shift count in the 0..31 range. - * - * Notably, the ARM Cortex M3 does not fulfill the first condition, - * while the Pentium IV does not fulfill the second. 
- * - * If neither FALCON_FPNATIVE nor FALCON_FPEMU is defined, then use of - * the native 'double' C type is the default behaviour unless - * FALCON_ASM_CORTEXM4 is defined to 1, in which case the emulated code - * will be used. - * -#define FALCON_FPEMU 1 - */ - -/* - * Enable use of assembly for ARM Cortex-M4 CPU. By default, such - * support will be used based on some autodection on the compiler - * version and target architecture. Define this variable to 1 to force - * use of the assembly code, or 0 to disable it regardless of the - * autodetection. - * - * When FALCON_ASM_CORTEXM4 is enabled (whether defined explicitly or - * autodetected), emulated floating-point code will be used, unless - * FALCON_FPNATIVE or FALCON_FPEMU is explicitly set to override the - * choice. Emulated code with ARM assembly is constant-time and provides - * better performance than emulated code with plain C. - * - * The assembly code for the M4 can also work on a Cortex-M3. If the - * compiler is instructed to target the M3 (e.g. '-mcpu=cortex-m3' with - * GCC) then FALCON_ASM_CORTEXM4 won't be autodetected, but it can be - * enabled explicitly. Take care, though, that the M3 multiplication - * opcode (multiplication of two 32-bit unsigned integers with a 64-bit - * result) is NOT constant-time. - * -#define FALCON_ASM_CORTEXM4 1 - */ - -#define FALCON_ASM_CORTEXM4 1 - -/* - * Enable use of AVX2 intrinsics. If enabled, then the code will compile - * only when targeting x86 with a compiler that supports AVX2 intrinsics - * (tested with GCC 7.4.0, Clang 6.0.0, and MSVC 2015, both in 32-bit - * and 64-bit modes), and run only on systems that offer the AVX2 - * opcodes. Some operations leverage AVX2 for better performance. - * -#define FALCON_AVX2 1 - */ - -/* - * Enable use of FMA intrinsics. This setting has any effect only if - * FALCON_AVX2 is also enabled. The FMA intrinsics are normally available - * on any x86 CPU that also has AVX2. 
Note that setting this option will - * slightly modify the values of expanded private keys, but will normally - * not change the values of non-expanded private keys, public keys or - * signatures, for a given keygen/sign seed (non-expanded private keys - * and signatures might theoretically change, but only with low probability, - * less than 2^(-40); produced signatures are still safe and interoperable). - * -#define FALCON_FMA 1 - */ - -/* - * Assert that the platform uses little-endian encoding. If enabled, - * then encoding and decoding of aligned multibyte values will be - * slightly faster (especially for hashing and random number - * generation). If not defined explicitly, then autodetection is - * applied. - * -#define FALCON_LE 1 - */ - -/* - * Assert that the platform tolerates accesses to unaligned multibyte - * values. If enabled, then some operations are slightly faster. Note - * that ARM Cortex M4 do _not_ fully tolerate unaligned accesses; for - * such systems, this option should not be enabled. If not defined - * explicitly, then autodetection is applied. - * -#define FALCON_UNALIGNED 1 - */ - -/* - * Use a PRNG based on ChaCha20 and seeded with SHAKE256, instead of - * SHAKE256 directly, for key pair generation purposes. This speeds up - * key pair generation, especially on platforms where SHAKE256 is - * comparatively slow: on the ARM Cortex M4, average key generation time - * is reduced by 19% with this setting; on a recent x86 Skylake, the - * reduction is smaller (less than 8%). - * - * However, this setting changes the private/public key pair obtained - * from a given seed, thus preventing reproducibility of the - * known-answer tests vectors. For compatibility with existing KAT - * vectors (e.g. in PQClean, pqm4 and NIST implementations), this - * setting is not enabled by default. - * -#define FALCON_KG_CHACHA20 1 - */ - -/* - * Use an explicit OS-provided source of randomness for seeding (for the - * Zf(get_seed)() function implementation). 
Three possible sources are - * defined: - * - * - getentropy() system call - * - /dev/urandom special file - * - CryptGenRandom() function call - * - * More than one source may be enabled, in which case they will be tried - * in the order above, until a success is reached. - * - * By default, sources are enabled at compile-time based on these - * conditions: - * - * - getentropy(): target is one of: Linux with Glibc-2.25+, FreeBSD 12+, - * or OpenBSD. - * - /dev/urandom: target is a Unix-like system (including Linux, - * FreeBSD, NetBSD, OpenBSD, DragonFly, macOS, Android, Solaris, AIX). - * - CryptGenRandom(): target is Windows (Win32 or Win64). - * - * On most small embedded systems, none will be enabled and Zf(get_seed)() - * will always return 0. Applications will need to provide their own seeds. - * -#define FALCON_RAND_GETENTROPY 1 -#define FALCON_RAND_URANDOM 1 -#define FALCON_RAND_WIN32 1 - */ - -#endif diff --git a/crypto_sign/falcon-512-tree/m4-ct/fft.c b/crypto_sign/falcon-512-tree/m4-ct/fft.c deleted file mode 100644 index b1904b24..00000000 --- a/crypto_sign/falcon-512-tree/m4-ct/fft.c +++ /dev/null @@ -1,1412 +0,0 @@ -/* - * FFT code. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include "inner.h" - -/* - * Rules for complex number macros: - * -------------------------------- - * - * Operand order is: destination, source1, source2... - * - * Each operand is a real and an imaginary part. - * - * All overlaps are allowed. - */ - -/* - * Addition of two complex numbers (d = a + b). - */ -#define FPC_ADD(d_re, d_im, a_re, a_im, b_re, b_im) do { \ - fpr fpct_re, fpct_im; \ - fpct_re = fpr_add(a_re, b_re); \ - fpct_im = fpr_add(a_im, b_im); \ - (d_re) = fpct_re; \ - (d_im) = fpct_im; \ - } while (0) - -/* - * Subtraction of two complex numbers (d = a - b). - */ -#define FPC_SUB(d_re, d_im, a_re, a_im, b_re, b_im) do { \ - fpr fpct_re, fpct_im; \ - fpct_re = fpr_sub(a_re, b_re); \ - fpct_im = fpr_sub(a_im, b_im); \ - (d_re) = fpct_re; \ - (d_im) = fpct_im; \ - } while (0) - -/* - * Multplication of two complex numbers (d = a * b). 
- */ -#define FPC_MUL(d_re, d_im, a_re, a_im, b_re, b_im) do { \ - fpr fpct_a_re, fpct_a_im; \ - fpr fpct_b_re, fpct_b_im; \ - fpr fpct_d_re, fpct_d_im; \ - fpct_a_re = (a_re); \ - fpct_a_im = (a_im); \ - fpct_b_re = (b_re); \ - fpct_b_im = (b_im); \ - fpct_d_re = fpr_sub( \ - fpr_mul(fpct_a_re, fpct_b_re), \ - fpr_mul(fpct_a_im, fpct_b_im)); \ - fpct_d_im = fpr_add( \ - fpr_mul(fpct_a_re, fpct_b_im), \ - fpr_mul(fpct_a_im, fpct_b_re)); \ - (d_re) = fpct_d_re; \ - (d_im) = fpct_d_im; \ - } while (0) - -/* - * Squaring of a complex number (d = a * a). - */ -#define FPC_SQR(d_re, d_im, a_re, a_im) do { \ - fpr fpct_a_re, fpct_a_im; \ - fpr fpct_d_re, fpct_d_im; \ - fpct_a_re = (a_re); \ - fpct_a_im = (a_im); \ - fpct_d_re = fpr_sub(fpr_sqr(fpct_a_re), fpr_sqr(fpct_a_im)); \ - fpct_d_im = fpr_double(fpr_mul(fpct_a_re, fpct_a_im)); \ - (d_re) = fpct_d_re; \ - (d_im) = fpct_d_im; \ - } while (0) - -/* - * Inversion of a complex number (d = 1 / a). - */ -#define FPC_INV(d_re, d_im, a_re, a_im) do { \ - fpr fpct_a_re, fpct_a_im; \ - fpr fpct_d_re, fpct_d_im; \ - fpr fpct_m; \ - fpct_a_re = (a_re); \ - fpct_a_im = (a_im); \ - fpct_m = fpr_add(fpr_sqr(fpct_a_re), fpr_sqr(fpct_a_im)); \ - fpct_m = fpr_inv(fpct_m); \ - fpct_d_re = fpr_mul(fpct_a_re, fpct_m); \ - fpct_d_im = fpr_mul(fpr_neg(fpct_a_im), fpct_m); \ - (d_re) = fpct_d_re; \ - (d_im) = fpct_d_im; \ - } while (0) - -/* - * Division of complex numbers (d = a / b). 
- */ -#define FPC_DIV(d_re, d_im, a_re, a_im, b_re, b_im) do { \ - fpr fpct_a_re, fpct_a_im; \ - fpr fpct_b_re, fpct_b_im; \ - fpr fpct_d_re, fpct_d_im; \ - fpr fpct_m; \ - fpct_a_re = (a_re); \ - fpct_a_im = (a_im); \ - fpct_b_re = (b_re); \ - fpct_b_im = (b_im); \ - fpct_m = fpr_add(fpr_sqr(fpct_b_re), fpr_sqr(fpct_b_im)); \ - fpct_m = fpr_inv(fpct_m); \ - fpct_b_re = fpr_mul(fpct_b_re, fpct_m); \ - fpct_b_im = fpr_mul(fpr_neg(fpct_b_im), fpct_m); \ - fpct_d_re = fpr_sub( \ - fpr_mul(fpct_a_re, fpct_b_re), \ - fpr_mul(fpct_a_im, fpct_b_im)); \ - fpct_d_im = fpr_add( \ - fpr_mul(fpct_a_re, fpct_b_im), \ - fpr_mul(fpct_a_im, fpct_b_re)); \ - (d_re) = fpct_d_re; \ - (d_im) = fpct_d_im; \ - } while (0) - -/* - * Let w = exp(i*pi/N); w is a primitive 2N-th root of 1. We define the - * values w_j = w^(2j+1) for all j from 0 to N-1: these are the roots - * of X^N+1 in the field of complex numbers. A crucial property is that - * w_{N-1-j} = conj(w_j) = 1/w_j for all j. - * - * FFT representation of a polynomial f (taken modulo X^N+1) is the - * set of values f(w_j). Since f is real, conj(f(w_j)) = f(conj(w_j)), - * thus f(w_{N-1-j}) = conj(f(w_j)). We thus store only half the values, - * for j = 0 to N/2-1; the other half can be recomputed easily when (if) - * needed. A consequence is that FFT representation has the same size - * as normal representation: N/2 complex numbers use N real numbers (each - * complex number is the combination of a real and an imaginary part). - * - * We use a specific ordering which makes computations easier. Let rev() - * be the bit-reversal function over log(N) bits. For j in 0..N/2-1, we - * store the real and imaginary parts of f(w_j) in slots: - * - * Re(f(w_j)) -> slot rev(j)/2 - * Im(f(w_j)) -> slot rev(j)/2+N/2 - * - * (Note that rev(j) is even for j < N/2.) 
- */ - -/* see inner.h */ -TARGET_AVX2 -void -Zf(FFT)(fpr *f, unsigned logn) -{ - /* - * FFT algorithm in bit-reversal order uses the following - * iterative algorithm: - * - * t = N - * for m = 1; m < N; m *= 2: - * ht = t/2 - * for i1 = 0; i1 < m; i1 ++: - * j1 = i1 * t - * s = GM[m + i1] - * for j = j1; j < (j1 + ht); j ++: - * x = f[j] - * y = s * f[j + ht] - * f[j] = x + y - * f[j + ht] = x - y - * t = ht - * - * GM[k] contains w^rev(k) for primitive root w = exp(i*pi/N). - * - * In the description above, f[] is supposed to contain complex - * numbers. In our in-memory representation, the real and - * imaginary parts of f[k] are in array slots k and k+N/2. - * - * We only keep the first half of the complex numbers. We can - * see that after the first iteration, the first and second halves - * of the array of complex numbers have separate lives, so we - * simply ignore the second part. - */ - - unsigned u; - size_t t, n, hn, m; - - /* - * First iteration: compute f[j] + i * f[j+N/2] for all j < N/2 - * (because GM[1] = w^rev(1) = w^(N/2) = i). - * In our chosen representation, this is a no-op: everything is - * already where it should be. - */ - - /* - * Subsequent iterations are truncated to use only the first - * half of values. 
- */ - n = (size_t)1 << logn; - hn = n >> 1; - t = hn; - for (u = 1, m = 2; u < logn; u ++, m <<= 1) { - size_t ht, hm, i1, j1; - - ht = t >> 1; - hm = m >> 1; - for (i1 = 0, j1 = 0; i1 < hm; i1 ++, j1 += t) { - size_t j, j2; - - j2 = j1 + ht; -#if FALCON_AVX2 // yyyAVX2+1 - if (ht >= 4) { - __m256d s_re, s_im; - - s_re = _mm256_set1_pd( - fpr_gm_tab[((m + i1) << 1) + 0].v); - s_im = _mm256_set1_pd( - fpr_gm_tab[((m + i1) << 1) + 1].v); - for (j = j1; j < j2; j += 4) { - __m256d x_re, x_im, y_re, y_im; - __m256d z_re, z_im; - - x_re = _mm256_loadu_pd(&f[j].v); - x_im = _mm256_loadu_pd(&f[j + hn].v); - z_re = _mm256_loadu_pd(&f[j+ht].v); - z_im = _mm256_loadu_pd(&f[j+ht + hn].v); - y_re = FMSUB(z_re, s_re, - _mm256_mul_pd(z_im, s_im)); - y_im = FMADD(z_re, s_im, - _mm256_mul_pd(z_im, s_re)); - _mm256_storeu_pd(&f[j].v, - _mm256_add_pd(x_re, y_re)); - _mm256_storeu_pd(&f[j + hn].v, - _mm256_add_pd(x_im, y_im)); - _mm256_storeu_pd(&f[j + ht].v, - _mm256_sub_pd(x_re, y_re)); - _mm256_storeu_pd(&f[j + ht + hn].v, - _mm256_sub_pd(x_im, y_im)); - } - } else { - fpr s_re, s_im; - - s_re = fpr_gm_tab[((m + i1) << 1) + 0]; - s_im = fpr_gm_tab[((m + i1) << 1) + 1]; - for (j = j1; j < j2; j ++) { - fpr x_re, x_im, y_re, y_im; - - x_re = f[j]; - x_im = f[j + hn]; - y_re = f[j + ht]; - y_im = f[j + ht + hn]; - FPC_MUL(y_re, y_im, - y_re, y_im, s_re, s_im); - FPC_ADD(f[j], f[j + hn], - x_re, x_im, y_re, y_im); - FPC_SUB(f[j + ht], f[j + ht + hn], - x_re, x_im, y_re, y_im); - } - } -#else // yyyAVX2+0 - fpr s_re, s_im; - - s_re = fpr_gm_tab[((m + i1) << 1) + 0]; - s_im = fpr_gm_tab[((m + i1) << 1) + 1]; - for (j = j1; j < j2; j ++) { - fpr x_re, x_im, y_re, y_im; - - x_re = f[j]; - x_im = f[j + hn]; - y_re = f[j + ht]; - y_im = f[j + ht + hn]; - FPC_MUL(y_re, y_im, y_re, y_im, s_re, s_im); - FPC_ADD(f[j], f[j + hn], - x_re, x_im, y_re, y_im); - FPC_SUB(f[j + ht], f[j + ht + hn], - x_re, x_im, y_re, y_im); - } -#endif // yyyAVX2- - } - t = ht; - } -} - -/* see inner.h */ 
-TARGET_AVX2 -void -Zf(iFFT)(fpr *f, unsigned logn) -{ - /* - * Inverse FFT algorithm in bit-reversal order uses the following - * iterative algorithm: - * - * t = 1 - * for m = N; m > 1; m /= 2: - * hm = m/2 - * dt = t*2 - * for i1 = 0; i1 < hm; i1 ++: - * j1 = i1 * dt - * s = iGM[hm + i1] - * for j = j1; j < (j1 + t); j ++: - * x = f[j] - * y = f[j + t] - * f[j] = x + y - * f[j + t] = s * (x - y) - * t = dt - * for i1 = 0; i1 < N; i1 ++: - * f[i1] = f[i1] / N - * - * iGM[k] contains (1/w)^rev(k) for primitive root w = exp(i*pi/N) - * (actually, iGM[k] = 1/GM[k] = conj(GM[k])). - * - * In the main loop (not counting the final division loop), in - * all iterations except the last, the first and second half of f[] - * (as an array of complex numbers) are separate. In our chosen - * representation, we do not keep the second half. - * - * The last iteration recombines the recomputed half with the - * implicit half, and should yield only real numbers since the - * target polynomial is real; moreover, s = i at that step. - * Thus, when considering x and y: - * y = conj(x) since the final f[j] must be real - * Therefore, f[j] is filled with 2*Re(x), and f[j + t] is - * filled with 2*Im(x). - * But we already have Re(x) and Im(x) in array slots j and j+t - * in our chosen representation. That last iteration is thus a - * simple doubling of the values in all the array. - * - * We make the last iteration a no-op by tweaking the final - * division into a division by N/2, not N. 
- */ - size_t u, n, hn, t, m; - - n = (size_t)1 << logn; - t = 1; - m = n; - hn = n >> 1; - for (u = logn; u > 1; u --) { - size_t hm, dt, i1, j1; - - hm = m >> 1; - dt = t << 1; - for (i1 = 0, j1 = 0; j1 < hn; i1 ++, j1 += dt) { - size_t j, j2; - - j2 = j1 + t; -#if FALCON_AVX2 // yyyAVX2+1 - if (t >= 4) { - __m256d s_re, s_im; - - s_re = _mm256_set1_pd( - fpr_gm_tab[((hm + i1) << 1) + 0].v); - s_im = _mm256_set1_pd( - fpr_gm_tab[((hm + i1) << 1) + 1].v); - for (j = j1; j < j2; j += 4) { - __m256d x_re, x_im, y_re, y_im; - __m256d z_re, z_im; - - x_re = _mm256_loadu_pd(&f[j].v); - x_im = _mm256_loadu_pd(&f[j + hn].v); - y_re = _mm256_loadu_pd(&f[j+t].v); - y_im = _mm256_loadu_pd(&f[j+t + hn].v); - _mm256_storeu_pd(&f[j].v, - _mm256_add_pd(x_re, y_re)); - _mm256_storeu_pd(&f[j + hn].v, - _mm256_add_pd(x_im, y_im)); - x_re = _mm256_sub_pd(y_re, x_re); - x_im = _mm256_sub_pd(x_im, y_im); - z_re = FMSUB(x_im, s_im, - _mm256_mul_pd(x_re, s_re)); - z_im = FMADD(x_re, s_im, - _mm256_mul_pd(x_im, s_re)); - _mm256_storeu_pd(&f[j+t].v, z_re); - _mm256_storeu_pd(&f[j+t + hn].v, z_im); - } - } else { - fpr s_re, s_im; - - s_re = fpr_gm_tab[((hm + i1) << 1)+0]; - s_im = fpr_neg(fpr_gm_tab[((hm + i1) << 1)+1]); - for (j = j1; j < j2; j ++) { - fpr x_re, x_im, y_re, y_im; - - x_re = f[j]; - x_im = f[j + hn]; - y_re = f[j + t]; - y_im = f[j + t + hn]; - FPC_ADD(f[j], f[j + hn], - x_re, x_im, y_re, y_im); - FPC_SUB(x_re, x_im, - x_re, x_im, y_re, y_im); - FPC_MUL(f[j + t], f[j + t + hn], - x_re, x_im, s_re, s_im); - } - } -#else // yyyAVX2+0 - fpr s_re, s_im; - - s_re = fpr_gm_tab[((hm + i1) << 1) + 0]; - s_im = fpr_neg(fpr_gm_tab[((hm + i1) << 1) + 1]); - for (j = j1; j < j2; j ++) { - fpr x_re, x_im, y_re, y_im; - - x_re = f[j]; - x_im = f[j + hn]; - y_re = f[j + t]; - y_im = f[j + t + hn]; - FPC_ADD(f[j], f[j + hn], - x_re, x_im, y_re, y_im); - FPC_SUB(x_re, x_im, x_re, x_im, y_re, y_im); - FPC_MUL(f[j + t], f[j + t + hn], - x_re, x_im, s_re, s_im); - } -#endif // yyyAVX2- - } 
- t = dt; - m = hm; - } - - /* - * Last iteration is a no-op, provided that we divide by N/2 - * instead of N. We need to make a special case for logn = 0. - */ - if (logn > 0) { - fpr ni; - - ni = fpr_p2_tab[logn]; - for (u = 0; u < n; u ++) { - f[u] = fpr_mul(f[u], ni); - } - } -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_add)( - fpr *restrict a, const fpr *restrict b, unsigned logn) -{ - size_t n, u; - - n = (size_t)1 << logn; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 4) { - for (u = 0; u < n; u += 4) { - _mm256_storeu_pd(&a[u].v, - _mm256_add_pd( - _mm256_loadu_pd(&a[u].v), - _mm256_loadu_pd(&b[u].v))); - } - } else { - for (u = 0; u < n; u ++) { - a[u] = fpr_add(a[u], b[u]); - } - } -#else // yyyAVX2+0 - for (u = 0; u < n; u ++) { - a[u] = fpr_add(a[u], b[u]); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_sub)( - fpr *restrict a, const fpr *restrict b, unsigned logn) -{ - size_t n, u; - - n = (size_t)1 << logn; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 4) { - for (u = 0; u < n; u += 4) { - _mm256_storeu_pd(&a[u].v, - _mm256_sub_pd( - _mm256_loadu_pd(&a[u].v), - _mm256_loadu_pd(&b[u].v))); - } - } else { - for (u = 0; u < n; u ++) { - a[u] = fpr_sub(a[u], b[u]); - } - } -#else // yyyAVX2+0 - for (u = 0; u < n; u ++) { - a[u] = fpr_sub(a[u], b[u]); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_neg)(fpr *a, unsigned logn) -{ - size_t n, u; - - n = (size_t)1 << logn; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 4) { - __m256d s; - - s = _mm256_set1_pd(-0.0); - for (u = 0; u < n; u += 4) { - _mm256_storeu_pd(&a[u].v, - _mm256_xor_pd(_mm256_loadu_pd(&a[u].v), s)); - } - } else { - for (u = 0; u < n; u ++) { - a[u] = fpr_neg(a[u]); - } - } -#else // yyyAVX2+0 - for (u = 0; u < n; u ++) { - a[u] = fpr_neg(a[u]); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_adj_fft)(fpr *a, unsigned logn) -{ - size_t n, u; - - n = (size_t)1 << logn; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { 
- __m256d s; - - s = _mm256_set1_pd(-0.0); - for (u = (n >> 1); u < n; u += 4) { - _mm256_storeu_pd(&a[u].v, - _mm256_xor_pd(_mm256_loadu_pd(&a[u].v), s)); - } - } else { - for (u = (n >> 1); u < n; u ++) { - a[u] = fpr_neg(a[u]); - } - } -#else // yyyAVX2+0 - for (u = (n >> 1); u < n; u ++) { - a[u] = fpr_neg(a[u]); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_mul_fft)( - fpr *restrict a, const fpr *restrict b, unsigned logn) -{ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - for (u = 0; u < hn; u += 4) { - __m256d a_re, a_im, b_re, b_im, c_re, c_im; - - a_re = _mm256_loadu_pd(&a[u].v); - a_im = _mm256_loadu_pd(&a[u + hn].v); - b_re = _mm256_loadu_pd(&b[u].v); - b_im = _mm256_loadu_pd(&b[u + hn].v); - c_re = FMSUB( - a_re, b_re, _mm256_mul_pd(a_im, b_im)); - c_im = FMADD( - a_re, b_im, _mm256_mul_pd(a_im, b_re)); - _mm256_storeu_pd(&a[u].v, c_re); - _mm256_storeu_pd(&a[u + hn].v, c_im); - } - } else { - for (u = 0; u < hn; u ++) { - fpr a_re, a_im, b_re, b_im; - - a_re = a[u]; - a_im = a[u + hn]; - b_re = b[u]; - b_im = b[u + hn]; - FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im); - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - fpr a_re, a_im, b_re, b_im; - - a_re = a[u]; - a_im = a[u + hn]; - b_re = b[u]; - b_im = b[u + hn]; - FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_muladj_fft)( - fpr *restrict a, const fpr *restrict b, unsigned logn) -{ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - for (u = 0; u < hn; u += 4) { - __m256d a_re, a_im, b_re, b_im, c_re, c_im; - - a_re = _mm256_loadu_pd(&a[u].v); - a_im = _mm256_loadu_pd(&a[u + hn].v); - b_re = _mm256_loadu_pd(&b[u].v); - b_im = _mm256_loadu_pd(&b[u + hn].v); - c_re = FMADD( - a_re, b_re, _mm256_mul_pd(a_im, b_im)); - c_im = FMSUB( - a_im, b_re, _mm256_mul_pd(a_re, 
b_im)); - _mm256_storeu_pd(&a[u].v, c_re); - _mm256_storeu_pd(&a[u + hn].v, c_im); - } - } else { - for (u = 0; u < hn; u ++) { - fpr a_re, a_im, b_re, b_im; - - a_re = a[u]; - a_im = a[u + hn]; - b_re = b[u]; - b_im = fpr_neg(b[u + hn]); - FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im); - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - fpr a_re, a_im, b_re, b_im; - - a_re = a[u]; - a_im = a[u + hn]; - b_re = b[u]; - b_im = fpr_neg(b[u + hn]); - FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_mulselfadj_fft)(fpr *a, unsigned logn) -{ - /* - * Since each coefficient is multiplied with its own conjugate, - * the result contains only real values. - */ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - __m256d zero; - - zero = _mm256_setzero_pd(); - for (u = 0; u < hn; u += 4) { - __m256d a_re, a_im; - - a_re = _mm256_loadu_pd(&a[u].v); - a_im = _mm256_loadu_pd(&a[u + hn].v); - _mm256_storeu_pd(&a[u].v, - FMADD(a_re, a_re, - _mm256_mul_pd(a_im, a_im))); - _mm256_storeu_pd(&a[u + hn].v, zero); - } - } else { - for (u = 0; u < hn; u ++) { - fpr a_re, a_im; - - a_re = a[u]; - a_im = a[u + hn]; - a[u] = fpr_add(fpr_sqr(a_re), fpr_sqr(a_im)); - a[u + hn] = fpr_zero; - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - fpr a_re, a_im; - - a_re = a[u]; - a_im = a[u + hn]; - a[u] = fpr_add(fpr_sqr(a_re), fpr_sqr(a_im)); - a[u + hn] = fpr_zero; - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_mulconst)(fpr *a, fpr x, unsigned logn) -{ - size_t n, u; - - n = (size_t)1 << logn; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 4) { - __m256d x4; - - x4 = _mm256_set1_pd(x.v); - for (u = 0; u < n; u += 4) { - _mm256_storeu_pd(&a[u].v, - _mm256_mul_pd(x4, _mm256_loadu_pd(&a[u].v))); - } - } else { - for (u = 0; u < n; u ++) { - a[u] = fpr_mul(a[u], x); - } - } -#else // yyyAVX2+0 - for (u = 0; u < n; u ++) { - 
a[u] = fpr_mul(a[u], x); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_div_fft)( - fpr *restrict a, const fpr *restrict b, unsigned logn) -{ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - __m256d one; - - one = _mm256_set1_pd(1.0); - for (u = 0; u < hn; u += 4) { - __m256d a_re, a_im, b_re, b_im, c_re, c_im, t; - - a_re = _mm256_loadu_pd(&a[u].v); - a_im = _mm256_loadu_pd(&a[u + hn].v); - b_re = _mm256_loadu_pd(&b[u].v); - b_im = _mm256_loadu_pd(&b[u + hn].v); - t = _mm256_div_pd(one, - FMADD(b_re, b_re, - _mm256_mul_pd(b_im, b_im))); - b_re = _mm256_mul_pd(b_re, t); - b_im = _mm256_mul_pd(b_im, t); - c_re = FMADD( - a_re, b_re, _mm256_mul_pd(a_im, b_im)); - c_im = FMSUB( - a_im, b_re, _mm256_mul_pd(a_re, b_im)); - _mm256_storeu_pd(&a[u].v, c_re); - _mm256_storeu_pd(&a[u + hn].v, c_im); - } - } else { - for (u = 0; u < hn; u ++) { - fpr a_re, a_im, b_re, b_im; - - a_re = a[u]; - a_im = a[u + hn]; - b_re = b[u]; - b_im = b[u + hn]; - FPC_DIV(a[u], a[u + hn], a_re, a_im, b_re, b_im); - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - fpr a_re, a_im, b_re, b_im; - - a_re = a[u]; - a_im = a[u + hn]; - b_re = b[u]; - b_im = b[u + hn]; - FPC_DIV(a[u], a[u + hn], a_re, a_im, b_re, b_im); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_invnorm2_fft)(fpr *restrict d, - const fpr *restrict a, const fpr *restrict b, unsigned logn) -{ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - __m256d one; - - one = _mm256_set1_pd(1.0); - for (u = 0; u < hn; u += 4) { - __m256d a_re, a_im, b_re, b_im, dv; - - a_re = _mm256_loadu_pd(&a[u].v); - a_im = _mm256_loadu_pd(&a[u + hn].v); - b_re = _mm256_loadu_pd(&b[u].v); - b_im = _mm256_loadu_pd(&b[u + hn].v); - dv = _mm256_div_pd(one, - _mm256_add_pd( - FMADD(a_re, a_re, - _mm256_mul_pd(a_im, a_im)), - FMADD(b_re, b_re, - _mm256_mul_pd(b_im, b_im)))); - 
_mm256_storeu_pd(&d[u].v, dv); - } - } else { - for (u = 0; u < hn; u ++) { - fpr a_re, a_im; - fpr b_re, b_im; - - a_re = a[u]; - a_im = a[u + hn]; - b_re = b[u]; - b_im = b[u + hn]; - d[u] = fpr_inv(fpr_add( - fpr_add(fpr_sqr(a_re), fpr_sqr(a_im)), - fpr_add(fpr_sqr(b_re), fpr_sqr(b_im)))); - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - fpr a_re, a_im; - fpr b_re, b_im; - - a_re = a[u]; - a_im = a[u + hn]; - b_re = b[u]; - b_im = b[u + hn]; - d[u] = fpr_inv(fpr_add( - fpr_add(fpr_sqr(a_re), fpr_sqr(a_im)), - fpr_add(fpr_sqr(b_re), fpr_sqr(b_im)))); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_add_muladj_fft)(fpr *restrict d, - const fpr *restrict F, const fpr *restrict G, - const fpr *restrict f, const fpr *restrict g, unsigned logn) -{ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - for (u = 0; u < hn; u += 4) { - __m256d F_re, F_im, G_re, G_im; - __m256d f_re, f_im, g_re, g_im; - __m256d a_re, a_im, b_re, b_im; - - F_re = _mm256_loadu_pd(&F[u].v); - F_im = _mm256_loadu_pd(&F[u + hn].v); - G_re = _mm256_loadu_pd(&G[u].v); - G_im = _mm256_loadu_pd(&G[u + hn].v); - f_re = _mm256_loadu_pd(&f[u].v); - f_im = _mm256_loadu_pd(&f[u + hn].v); - g_re = _mm256_loadu_pd(&g[u].v); - g_im = _mm256_loadu_pd(&g[u + hn].v); - - a_re = FMADD(F_re, f_re, - _mm256_mul_pd(F_im, f_im)); - a_im = FMSUB(F_im, f_re, - _mm256_mul_pd(F_re, f_im)); - b_re = FMADD(G_re, g_re, - _mm256_mul_pd(G_im, g_im)); - b_im = FMSUB(G_im, g_re, - _mm256_mul_pd(G_re, g_im)); - _mm256_storeu_pd(&d[u].v, - _mm256_add_pd(a_re, b_re)); - _mm256_storeu_pd(&d[u + hn].v, - _mm256_add_pd(a_im, b_im)); - } - } else { - for (u = 0; u < hn; u ++) { - fpr F_re, F_im, G_re, G_im; - fpr f_re, f_im, g_re, g_im; - fpr a_re, a_im, b_re, b_im; - - F_re = F[u]; - F_im = F[u + hn]; - G_re = G[u]; - G_im = G[u + hn]; - f_re = f[u]; - f_im = f[u + hn]; - g_re = g[u]; - g_im = g[u + hn]; - - FPC_MUL(a_re, a_im, F_re, F_im, 
f_re, fpr_neg(f_im)); - FPC_MUL(b_re, b_im, G_re, G_im, g_re, fpr_neg(g_im)); - d[u] = fpr_add(a_re, b_re); - d[u + hn] = fpr_add(a_im, b_im); - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - fpr F_re, F_im, G_re, G_im; - fpr f_re, f_im, g_re, g_im; - fpr a_re, a_im, b_re, b_im; - - F_re = F[u]; - F_im = F[u + hn]; - G_re = G[u]; - G_im = G[u + hn]; - f_re = f[u]; - f_im = f[u + hn]; - g_re = g[u]; - g_im = g[u + hn]; - - FPC_MUL(a_re, a_im, F_re, F_im, f_re, fpr_neg(f_im)); - FPC_MUL(b_re, b_im, G_re, G_im, g_re, fpr_neg(g_im)); - d[u] = fpr_add(a_re, b_re); - d[u + hn] = fpr_add(a_im, b_im); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_mul_autoadj_fft)( - fpr *restrict a, const fpr *restrict b, unsigned logn) -{ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - for (u = 0; u < hn; u += 4) { - __m256d a_re, a_im, bv; - - a_re = _mm256_loadu_pd(&a[u].v); - a_im = _mm256_loadu_pd(&a[u + hn].v); - bv = _mm256_loadu_pd(&b[u].v); - _mm256_storeu_pd(&a[u].v, - _mm256_mul_pd(a_re, bv)); - _mm256_storeu_pd(&a[u + hn].v, - _mm256_mul_pd(a_im, bv)); - } - } else { - for (u = 0; u < hn; u ++) { - a[u] = fpr_mul(a[u], b[u]); - a[u + hn] = fpr_mul(a[u + hn], b[u]); - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - a[u] = fpr_mul(a[u], b[u]); - a[u + hn] = fpr_mul(a[u + hn], b[u]); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_div_autoadj_fft)( - fpr *restrict a, const fpr *restrict b, unsigned logn) -{ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - __m256d one; - - one = _mm256_set1_pd(1.0); - for (u = 0; u < hn; u += 4) { - __m256d ib, a_re, a_im; - - ib = _mm256_div_pd(one, _mm256_loadu_pd(&b[u].v)); - a_re = _mm256_loadu_pd(&a[u].v); - a_im = _mm256_loadu_pd(&a[u + hn].v); - _mm256_storeu_pd(&a[u].v, _mm256_mul_pd(a_re, ib)); - _mm256_storeu_pd(&a[u + hn].v, 
_mm256_mul_pd(a_im, ib)); - } - } else { - for (u = 0; u < hn; u ++) { - fpr ib; - - ib = fpr_inv(b[u]); - a[u] = fpr_mul(a[u], ib); - a[u + hn] = fpr_mul(a[u + hn], ib); - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - fpr ib; - - ib = fpr_inv(b[u]); - a[u] = fpr_mul(a[u], ib); - a[u + hn] = fpr_mul(a[u + hn], ib); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_LDL_fft)( - const fpr *restrict g00, - fpr *restrict g01, fpr *restrict g11, unsigned logn) -{ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - __m256d one; - - one = _mm256_set1_pd(1.0); - for (u = 0; u < hn; u += 4) { - __m256d g00_re, g00_im, g01_re, g01_im, g11_re, g11_im; - __m256d t, mu_re, mu_im, xi_re, xi_im; - - g00_re = _mm256_loadu_pd(&g00[u].v); - g00_im = _mm256_loadu_pd(&g00[u + hn].v); - g01_re = _mm256_loadu_pd(&g01[u].v); - g01_im = _mm256_loadu_pd(&g01[u + hn].v); - g11_re = _mm256_loadu_pd(&g11[u].v); - g11_im = _mm256_loadu_pd(&g11[u + hn].v); - - t = _mm256_div_pd(one, - FMADD(g00_re, g00_re, - _mm256_mul_pd(g00_im, g00_im))); - g00_re = _mm256_mul_pd(g00_re, t); - g00_im = _mm256_mul_pd(g00_im, t); - mu_re = FMADD(g01_re, g00_re, - _mm256_mul_pd(g01_im, g00_im)); - mu_im = FMSUB(g01_re, g00_im, - _mm256_mul_pd(g01_im, g00_re)); - xi_re = FMSUB(mu_re, g01_re, - _mm256_mul_pd(mu_im, g01_im)); - xi_im = FMADD(mu_im, g01_re, - _mm256_mul_pd(mu_re, g01_im)); - _mm256_storeu_pd(&g11[u].v, - _mm256_sub_pd(g11_re, xi_re)); - _mm256_storeu_pd(&g11[u + hn].v, - _mm256_add_pd(g11_im, xi_im)); - _mm256_storeu_pd(&g01[u].v, mu_re); - _mm256_storeu_pd(&g01[u + hn].v, mu_im); - } - } else { - for (u = 0; u < hn; u ++) { - fpr g00_re, g00_im, g01_re, g01_im, g11_re, g11_im; - fpr mu_re, mu_im; - - g00_re = g00[u]; - g00_im = g00[u + hn]; - g01_re = g01[u]; - g01_im = g01[u + hn]; - g11_re = g11[u]; - g11_im = g11[u + hn]; - FPC_DIV(mu_re, mu_im, g01_re, g01_im, g00_re, g00_im); - FPC_MUL(g01_re, g01_im, 
- mu_re, mu_im, g01_re, fpr_neg(g01_im)); - FPC_SUB(g11[u], g11[u + hn], - g11_re, g11_im, g01_re, g01_im); - g01[u] = mu_re; - g01[u + hn] = fpr_neg(mu_im); - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - fpr g00_re, g00_im, g01_re, g01_im, g11_re, g11_im; - fpr mu_re, mu_im; - - g00_re = g00[u]; - g00_im = g00[u + hn]; - g01_re = g01[u]; - g01_im = g01[u + hn]; - g11_re = g11[u]; - g11_im = g11[u + hn]; - FPC_DIV(mu_re, mu_im, g01_re, g01_im, g00_re, g00_im); - FPC_MUL(g01_re, g01_im, mu_re, mu_im, g01_re, fpr_neg(g01_im)); - FPC_SUB(g11[u], g11[u + hn], g11_re, g11_im, g01_re, g01_im); - g01[u] = mu_re; - g01[u + hn] = fpr_neg(mu_im); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_LDLmv_fft)( - fpr *restrict d11, fpr *restrict l10, - const fpr *restrict g00, const fpr *restrict g01, - const fpr *restrict g11, unsigned logn) -{ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - __m256d one; - - one = _mm256_set1_pd(1.0); - for (u = 0; u < hn; u += 4) { - __m256d g00_re, g00_im, g01_re, g01_im, g11_re, g11_im; - __m256d t, mu_re, mu_im, xi_re, xi_im; - - g00_re = _mm256_loadu_pd(&g00[u].v); - g00_im = _mm256_loadu_pd(&g00[u + hn].v); - g01_re = _mm256_loadu_pd(&g01[u].v); - g01_im = _mm256_loadu_pd(&g01[u + hn].v); - g11_re = _mm256_loadu_pd(&g11[u].v); - g11_im = _mm256_loadu_pd(&g11[u + hn].v); - - t = _mm256_div_pd(one, - FMADD(g00_re, g00_re, - _mm256_mul_pd(g00_im, g00_im))); - g00_re = _mm256_mul_pd(g00_re, t); - g00_im = _mm256_mul_pd(g00_im, t); - mu_re = FMADD(g01_re, g00_re, - _mm256_mul_pd(g01_im, g00_im)); - mu_im = FMSUB(g01_re, g00_im, - _mm256_mul_pd(g01_im, g00_re)); - xi_re = FMSUB(mu_re, g01_re, - _mm256_mul_pd(mu_im, g01_im)); - xi_im = FMADD(mu_im, g01_re, - _mm256_mul_pd(mu_re, g01_im)); - _mm256_storeu_pd(&d11[u].v, - _mm256_sub_pd(g11_re, xi_re)); - _mm256_storeu_pd(&d11[u + hn].v, - _mm256_add_pd(g11_im, xi_im)); - _mm256_storeu_pd(&l10[u].v, 
mu_re); - _mm256_storeu_pd(&l10[u + hn].v, mu_im); - } - } else { - for (u = 0; u < hn; u ++) { - fpr g00_re, g00_im, g01_re, g01_im, g11_re, g11_im; - fpr mu_re, mu_im; - - g00_re = g00[u]; - g00_im = g00[u + hn]; - g01_re = g01[u]; - g01_im = g01[u + hn]; - g11_re = g11[u]; - g11_im = g11[u + hn]; - FPC_DIV(mu_re, mu_im, g01_re, g01_im, g00_re, g00_im); - FPC_MUL(g01_re, g01_im, - mu_re, mu_im, g01_re, fpr_neg(g01_im)); - FPC_SUB(d11[u], d11[u + hn], - g11_re, g11_im, g01_re, g01_im); - l10[u] = mu_re; - l10[u + hn] = fpr_neg(mu_im); - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - fpr g00_re, g00_im, g01_re, g01_im, g11_re, g11_im; - fpr mu_re, mu_im; - - g00_re = g00[u]; - g00_im = g00[u + hn]; - g01_re = g01[u]; - g01_im = g01[u + hn]; - g11_re = g11[u]; - g11_im = g11[u + hn]; - FPC_DIV(mu_re, mu_im, g01_re, g01_im, g00_re, g00_im); - FPC_MUL(g01_re, g01_im, mu_re, mu_im, g01_re, fpr_neg(g01_im)); - FPC_SUB(d11[u], d11[u + hn], g11_re, g11_im, g01_re, g01_im); - l10[u] = mu_re; - l10[u + hn] = fpr_neg(mu_im); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_split_fft)( - fpr *restrict f0, fpr *restrict f1, - const fpr *restrict f, unsigned logn) -{ - /* - * The FFT representation we use is in bit-reversed order - * (element i contains f(w^(rev(i))), where rev() is the - * bit-reversal function over the ring degree. This changes - * indexes with regards to the Falcon specification. 
- */ - size_t n, hn, qn, u; - - n = (size_t)1 << logn; - hn = n >> 1; - qn = hn >> 1; - -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - __m256d half, sv; - - half = _mm256_set1_pd(0.5); - sv = _mm256_set_pd(-0.0, 0.0, -0.0, 0.0); - for (u = 0; u < qn; u += 2) { - __m256d ab_re, ab_im, ff0, ff1, ff2, ff3, gmt; - - ab_re = _mm256_loadu_pd(&f[(u << 1)].v); - ab_im = _mm256_loadu_pd(&f[(u << 1) + hn].v); - ff0 = _mm256_mul_pd(_mm256_hadd_pd(ab_re, ab_im), half); - ff0 = _mm256_permute4x64_pd(ff0, 0xD8); - _mm_storeu_pd(&f0[u].v, - _mm256_extractf128_pd(ff0, 0)); - _mm_storeu_pd(&f0[u + qn].v, - _mm256_extractf128_pd(ff0, 1)); - - ff1 = _mm256_mul_pd(_mm256_hsub_pd(ab_re, ab_im), half); - gmt = _mm256_loadu_pd(&fpr_gm_tab[(u + hn) << 1].v); - ff2 = _mm256_shuffle_pd(ff1, ff1, 0x5); - ff3 = _mm256_hadd_pd( - _mm256_mul_pd(ff1, gmt), - _mm256_xor_pd(_mm256_mul_pd(ff2, gmt), sv)); - ff3 = _mm256_permute4x64_pd(ff3, 0xD8); - _mm_storeu_pd(&f1[u].v, - _mm256_extractf128_pd(ff3, 0)); - _mm_storeu_pd(&f1[u + qn].v, - _mm256_extractf128_pd(ff3, 1)); - } - } else { - f0[0] = f[0]; - f1[0] = f[hn]; - - for (u = 0; u < qn; u ++) { - fpr a_re, a_im, b_re, b_im; - fpr t_re, t_im; - - a_re = f[(u << 1) + 0]; - a_im = f[(u << 1) + 0 + hn]; - b_re = f[(u << 1) + 1]; - b_im = f[(u << 1) + 1 + hn]; - - FPC_ADD(t_re, t_im, a_re, a_im, b_re, b_im); - f0[u] = fpr_half(t_re); - f0[u + qn] = fpr_half(t_im); - - FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im); - FPC_MUL(t_re, t_im, t_re, t_im, - fpr_gm_tab[((u + hn) << 1) + 0], - fpr_neg(fpr_gm_tab[((u + hn) << 1) + 1])); - f1[u] = fpr_half(t_re); - f1[u + qn] = fpr_half(t_im); - } - } -#else // yyyAVX2+0 - /* - * We process complex values by pairs. For logn = 1, there is only - * one complex value (the other one is the implicit conjugate), - * so we add the two lines below because the loop will be - * skipped. 
- */ - f0[0] = f[0]; - f1[0] = f[hn]; - - for (u = 0; u < qn; u ++) { - fpr a_re, a_im, b_re, b_im; - fpr t_re, t_im; - - a_re = f[(u << 1) + 0]; - a_im = f[(u << 1) + 0 + hn]; - b_re = f[(u << 1) + 1]; - b_im = f[(u << 1) + 1 + hn]; - - FPC_ADD(t_re, t_im, a_re, a_im, b_re, b_im); - f0[u] = fpr_half(t_re); - f0[u + qn] = fpr_half(t_im); - - FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im); - FPC_MUL(t_re, t_im, t_re, t_im, - fpr_gm_tab[((u + hn) << 1) + 0], - fpr_neg(fpr_gm_tab[((u + hn) << 1) + 1])); - f1[u] = fpr_half(t_re); - f1[u + qn] = fpr_half(t_im); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_merge_fft)( - fpr *restrict f, - const fpr *restrict f0, const fpr *restrict f1, unsigned logn) -{ - size_t n, hn, qn, u; - - n = (size_t)1 << logn; - hn = n >> 1; - qn = hn >> 1; - -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 16) { - for (u = 0; u < qn; u += 4) { - __m256d a_re, a_im, b_re, b_im, c_re, c_im; - __m256d gm1, gm2, g_re, g_im; - __m256d t_re, t_im, u_re, u_im; - __m256d tu1_re, tu2_re, tu1_im, tu2_im; - - a_re = _mm256_loadu_pd(&f0[u].v); - a_im = _mm256_loadu_pd(&f0[u + qn].v); - c_re = _mm256_loadu_pd(&f1[u].v); - c_im = _mm256_loadu_pd(&f1[u + qn].v); - - gm1 = _mm256_loadu_pd(&fpr_gm_tab[(u + hn) << 1].v); - gm2 = _mm256_loadu_pd(&fpr_gm_tab[(u + 2 + hn) << 1].v); - g_re = _mm256_unpacklo_pd(gm1, gm2); - g_im = _mm256_unpackhi_pd(gm1, gm2); - g_re = _mm256_permute4x64_pd(g_re, 0xD8); - g_im = _mm256_permute4x64_pd(g_im, 0xD8); - - b_re = FMSUB( - c_re, g_re, _mm256_mul_pd(c_im, g_im)); - b_im = FMADD( - c_re, g_im, _mm256_mul_pd(c_im, g_re)); - - t_re = _mm256_add_pd(a_re, b_re); - t_im = _mm256_add_pd(a_im, b_im); - u_re = _mm256_sub_pd(a_re, b_re); - u_im = _mm256_sub_pd(a_im, b_im); - - tu1_re = _mm256_unpacklo_pd(t_re, u_re); - tu2_re = _mm256_unpackhi_pd(t_re, u_re); - tu1_im = _mm256_unpacklo_pd(t_im, u_im); - tu2_im = _mm256_unpackhi_pd(t_im, u_im); - _mm256_storeu_pd(&f[(u << 1)].v, - _mm256_permute2f128_pd(tu1_re, 
tu2_re, 0x20)); - _mm256_storeu_pd(&f[(u << 1) + 4].v, - _mm256_permute2f128_pd(tu1_re, tu2_re, 0x31)); - _mm256_storeu_pd(&f[(u << 1) + hn].v, - _mm256_permute2f128_pd(tu1_im, tu2_im, 0x20)); - _mm256_storeu_pd(&f[(u << 1) + 4 + hn].v, - _mm256_permute2f128_pd(tu1_im, tu2_im, 0x31)); - } - } else { - f[0] = f0[0]; - f[hn] = f1[0]; - - for (u = 0; u < qn; u ++) { - fpr a_re, a_im, b_re, b_im; - fpr t_re, t_im; - - a_re = f0[u]; - a_im = f0[u + qn]; - FPC_MUL(b_re, b_im, f1[u], f1[u + qn], - fpr_gm_tab[((u + hn) << 1) + 0], - fpr_gm_tab[((u + hn) << 1) + 1]); - FPC_ADD(t_re, t_im, a_re, a_im, b_re, b_im); - f[(u << 1) + 0] = t_re; - f[(u << 1) + 0 + hn] = t_im; - FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im); - f[(u << 1) + 1] = t_re; - f[(u << 1) + 1 + hn] = t_im; - } - } -#else // yyyAVX2+0 - /* - * An extra copy to handle the special case logn = 1. - */ - f[0] = f0[0]; - f[hn] = f1[0]; - - for (u = 0; u < qn; u ++) { - fpr a_re, a_im, b_re, b_im; - fpr t_re, t_im; - - a_re = f0[u]; - a_im = f0[u + qn]; - FPC_MUL(b_re, b_im, f1[u], f1[u + qn], - fpr_gm_tab[((u + hn) << 1) + 0], - fpr_gm_tab[((u + hn) << 1) + 1]); - FPC_ADD(t_re, t_im, a_re, a_im, b_re, b_im); - f[(u << 1) + 0] = t_re; - f[(u << 1) + 0 + hn] = t_im; - FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im); - f[(u << 1) + 1] = t_re; - f[(u << 1) + 1 + hn] = t_im; - } -#endif // yyyAVX2- -} diff --git a/crypto_sign/falcon-512-tree/m4-ct/fpr.c b/crypto_sign/falcon-512-tree/m4-ct/fpr.c deleted file mode 100644 index eb23a44b..00000000 --- a/crypto_sign/falcon-512-tree/m4-ct/fpr.c +++ /dev/null @@ -1,3460 +0,0 @@ -/* - * Floating-point operations. - * - * This file implements the non-inline functions declared in - * fpr.h, as well as the constants for FFT / iFFT. 
- * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include "inner.h" - -#if FALCON_FPEMU // yyyFPEMU+1 - -/* - * Normalize a provided unsigned integer to the 2^63..2^64-1 range by - * left-shifting it if necessary. The exponent e is adjusted accordingly - * (i.e. if the value was left-shifted by n bits, then n is subtracted - * from e). If source m is 0, then it remains 0, but e is altered. - * Both m and e must be simple variables (no expressions allowed). 
- */ -#define FPR_NORM64(m, e) do { \ - uint32_t nt; \ - \ - (e) -= 63; \ - \ - nt = (uint32_t)((m) >> 32); \ - nt = (nt | -nt) >> 31; \ - (m) ^= ((m) ^ ((m) << 32)) & ((uint64_t)nt - 1); \ - (e) += (int)(nt << 5); \ - \ - nt = (uint32_t)((m) >> 48); \ - nt = (nt | -nt) >> 31; \ - (m) ^= ((m) ^ ((m) << 16)) & ((uint64_t)nt - 1); \ - (e) += (int)(nt << 4); \ - \ - nt = (uint32_t)((m) >> 56); \ - nt = (nt | -nt) >> 31; \ - (m) ^= ((m) ^ ((m) << 8)) & ((uint64_t)nt - 1); \ - (e) += (int)(nt << 3); \ - \ - nt = (uint32_t)((m) >> 60); \ - nt = (nt | -nt) >> 31; \ - (m) ^= ((m) ^ ((m) << 4)) & ((uint64_t)nt - 1); \ - (e) += (int)(nt << 2); \ - \ - nt = (uint32_t)((m) >> 62); \ - nt = (nt | -nt) >> 31; \ - (m) ^= ((m) ^ ((m) << 2)) & ((uint64_t)nt - 1); \ - (e) += (int)(nt << 1); \ - \ - nt = (uint32_t)((m) >> 63); \ - (m) ^= ((m) ^ ((m) << 1)) & ((uint64_t)nt - 1); \ - (e) += (int)(nt); \ - } while (0) - -#if FALCON_ASM_CORTEXM4 // yyyASM_CORTEXM4+1 - -__attribute__((naked)) -fpr -fpr_scaled(int64_t i __attribute__((unused)), int sc __attribute__((unused))) -{ - __asm__ ( - "push { r4, r5, r6, lr }\n\t" - "\n\t" - "@ Input i is in r0:r1, and sc in r2.\n\t" - "@ Extract the sign bit, and compute the absolute value.\n\t" - "@ -> sign bit in r3, with value 0 or -1\n\t" - "asrs r3, r1, #31\n\t" - "eors r0, r3\n\t" - "eors r1, r3\n\t" - "subs r0, r3\n\t" - "sbcs r1, r3\n\t" - "\n\t" - "@ Scale exponent to account for the encoding; if the source is\n\t" - "@ zero or if the scaled exponent is negative, it is set to 32.\n\t" - "addw r2, r2, #1022\n\t" - "orrs r4, r0, r1\n\t" - "bics r4, r4, r2, asr #31\n\t" - "rsbs r5, r4, #0\n\t" - "orrs r4, r5\n\t" - "ands r2, r2, r4, asr #31\n\t" - "adds r2, #32\n\t" - "\n\t" - "@ Normalize value to a full 64-bit width, by shifting it left.\n\t" - "@ The shift count is subtracted from the exponent (in r2).\n\t" - "@ If the mantissa is 0, the exponent is set to 0.\n\t" - "\n\t" - "@ If top word is 0, replace with low word; otherwise, add 32 
to\n\t" - "@ the exponent.\n\t" - "rsbs r4, r1, #0\n\t" - "orrs r4, r1\n\t" - "eors r5, r0, r1\n\t" - "bics r5, r5, r4, asr #31\n\t" - "eors r1, r5\n\t" - "ands r0, r0, r4, asr #31\n\t" - "lsrs r4, r4, #31\n\t" - "adds r2, r2, r4, lsl #5\n\t" - "\n\t" - "@ Count leading zeros of r1 to finish the shift.\n\t" - "clz r4, r1\n\t" - "subs r2, r4\n\t" - "rsbs r5, r4, #32\n\t" - "lsls r1, r4\n\t" - "lsrs r5, r0, r5\n\t" - "lsls r0, r4\n\t" - "orrs r1, r5\n\t" - "\n\t" - "@ Clear the top bit; we know it's a 1 (unless the whole mantissa\n\t" - "@ was zero, but then it's still OK to clear it)\n\t" - "bfc r1, #31, #1\n\t" - "\n\t" - "@ Now shift right the value by 11 bits; this puts the value in\n\t" - "@ the 2^52..2^53-1 range. We also keep a copy of the pre-shift\n\t" - "@ low bits in r5.\n\t" - "movs r5, r0\n\t" - "lsrs r0, #11\n\t" - "orrs r0, r0, r1, lsl #21\n\t" - "lsrs r1, #11\n\t" - "\n\t" - "@ Also plug the exponent at the right place. This must be done\n\t" - "@ now so that, in case the rounding creates a carry, that carry\n\t" - "@ adds to the exponent, which would be exactly what we want at\n\t" - "@ that point.\n\t" - "orrs r1, r1, r2, lsl #20\n\t" - "\n\t" - "@ Rounding: we must add 1 to the mantissa in the following cases:\n\t" - "@ - bits 11 to 9 of r5 are '011', '110' or '111'\n\t" - "@ - bits 11 to 9 of r5 are '010' and one of the\n\t" - "@ bits 0 to 8 is non-zero\n\t" - "ubfx r6, r5, #0, #9\n\t" - "addw r6, r6, #511\n\t" - "orrs r5, r6\n\t" - "\n\t" - "ubfx r5, r5, #9, #3\n\t" - "movs r6, #0xC8\n\t" - "lsrs r6, r5\n\t" - "ands r6, #1\n\t" - "adds r0, r6\n\t" - "adcs r1, #0\n\t" - "\n\t" - "@ Put back the sign.\n\t" - "orrs r1, r1, r3, lsl #31\n\t" - "\n\t" - "pop { r4, r5, r6, pc}\n\t" - ); -} - -#else // yyyASM_CORTEXM4+0 - -fpr -fpr_scaled(int64_t i, int sc) -{ - /* - * To convert from int to float, we have to do the following: - * 1. Get the absolute value of the input, and its sign - * 2. Shift right or left the value as appropriate - * 3. 
Pack the result - * - * We can assume that the source integer is not -2^63. - */ - int s, e; - uint32_t t; - uint64_t m; - - /* - * Extract sign bit. - * We have: -i = 1 + ~i - */ - s = (int)((uint64_t)i >> 63); - i ^= -(int64_t)s; - i += s; - - /* - * For now we suppose that i != 0. - * Otherwise, we set m to i and left-shift it as much as needed - * to get a 1 in the top bit. We can do that in a logarithmic - * number of conditional shifts. - */ - m = (uint64_t)i; - e = 9 + sc; - FPR_NORM64(m, e); - - /* - * Now m is in the 2^63..2^64-1 range. We must divide it by 512; - * if one of the dropped bits is a 1, this should go into the - * "sticky bit". - */ - m |= ((uint32_t)m & 0x1FF) + 0x1FF; - m >>= 9; - - /* - * Corrective action: if i = 0 then all of the above was - * incorrect, and we clamp e and m down to zero. - */ - t = (uint32_t)((uint64_t)(i | -i) >> 63); - m &= -(uint64_t)t; - e &= -(int)t; - - /* - * Assemble back everything. The FPR() function will handle cases - * where e is too low. - */ - return FPR(s, e, m); -} - -#endif // yyyASM_CORTEXM4- - -#if FALCON_ASM_CORTEXM4 // yyyASM_CORTEXM4+1 - -// yyyPQCLEAN+0 -#if 0 -/* Debug code -- To get a printout of registers from a specific point - in ARM Cortex M4 assembly code, uncomment this code and add a - "bl DEBUG" call where wished for. 
*/ - -void -print_regs(uint32_t *rr, uint32_t flags) -{ - int i; - extern int printf(const char *fmt, ...); - - printf("\nRegs:\n"); - for (i = 0; i < 7; i ++) { - int j; - - j = i + 7; - printf(" %2d = %08X %2d = %08X\n", i, rr[i], j, rr[j]); - } - printf(" flags = %08X ", flags); - if ((flags >> 31) & 1) { - printf("N"); - } - if ((flags >> 30) & 1) { - printf("Z"); - } - if ((flags >> 29) & 1) { - printf("C"); - } - if ((flags >> 28) & 1) { - printf("V"); - } - if ((flags >> 27) & 1) { - printf("Q"); - } - printf("\n"); -} - -__attribute__((naked)) -void -DEBUG(void) -{ - __asm__ ( - "push { r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, lr }\n\t" - "mov r0, sp\n\t" - "mrs r1, apsr\n\t" - "bl print_regs\n\t" - "pop { r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, pc }\n\t" - ); -} -#endif -// yyyPQCLEAN- - -__attribute__((naked)) -fpr -fpr_add(fpr x __attribute__((unused)), fpr y __attribute__((unused))) -{ - __asm__ ( - "push { r4, r5, r6, r7, r8, r10, r11, lr }\n\t" - "\n\t" - "@ Make sure that the first operand (x) has the larger absolute\n\t" - "@ value. 
This guarantees that the exponent of y is less than\n\t" - "@ or equal to the exponent of x, and, if they are equal, then\n\t" - "@ the mantissa of y will not be greater than the mantissa of x.\n\t" - "@ However, if absolute values are equal and the sign of x is 1,\n\t" - "@ then we want to also swap the values.\n\t" - "ubfx r4, r1, #0, #31 @ top word without sign bit\n\t" - "ubfx r5, r3, #0, #31 @ top word without sign bit\n\t" - "subs r7, r0, r2 @ difference in r7:r4\n\t" - "sbcs r4, r5\n\t" - "orrs r7, r4\n\t" - "rsbs r5, r7, #0\n\t" - "orrs r7, r5 @ bit 31 of r7 is 0 iff difference is zero\n\t" - "bics r6, r1, r7\n\t" - "orrs r6, r4 @ bit 31 of r6 is 1 iff the swap must be done\n\t" - "\n\t" - "@ Conditional swap\n\t" - "eors r4, r0, r2\n\t" - "eors r5, r1, r3\n\t" - "ands r4, r4, r6, asr #31\n\t" - "ands r5, r5, r6, asr #31\n\t" - "eors r0, r4\n\t" - "eors r1, r5\n\t" - "eors r2, r4\n\t" - "eors r3, r5\n\t" - "\n\t" - "@ Extract mantissa of x into r0:r1, exponent in r4, sign in r5\n\t" - "ubfx r4, r1, #20, #11 @ Exponent in r4 (without sign)\n\t" - "addw r5, r4, #2047 @ Get a carry to test r4 for zero\n\t" - "lsrs r5, #11 @ r5 is the mantissa implicit high bit\n\t" - "bfc r1, #20, #11 @ Clear exponent bits (not the sign)\n\t" - "orrs r1, r1, r5, lsl #20 @ Set mantissa high bit\n\t" - "asrs r5, r1, #31 @ Get sign bit (sign-extended)\n\t" - "bfc r1, #31, #1 @ Clear the sign bit\n\t" - "\n\t" - "@ Extract mantissa of y into r2:r3, exponent in r6, sign in r7\n\t" - "ubfx r6, r3, #20, #11 @ Exponent in r6 (without sign)\n\t" - "addw r7, r6, #2047 @ Get a carry to test r6 for zero\n\t" - "lsrs r7, #11 @ r7 is the mantissa implicit high bit\n\t" - "bfc r3, #20, #11 @ Clear exponent bits (not the sign)\n\t" - "orrs r3, r3, r7, lsl #20 @ Set mantissa high bit\n\t" - "asrs r7, r3, #31 @ Get sign bit (sign-extended)\n\t" - "bfc r3, #31, #1 @ Clear the sign bit\n\t" - "\n\t" - "@ Scale mantissas up by three bits.\n\t" - "lsls r1, #3\n\t" - "orrs r1, r1, r0, lsr #29\n\t" - 
"lsls r0, #3\n\t" - "lsls r3, #3\n\t" - "orrs r3, r3, r2, lsr #29\n\t" - "lsls r2, #3\n\t" - "\n\t" - "@ x: exponent=r4, sign=r5, mantissa=r0:r1 (scaled up 3 bits)\n\t" - "@ y: exponent=r6, sign=r7, mantissa=r2:r3 (scaled up 3 bits)\n\t" - "\n\t" - "@ At that point, the exponent of x (in r4) is larger than that\n\t" - "@ of y (in r6). The difference is the amount of shifting that\n\t" - "@ should be done on y. If that amount is larger than 59 then\n\t" - "@ we clamp y to 0. We won't need y's exponent beyond that point,\n\t" - "@ so we store that shift count in r6.\n\t" - "subs r6, r4, r6\n\t" - "subs r8, r6, #60\n\t" - "ands r2, r2, r8, asr #31\n\t" - "ands r3, r3, r8, asr #31\n\t" - "\n\t" - "@ Shift right r2:r3 by r6 bits. The shift count is in the 0..59\n\t" - "@ range. r11 will be non-zero if and only if some non-zero bits\n\t" - "@ were dropped.\n\t" - "subs r8, r6, #32\n\t" - "bics r11, r2, r8, asr #31\n\t" - "ands r2, r2, r8, asr #31\n\t" - "bics r10, r3, r8, asr #31\n\t" - "orrs r2, r2, r10\n\t" - "ands r3, r3, r8, asr #31\n\t" - "ands r6, r6, #31\n\t" - "rsbs r8, r6, #32\n\t" - "lsls r10, r2, r8\n\t" - "orrs r11, r11, r10\n\t" - "lsrs r2, r2, r6\n\t" - "lsls r10, r3, r8\n\t" - "orrs r2, r2, r10\n\t" - "lsrs r3, r3, r6\n\t" - "\n\t" - "@ If r11 is non-zero then some non-zero bit was dropped and the\n\t" - "@ low bit of r2 must be forced to 1 ('sticky bit').\n\t" - "rsbs r6, r11, #0\n\t" - "orrs r6, r6, r11\n\t" - "orrs r2, r2, r6, lsr #31\n\t" - "\n\t" - "@ x: exponent=r4, sign=r5, mantissa=r0:r1 (scaled up 3 bits)\n\t" - "@ y: sign=r7, value=r2:r3 (scaled to same exponent as x)\n\t" - "\n\t" - "@ If x and y don't have the same sign, then we should negate r2:r3\n\t" - "@ (i.e. subtract the mantissa instead of adding it). Signs of x\n\t" - "@ and y are in r5 and r7, as full-width words. 
We won't need r7\n\t" - "@ afterwards.\n\t" - "eors r7, r5 @ r7 = -1 if y must be negated, 0 otherwise\n\t" - "eors r2, r7\n\t" - "eors r3, r7\n\t" - "subs r2, r7\n\t" - "sbcs r3, r7\n\t" - "\n\t" - "@ r2:r3 has been shifted, we can add to r0:r1.\n\t" - "adds r0, r2\n\t" - "adcs r1, r3\n\t" - "\n\t" - "@ result: exponent=r4, sign=r5, mantissa=r0:r1 (scaled up 3 bits)\n\t" - "\n\t" - "@ Normalize the result with some left-shifting to full 64-bit\n\t" - "@ width. Shift count goes to r2, and exponent (r4) is adjusted.\n\t" - "clz r2, r0\n\t" - "clz r3, r1\n\t" - "sbfx r6, r3, #5, #1\n\t" - "ands r2, r6\n\t" - "adds r2, r2, r3\n\t" - "subs r4, r4, r2\n\t" - "\n\t" - "@ Shift r0:r1 to the left by r2 bits.\n\t" - "subs r7, r2, #32\n\t" - "lsls r7, r0, r7\n\t" - "lsls r1, r1, r2\n\t" - "rsbs r6, r2, #32\n\t" - "orrs r1, r1, r7\n\t" - "lsrs r6, r0, r6\n\t" - "orrs r1, r1, r6\n\t" - "lsls r0, r0, r2\n\t" - "\n\t" - "@ The exponent of x was in r4. The left-shift operation has\n\t" - "@ subtracted some value from it, 8 in case the result has the\n\t" - "@ same exponent as x. However, the high bit of the mantissa will\n\t" - "@ add 1 to the exponent, so we only add back 7 (the exponent is\n\t" - "@ added in because rounding might have produced a carry, which\n\t" - "@ should then spill into the exponent).\n\t" - "adds r4, #7\n\t" - "\n\t" - "@ If the mantissa new mantissa is non-zero, then its bit 63 is\n\t" - "@ non-zero (thanks to the normalizing shift). Otherwise, that bit\n\t" - "@ is zero, and we should then set the exponent to zero as well.\n\t" - "ands r4, r4, r1, asr #31\n\t" - "\n\t" - "@ Shrink back the value to a 52-bit mantissa. 
This requires\n\t" - "@ right-shifting by 11 bits; we keep a copy of the pre-shift\n\t" - "@ low word in r3.\n\t" - "movs r3, r0\n\t" - "lsrs r0, #11\n\t" - "orrs r0, r0, r1, lsl #21\n\t" - "lsrs r1, #11\n\t" - "\n\t" - "@ Apply rounding.\n\t" - "ubfx r6, r3, #0, #9\n\t" - "addw r6, r6, #511\n\t" - "orrs r3, r6\n\t" - "ubfx r3, r3, #9, #3\n\t" - "movs r6, #0xC8\n\t" - "lsrs r6, r3\n\t" - "ands r6, #1\n\t" - "adds r0, r6\n\t" - "adcs r1, #0\n\t" - "\n\t" - "@Plug in the exponent with an addition.\n\t" - "adds r1, r1, r4, lsl #20\n\t" - "\n\t" - "@ If the new exponent is negative or zero, then it underflowed\n\t" - "@ and we must clear the whole mantissa and exponent.\n\t" - "rsbs r4, r4, #0\n\t" - "ands r0, r0, r4, asr #31\n\t" - "ands r1, r1, r4, asr #31\n\t" - "\n\t" - "@ Put back the sign. This is the sign of x: thanks to the\n\t" - "@ conditional swap at the start, this is always correct.\n\t" - "bfi r1, r5, #31, #1\n\t" - "\n\t" - "pop { r4, r5, r6, r7, r8, r10, r11, pc }\n\t" - ); -} - -#else // yyyASM_CORTEXM4+0 - -fpr -fpr_add(fpr x, fpr y) -{ - uint64_t m, xu, yu, za; - uint32_t cs; - int ex, ey, sx, sy, cc; - - /* - * Make sure that the first operand (x) has the larger absolute - * value. This guarantees that the exponent of y is less than - * or equal to the exponent of x, and, if they are equal, then - * the mantissa of y will not be greater than the mantissa of x. - * - * After this swap, the result will have the sign x, except in - * the following edge case: abs(x) = abs(y), and x and y have - * opposite sign bits; in that case, the result shall be +0 - * even if the sign bit of x is 1. To handle this case properly, - * we do the swap is abs(x) = abs(y) AND the sign of x is 1. - */ - m = ((uint64_t)1 << 63) - 1; - za = (x & m) - (y & m); - cs = (uint32_t)(za >> 63) - | ((1U - (uint32_t)(-za >> 63)) & (uint32_t)(x >> 63)); - m = (x ^ y) & -(uint64_t)cs; - x ^= m; - y ^= m; - - /* - * Extract sign bits, exponents and mantissas. 
The mantissas are - * scaled up to 2^55..2^56-1, and the exponent is unbiased. If - * an operand is zero, its mantissa is set to 0 at this step, and - * its exponent will be -1078. - */ - ex = (int)(x >> 52); - sx = ex >> 11; - ex &= 0x7FF; - m = (uint64_t)(uint32_t)((ex + 0x7FF) >> 11) << 52; - xu = ((x & (((uint64_t)1 << 52) - 1)) | m) << 3; - ex -= 1078; - ey = (int)(y >> 52); - sy = ey >> 11; - ey &= 0x7FF; - m = (uint64_t)(uint32_t)((ey + 0x7FF) >> 11) << 52; - yu = ((y & (((uint64_t)1 << 52) - 1)) | m) << 3; - ey -= 1078; - - /* - * x has the larger exponent; hence, we only need to right-shift y. - * If the shift count is larger than 59 bits then we clamp the - * value to zero. - */ - cc = ex - ey; - yu &= -(uint64_t)((uint32_t)(cc - 60) >> 31); - cc &= 63; - - /* - * The lowest bit of yu is "sticky". - */ - m = fpr_ulsh(1, cc) - 1; - yu |= (yu & m) + m; - yu = fpr_ursh(yu, cc); - - /* - * If the operands have the same sign, then we add the mantissas; - * otherwise, we subtract the mantissas. - */ - xu += yu - ((yu << 1) & -(uint64_t)(sx ^ sy)); - - /* - * The result may be smaller, or slightly larger. We normalize - * it to the 2^63..2^64-1 range (if xu is zero, then it stays - * at zero). - */ - FPR_NORM64(xu, ex); - - /* - * Scale down the value to 2^54..s^55-1, handling the last bit - * as sticky. - */ - xu |= ((uint32_t)xu & 0x1FF) + 0x1FF; - xu >>= 9; - ex += 9; - - /* - * In general, the result has the sign of x. However, if the - * result is exactly zero, then the following situations may - * be encountered: - * x > 0, y = -x -> result should be +0 - * x < 0, y = -x -> result should be +0 - * x = +0, y = +0 -> result should be +0 - * x = -0, y = +0 -> result should be +0 - * x = +0, y = -0 -> result should be +0 - * x = -0, y = -0 -> result should be -0 - * - * But at the conditional swap step at the start of the - * function, we ensured that if abs(x) = abs(y) and the - * sign of x was 1, then x and y were swapped. 
Thus, the - * two following cases cannot actually happen: - * x < 0, y = -x - * x = -0, y = +0 - * In all other cases, the sign bit of x is conserved, which - * is what the FPR() function does. The FPR() function also - * properly clamps values to zero when the exponent is too - * low, but does not alter the sign in that case. - */ - return FPR(sx, ex, xu); -} - -#endif // yyyASM_CORTEXM4- - -#if FALCON_ASM_CORTEXM4 // yyyASM_CORTEXM4+1 - -__attribute__((naked)) -fpr -fpr_mul(fpr x __attribute__((unused)), fpr y __attribute__((unused))) -{ - __asm__ ( - "push { r4, r5, r6, r7, r8, r10, r11, lr }\n\t" - "\n\t" - "@ Extract mantissas: x.m = r4:r5, y.m = r6:r7\n\t" - "@ r4 and r6 contain only 25 bits each.\n\t" - "bics r4, r0, #0xFE000000\n\t" - "lsls r5, r1, #7\n\t" - "orrs r5, r5, r0, lsr #25\n\t" - "orrs r5, r5, #0x08000000\n\t" - "bics r5, r5, #0xF0000000\n\t" - "bics r6, r2, #0xFE000000\n\t" - "lsls r7, r3, #7\n\t" - "orrs r7, r7, r2, lsr #25\n\t" - "orrs r7, r7, #0x08000000\n\t" - "bics r7, r7, #0xF0000000\n\t" - "\n\t" - "@ Perform product. Values are in the 2^52..2^53-1 range, so\n\t" - "@ the product is at most 106-bit long. Of the low 50 bits,\n\t" - "@ we only want to know if they are all zeros or not. 
Here,\n\t" - "@ we get the top 56 bits in r10:r11, and r8 will be non-zero\n\t" - "@ if and only if at least one of the low 50 bits is non-zero.\n\t" - "umull r8, r10, r4, r6 @ x0*y0\n\t" - "lsls r10, #7\n\t" - "orrs r10, r10, r8, lsr #25\n\t" - "eors r11, r11\n\t" - "umlal r10, r11, r4, r7 @ x0*y1\n\t" - "umlal r10, r11, r5, r6 @ x1*y0\n\t" - "orrs r8, r8, r10, lsl #7\n\t" - "lsrs r10, #25\n\t" - "orrs r10, r10, r11, lsl #7\n\t" - "eors r11, r11\n\t" - "umlal r10, r11, r5, r7 @ x1*y1\n\t" - "\n\t" - "@ Now r0, r2, r4, r5, r6 and r7 are free.\n\t" - "@ If any of the low 50 bits was non-zero, then we force the\n\t" - "@ low bit of r10 to 1.\n\t" - "rsbs r4, r8, #0\n\t" - "orrs r8, r8, r4\n\t" - "orrs r10, r10, r8, lsr #31\n\t" - "\n\t" - "@ r8 is free.\n\t" - "@ r10:r11 contains the product in the 2^54..2^56-1 range. We\n\t" - "@ normalize it to 2^54..2^55-1 (into r6:r7) with a conditional\n\t" - "@ shift (low bit is sticky). r5 contains -1 if the shift was done,\n\t" - "@ 0 otherwise.\n\t" - "ands r6, r10, #1\n\t" - "lsrs r5, r11, #23\n\t" - "rsbs r5, r5, #0\n\t" - "orrs r6, r6, r10, lsr #1\n\t" - "orrs r6, r6, r11, lsl #31\n\t" - "lsrs r7, r11, #1\n\t" - "eors r10, r10, r6\n\t" - "eors r11, r11, r7\n\t" - "bics r10, r10, r5\n\t" - "bics r11, r11, r5\n\t" - "eors r6, r6, r10\n\t" - "eors r7, r7, r11\n\t" - "\n\t" - "@ Compute aggregate exponent: ex + ey - 1023 + w\n\t" - "@ (where w = 1 if the conditional shift was done, 0 otherwise)\n\t" - "@ But we subtract 1 because the injection of the mantissa high\n\t" - "@ bit will increment the exponent by 1.\n\t" - "lsls r0, r1, #1\n\t" - "lsls r2, r3, #1\n\t" - "lsrs r0, #21\n\t" - "addw r4, r0, #0x7FF @ save ex + 2047 in r4\n\t" - "lsrs r2, #21\n\t" - "addw r8, r2, #0x7FF @ save ey + 2047 in r8\n\t" - "adds r2, r0\n\t" - "subw r2, r2, #1024\n\t" - "subs r2, r5\n\t" - "\n\t" - "@ r5 is free.\n\t" - "@ Also, if either of the source exponents is 0, or the result\n\t" - "@ exponent is 0 or negative, then the result is zero 
and the\n\t" - "@ mantissa and the exponent shall be clamped to zero. Since\n\t" - "@ r2 contains the result exponent minus 1, we test on r2\n\t" - "@ being strictly negative.\n\t" - "ands r4, r8 @ if bit 11 = 0 then one of the exponents was 0\n\t" - "mvns r5, r2\n\t" - "ands r5, r5, r4, lsl #20\n\t" - "ands r2, r2, r5, asr #31\n\t" - "ands r6, r6, r5, asr #31\n\t" - "ands r7, r7, r5, asr #31\n\t" - "\n\t" - "@ Sign is the XOR of the sign of the operands. This is true in\n\t" - "@ all cases, including very small results (exponent underflow)\n\t" - "@ and zeros.\n\t" - "eors r1, r3\n\t" - "bfc r1, #0, #31\n\t" - "\n\t" - "@ Plug in the exponent.\n\t" - "bfi r1, r2, #20, #11\n\t" - "\n\t" - "@ r2 and r3 are free.\n\t" - "@ Shift back to the normal 53-bit mantissa, with rounding.\n\t" - "@ Mantissa goes into r0:r1. For r1, we must use an addition\n\t" - "@ because the rounding may have triggered a carry, that should\n\t" - "@ be added to the exponent.\n\t" - "movs r4, r6\n\t" - "lsrs r0, r6, #2\n\t" - "orrs r0, r0, r7, lsl #30\n\t" - "adds r1, r1, r7, lsr #2\n\t" - "ands r4, #0x7\n\t" - "movs r3, #0xC8\n\t" - "lsrs r3, r4\n\t" - "ands r3, #1\n\t" - "adds r0, r3\n\t" - "adcs r1, #0\n\t" - "\n\t" - "pop { r4, r5, r6, r7, r8, r10, r11, pc }\n\t" - ); -} - -#else // yyyASM_CORTEXM4+0 - -fpr -fpr_mul(fpr x, fpr y) -{ - uint64_t xu, yu, w, zu, zv; - uint32_t x0, x1, y0, y1, z0, z1, z2; - int ex, ey, d, e, s; - - /* - * Extract absolute values as scaled unsigned integers. We - * don't extract exponents yet. - */ - xu = (x & (((uint64_t)1 << 52) - 1)) | ((uint64_t)1 << 52); - yu = (y & (((uint64_t)1 << 52) - 1)) | ((uint64_t)1 << 52); - - /* - * We have two 53-bit integers to multiply; we need to split - * each into a lower half and a upper half. Moreover, we - * prefer to have lower halves to be of 25 bits each, for - * reasons explained later on. 
- */ - x0 = (uint32_t)xu & 0x01FFFFFF; - x1 = (uint32_t)(xu >> 25); - y0 = (uint32_t)yu & 0x01FFFFFF; - y1 = (uint32_t)(yu >> 25); - w = (uint64_t)x0 * (uint64_t)y0; - z0 = (uint32_t)w & 0x01FFFFFF; - z1 = (uint32_t)(w >> 25); - w = (uint64_t)x0 * (uint64_t)y1; - z1 += (uint32_t)w & 0x01FFFFFF; - z2 = (uint32_t)(w >> 25); - w = (uint64_t)x1 * (uint64_t)y0; - z1 += (uint32_t)w & 0x01FFFFFF; - z2 += (uint32_t)(w >> 25); - zu = (uint64_t)x1 * (uint64_t)y1; - z2 += (z1 >> 25); - z1 &= 0x01FFFFFF; - zu += z2; - - /* - * Since xu and yu are both in the 2^52..2^53-1 range, the - * product is in the 2^104..2^106-1 range. We first reassemble - * it and round it into the 2^54..2^56-1 range; the bottom bit - * is made "sticky". Since the low limbs z0 and z1 are 25 bits - * each, we just take the upper part (zu), and consider z0 and - * z1 only for purposes of stickiness. - * (This is the reason why we chose 25-bit limbs above.) - */ - zu |= ((z0 | z1) + 0x01FFFFFF) >> 25; - - /* - * We normalize zu to the 2^54..s^55-1 range: it could be one - * bit too large at this point. This is done with a conditional - * right-shift that takes into account the sticky bit. - */ - zv = (zu >> 1) | (zu & 1); - w = zu >> 55; - zu ^= (zu ^ zv) & -w; - - /* - * Get the aggregate scaling factor: - * - * - Each exponent is biased by 1023. - * - * - Integral mantissas are scaled by 2^52, hence an - * extra 52 bias for each exponent. - * - * - However, we right-shifted z by 50 bits, and then - * by 0 or 1 extra bit (depending on the value of w). - * - * In total, we must add the exponents, then subtract - * 2 * (1023 + 52), then add 50 + w. - */ - ex = (int)((x >> 52) & 0x7FF); - ey = (int)((y >> 52) & 0x7FF); - e = ex + ey - 2100 + (int)w; - - /* - * Sign bit is the XOR of the operand sign bits. - */ - s = (int)((x ^ y) >> 63); - - /* - * Corrective actions for zeros: if either of the operands is - * zero, then the computations above were wrong. Test for zero - * is whether ex or ey is zero. 
We just have to set the mantissa - * (zu) to zero, the FPR() function will normalize e. - */ - d = ((ex + 0x7FF) & (ey + 0x7FF)) >> 11; - zu &= -(uint64_t)d; - - /* - * FPR() packs the result and applies proper rounding. - */ - return FPR(s, e, zu); -} - -#endif // yyyASM_CORTEXM4- - -#if FALCON_ASM_CORTEXM4 // yyyASM_CORTEXM4+1 - -__attribute__((naked)) -fpr -fpr_div(fpr x __attribute__((unused)), fpr y __attribute__((unused))) -{ - __asm__ ( - "push { r4, r5, r6, r7, r8, r10, r11, lr }\n\t" - - "@ Extract mantissas of x and y, in r0:r4 and r2:r5, respectively.\n\t" - "@ We don't touch r1 and r3 as they contain the exponents and\n\t" - "@ signs, which we'll need later on.\n\t" - "ubfx r4, r1, #0, #20\n\t" - "ubfx r5, r3, #0, #20\n\t" - "orrs r4, r4, #0x00100000\n\t" - "orrs r5, r5, #0x00100000\n\t" - "\n\t" - "@ Perform bit-by-bit division. We want a 56-bit result in r8:r10\n\t" - "@ (low bit is 0). Bits come from the carry flag and are\n\t" - "@ injected with rrx, i.e. in position 31; we thus get bits in\n\t" - "@ the reverse order. 
Bits accumulate in r8; after the first 24\n\t" - "@ bits, we move the quotient bits to r10.\n\t" - "eors r8, r8\n\t" - "\n\t" - -#define DIVSTEP \ - "subs r6, r0, r2\n\t" \ - "sbcs r7, r4, r5\n\t" \ - "rrx r8, r8\n\t" \ - "ands r6, r2, r8, asr #31\n\t" \ - "ands r7, r5, r8, asr #31\n\t" \ - "subs r0, r6\n\t" \ - "sbcs r4, r7\n\t" \ - "adds r0, r0, r0\n\t" \ - "adcs r4, r4, r4\n\t" - -#define DIVSTEP4 DIVSTEP DIVSTEP DIVSTEP DIVSTEP -#define DIVSTEP8 DIVSTEP4 DIVSTEP4 - - DIVSTEP8 - DIVSTEP8 - DIVSTEP8 - - "\n\t" - "@ We have the first 24 bits of the quotient, move them to r10.\n\t" - "rbit r10, r8\n\t" - "\n\t" - - DIVSTEP8 - DIVSTEP8 - DIVSTEP8 - DIVSTEP4 DIVSTEP DIVSTEP DIVSTEP - -#undef DIVSTEP -#undef DIVSTEP4 -#undef DIVSTEP8 - - "\n\t" - "@ Lowest bit will be set if remainder is non-zero at this point\n\t" - "@ (this is the 'sticky' bit).\n\t" - "subs r0, #1\n\t" - "sbcs r4, #0\n\t" - "rrx r8, r8\n\t" - "\n\t" - "@ We now have the next (low) 32 bits of the quotient.\n\t" - "rbit r8, r8\n\t" - "\n\t" - "@ Since both operands had their top bit set, we know that the\n\t" - "@ result at this point is in 2^54..2^56-1. We scale it down\n\t" - "@ to 2^54..2^55-1 with a conditional shift. We also write the\n\t" - "@ result in r4:r5. 
If the shift is done, r6 will contain -1.\n\t" - "ands r4, r8, #1\n\t" - "lsrs r6, r10, #23\n\t" - "rsbs r6, r6, #0\n\t" - "orrs r4, r4, r8, lsr #1\n\t" - "orrs r4, r4, r10, lsl #31\n\t" - "lsrs r5, r10, #1\n\t" - "eors r8, r8, r4\n\t" - "eors r10, r10, r5\n\t" - "bics r8, r8, r6\n\t" - "bics r10, r10, r6\n\t" - "eors r4, r4, r8\n\t" - "eors r5, r5, r10\n\t" - "\n\t" - "@ Compute aggregate exponent: ex - ey + 1022 + w\n\t" - "@ (where w = 1 if the conditional shift was done, 0 otherwise)\n\t" - "@ But we subtract 1 because the injection of the mantissa high\n\t" - "@ bit will increment the exponent by 1.\n\t" - "lsls r0, r1, #1\n\t" - "lsls r2, r3, #1\n\t" - "lsrs r0, r0, #21\n\t" - "addw r7, r0, #0x7FF @ save ex + 2047 in r7\n\t" - "subs r0, r0, r2, lsr #21\n\t" - "addw r0, r0, #1021\n\t" - "subs r0, r6\n\t" - "\n\t" - "@ If the x operand was zero, then the computation was wrong and\n\t" - "@ the result is zero. Also, if the result exponent is zero or\n\t" - "@ negative, then the mantissa shall be clamped to zero. Since r0\n\t" - "@ contains the result exponent minus 1, we test on r0 being\n\t" - "@ strictly negative.\n\t" - "mvns r2, r0\n\t" - "ands r2, r2, r7, lsl #20\n\t" - "ands r0, r0, r2, asr #31\n\t" - "ands r4, r4, r2, asr #31\n\t" - "ands r5, r5, r2, asr #31\n\t" - "\n\t" - "@ Sign is the XOR of the sign of the operands. This is true in\n\t" - "@ all cases, including very small results (exponent underflow)\n\t" - "@ and zeros.\n\t" - "eors r1, r3\n\t" - "bfc r1, #0, #31\n\t" - "\n\t" - "@ Plug in the exponent.\n\t" - "bfi r1, r0, #20, #11\n\t" - "\n\t" - "@ Shift back to the normal 53-bit mantissa, with rounding.\n\t" - "@ Mantissa goes into r0:r1. 
For r1, we must use an addition\n\t" - "@ because the rounding may have triggered a carry, that should\n\t" - "@ be added to the exponent.\n\t" - "movs r6, r4\n\t" - "lsrs r0, r4, #2\n\t" - "orrs r0, r0, r5, lsl #30\n\t" - "adds r1, r1, r5, lsr #2\n\t" - "ands r6, #0x7\n\t" - "movs r3, #0xC8\n\t" - "lsrs r3, r6\n\t" - "ands r3, #1\n\t" - "adds r0, r3\n\t" - "adcs r1, #0\n\t" - "\n\t" - "pop { r4, r5, r6, r7, r8, r10, r11, pc }\n\t" - ); -} - -#else // yyyASM_CORTEXM4+0 - -fpr -fpr_div(fpr x, fpr y) -{ - uint64_t xu, yu, q, q2, w; - int i, ex, ey, e, d, s; - - /* - * Extract mantissas of x and y (unsigned). - */ - xu = (x & (((uint64_t)1 << 52) - 1)) | ((uint64_t)1 << 52); - yu = (y & (((uint64_t)1 << 52) - 1)) | ((uint64_t)1 << 52); - - /* - * Perform bit-by-bit division of xu by yu. We run it for 55 bits. - */ - q = 0; - for (i = 0; i < 55; i ++) { - /* - * If yu is less than or equal xu, then subtract it and - * push a 1 in the quotient; otherwise, leave xu unchanged - * and push a 0. - */ - uint64_t b; - - b = ((xu - yu) >> 63) - 1; - xu -= b & yu; - q |= b & 1; - xu <<= 1; - q <<= 1; - } - - /* - * We got 55 bits in the quotient, followed by an extra zero. We - * want that 56th bit to be "sticky": it should be a 1 if and - * only if the remainder (xu) is non-zero. - */ - q |= (xu | -xu) >> 63; - - /* - * Quotient is at most 2^56-1. Its top bit may be zero, but in - * that case the next-to-top bit will be a one, since the - * initial xu and yu were both in the 2^52..2^53-1 range. - * We perform a conditional shift to normalize q to the - * 2^54..2^55-1 range (with the bottom bit being sticky). - */ - q2 = (q >> 1) | (q & 1); - w = q >> 55; - q ^= (q ^ q2) & -w; - - /* - * Extract exponents to compute the scaling factor: - * - * - Each exponent is biased and we scaled them up by - * 52 bits; but these biases will cancel out. - * - * - The division loop produced a 55-bit shifted result, - * so we must scale it down by 55 bits. 
- * - * - If w = 1, we right-shifted the integer by 1 bit, - * hence we must add 1 to the scaling. - */ - ex = (int)((x >> 52) & 0x7FF); - ey = (int)((y >> 52) & 0x7FF); - e = ex - ey - 55 + (int)w; - - /* - * Sign is the XOR of the signs of the operands. - */ - s = (int)((x ^ y) >> 63); - - /* - * Corrective actions for zeros: if x = 0, then the computation - * is wrong, and we must clamp e and q to 0. We do not care - * about the case y = 0 (as per assumptions in this module, - * the caller does not perform divisions by zero). - */ - d = (ex + 0x7FF) >> 11; - s &= d; - e &= -d; - q &= -(uint64_t)d; - - /* - * FPR() packs the result and applies proper rounding. - */ - return FPR(s, e, q); -} - -#endif // yyyASM_CORTEXM4- - -#if FALCON_ASM_CORTEXM4 // yyyASM_CORTEXM4+1 - -__attribute__((naked)) -fpr -fpr_sqrt(fpr x __attribute__((unused))) -{ - __asm__ ( - "push { r4, r5, r6, r7, r8, r10, r11, lr }\n\t" - "\n\t" - "@ Extract mantissa (r0:r1) and exponent (r2). We assume that the\n\t" - "@ sign is positive. 
If the source is zero, then the mantissa is\n\t" - "@ set to 0.\n\t" - "lsrs r2, r1, #20\n\t" - "bfc r1, #20, #12\n\t" - "addw r3, r2, #0x7FF\n\t" - "subw r2, r2, #1023\n\t" - "lsrs r3, r3, #11\n\t" - "orrs r1, r1, r3, lsl #20\n\t" - "\n\t" - "@ If the exponent is odd, then multiply mantissa by 2 and subtract\n\t" - "@ 1 from the exponent.\n\t" - "ands r3, r2, #1\n\t" - "subs r2, r2, r3\n\t" - "rsbs r3, r3, #0\n\t" - "ands r4, r1, r3\n\t" - "ands r3, r0\n\t" - "adds r0, r3\n\t" - "adcs r1, r4\n\t" - "\n\t" - "@ Left-shift the mantissa by 9 bits to put it in the\n\t" - "@ 2^61..2^63-1 range (unless it is exactly 0).\n\t" - "lsls r1, r1, #9\n\t" - "orrs r1, r1, r0, lsr #23\n\t" - "lsls r0, r0, #9\n\t" - "\n\t" - "@ Compute the square root bit-by-bit.\n\t" - "@ There are 54 iterations; first 30 can work on top word only.\n\t" - "@ q = r3 (bit-reversed)\n\t" - "@ s = r5\n\t" - "eors r3, r3\n\t" - "eors r5, r5\n\t" - -#define SQRT_STEP_HI(bit) \ - "orrs r6, r5, #(1 << (" #bit "))\n\t" \ - "subs r7, r1, r6\n\t" \ - "rrx r3, r3\n\t" \ - "ands r6, r6, r3, asr #31\n\t" \ - "subs r1, r1, r6\n\t" \ - "lsrs r6, r3, #31\n\t" \ - "orrs r5, r5, r6, lsl #((" #bit ") + 1)\n\t" \ - "adds r0, r0\n\t" \ - "adcs r1, r1\n\t" - -#define SQRT_STEP_HIx5(b) \ - SQRT_STEP_HI((b)+4) \ - SQRT_STEP_HI((b)+3) \ - SQRT_STEP_HI((b)+2) \ - SQRT_STEP_HI((b)+1) \ - SQRT_STEP_HI(b) - - SQRT_STEP_HIx5(25) - SQRT_STEP_HIx5(20) - SQRT_STEP_HIx5(15) - SQRT_STEP_HIx5(10) - SQRT_STEP_HIx5(5) - SQRT_STEP_HIx5(0) - -#undef SQRT_STEP_HI -#undef SQRT_STEP_HIx5 - - "@ Top 30 bits of the result must be reversed: they were\n\t" - "@ accumulated with rrx (hence from the top bit).\n\t" - "rbit r3, r3\n\t" - "\n\t" - "@ For the next 24 iterations, we must use two-word operations.\n\t" - "@ bits of q now accumulate in r4\n\t" - "@ s is in r6:r5\n\t" - "eors r4, r4\n\t" - "eors r6, r6\n\t" - "\n\t" - "@ First iteration is special because the potential bit goes into\n\t" - "@ r5, not r6.\n\t" - "orrs r7, r6, #(1 << 
31)\n\t" - "subs r8, r0, r7\n\t" - "sbcs r10, r1, r5\n\t" - "rrx r4, r4\n\t" - "ands r7, r7, r4, asr #31\n\t" - "ands r8, r5, r4, asr #31\n\t" - "subs r0, r0, r7\n\t" - "sbcs r1, r1, r8\n\t" - "lsrs r7, r4, #31\n\t" - "orrs r5, r5, r4, lsr #31\n\t" - "adds r0, r0\n\t" - "adcs r1, r1\n\t" - -#define SQRT_STEP_LO(bit) \ - "orrs r7, r6, #(1 << (" #bit "))\n\t" \ - "subs r8, r0, r7\n\t" \ - "sbcs r10, r1, r5\n\t" \ - "rrx r4, r4\n\t" \ - "ands r7, r7, r4, asr #31\n\t" \ - "ands r8, r5, r4, asr #31\n\t" \ - "subs r0, r0, r7\n\t" \ - "sbcs r1, r1, r8\n\t" \ - "lsrs r7, r4, #31\n\t" \ - "orrs r6, r6, r7, lsl #((" #bit ") + 1)\n\t" \ - "adds r0, r0\n\t" \ - "adcs r1, r1\n\t" - -#define SQRT_STEP_LOx4(b) \ - SQRT_STEP_LO((b)+3) \ - SQRT_STEP_LO((b)+2) \ - SQRT_STEP_LO((b)+1) \ - SQRT_STEP_LO(b) - - SQRT_STEP_LO(30) - SQRT_STEP_LO(29) - SQRT_STEP_LO(28) - SQRT_STEP_LOx4(24) - SQRT_STEP_LOx4(20) - SQRT_STEP_LOx4(16) - SQRT_STEP_LOx4(12) - SQRT_STEP_LOx4(8) - -#undef SQRT_STEP_LO -#undef SQRT_STEP_LOx4 - - "@ Put low 24 bits in the right order.\n\t" - "rbit r4, r4\n\t" - "\n\t" - "@ We have a 54-bit result; compute the 55-th bit as the 'sticky'\n\t" - "@ bit: it is non-zero if and only if r0:r1 is non-zero. We put the\n\t" - "@ three low bits (including the sticky bit) in r5.\n\t" - "orrs r0, r1\n\t" - "rsbs r1, r0, #0\n\t" - "orrs r0, r1\n\t" - "lsls r5, r4, #1\n\t" - "orrs r5, r5, r0, lsr #31\n\t" - "ands r5, #0x7\n\t" - "\n\t" - "@ Compute the rounding: r6 is set to 0 or 1, and will be added\n\t" - "@ to the mantissa.\n\t" - "movs r6, #0xC8\n\t" - "lsrs r6, r5\n\t" - "ands r6, #1\n\t" - "\n\t" - "@ Put the mantissa (53 bits, in the 2^52..2^53-1 range) in r0:r1\n\t" - "@ (rounding not applied yet).\n\t" - "lsrs r0, r4, #1\n\t" - "orrs r0, r0, r3, lsl #23\n\t" - "lsrs r1, r3, #9\n\t" - "\n\t" - "@ Compute new exponent. This is half the old one (then reencoded\n\t" - "@ by adding 1023). Exception: if the mantissa is zero, then the\n\t" - "@ encoded exponent is set to 0. 
At that point, if the mantissa\n\t" - "@ is non-zero, then its high bit (bit 52, i.e. bit 20 of r1) is\n\t" - "@ non-zero. Note that the exponent cannot go out of range.\n\t" - "lsrs r2, r2, #1\n\t" - "addw r2, r2, #1023\n\t" - "lsrs r5, r1, #20\n\t" - "rsbs r5, r5, #0\n\t" - "ands r2, r5\n\t" - "\n\t" - "@ Place exponent. This overwrites the high bit of the mantissa.\n\t" - "bfi r1, r2, #20, #11\n\t" - "\n\t" - "@ Apply rounding. This may create a carry that will spill into\n\t" - "@ the exponent, which is exactly what should be done in that case\n\t" - "@ (i.e. increment the exponent).\n\t" - "adds r0, r0, r6\n\t" - "adcs r1, r1, #0\n\t" - "\n\t" - "pop { r4, r5, r6, r7, r8, r10, r11, pc }\n\t" - ); -} - -#else // yyyASM_CORTEXM4+0 - -fpr -fpr_sqrt(fpr x) -{ - uint64_t xu, q, s, r; - int ex, e; - - /* - * Extract the mantissa and the exponent. We don't care about - * the sign: by assumption, the operand is nonnegative. - * We want the "true" exponent corresponding to a mantissa - * in the 1..2 range. - */ - xu = (x & (((uint64_t)1 << 52) - 1)) | ((uint64_t)1 << 52); - ex = (int)((x >> 52) & 0x7FF); - e = ex - 1023; - - /* - * If the exponent is odd, double the mantissa and decrement - * the exponent. The exponent is then halved to account for - * the square root. - */ - xu += xu & -(uint64_t)(e & 1); - e >>= 1; - - /* - * Double the mantissa. - */ - xu <<= 1; - - /* - * We now have a mantissa in the 2^53..2^55-1 range. It - * represents a value between 1 (inclusive) and 4 (exclusive) - * in fixed point notation (with 53 fractional bits). We - * compute the square root bit by bit. - */ - q = 0; - s = 0; - r = (uint64_t)1 << 53; - for (int i = 0; i < 54; i ++) { - uint64_t t, b; - - t = s + r; - b = ((xu - t) >> 63) - 1; - s += (r << 1) & b; - xu -= t & b; - q += r & b; - xu <<= 1; - r >>= 1; - } - - /* - * Now, q is a rounded-low 54-bit value, with a leading 1, - * 52 fractional digits, and an additional guard bit. 
We add - * an extra sticky bit to account for what remains of the operand. - */ - q <<= 1; - q |= (xu | -xu) >> 63; - - /* - * Result q is in the 2^54..2^55-1 range; we bias the exponent - * by 54 bits (the value e at that point contains the "true" - * exponent, but q is now considered an integer, i.e. scaled - * up. - */ - e -= 54; - - /* - * Corrective action for an operand of value zero. - */ - q &= -(uint64_t)((ex + 0x7FF) >> 11); - - /* - * Apply rounding and back result. - */ - return FPR(0, e, q); -} - -#endif // yyyASM_CORTEXM4- - -uint64_t -fpr_expm_p63(fpr x, fpr ccs) -{ - /* - * Polynomial approximation of exp(-x) is taken from FACCT: - * https://eprint.iacr.org/2018/1234 - * Specifically, values are extracted from the implementation - * referenced from the FACCT article, and available at: - * https://github.com/raykzhao/gaussian - * Here, the coefficients have been scaled up by 2^63 and - * converted to integers. - * - * Tests over more than 24 billions of random inputs in the - * 0..log(2) range have never shown a deviation larger than - * 2^(-50) from the true mathematical value. - */ - static const uint64_t C[] = { - 0x00000004741183A3u, - 0x00000036548CFC06u, - 0x0000024FDCBF140Au, - 0x0000171D939DE045u, - 0x0000D00CF58F6F84u, - 0x000680681CF796E3u, - 0x002D82D8305B0FEAu, - 0x011111110E066FD0u, - 0x0555555555070F00u, - 0x155555555581FF00u, - 0x400000000002B400u, - 0x7FFFFFFFFFFF4800u, - 0x8000000000000000u - }; - - uint64_t z, y; - unsigned u; - uint32_t z0, z1, y0, y1; - uint64_t a, b; - - y = C[0]; - z = (uint64_t)fpr_trunc(fpr_mul(x, fpr_ptwo63)) << 1; - for (u = 1; u < (sizeof C) / sizeof(C[0]); u ++) { - /* - * Compute product z * y over 128 bits, but keep only - * the top 64 bits. 
- * - * TODO: On some architectures/compilers we could use - * some intrinsics (__umulh() on MSVC) or other compiler - * extensions (unsigned __int128 on GCC / Clang) for - * improved speed; however, most 64-bit architectures - * also have appropriate IEEE754 floating-point support, - * which is better. - */ - uint64_t c; - - z0 = (uint32_t)z; - z1 = (uint32_t)(z >> 32); - y0 = (uint32_t)y; - y1 = (uint32_t)(y >> 32); - a = ((uint64_t)z0 * (uint64_t)y1) - + (((uint64_t)z0 * (uint64_t)y0) >> 32); - b = ((uint64_t)z1 * (uint64_t)y0); - c = (a >> 32) + (b >> 32); - c += (((uint64_t)(uint32_t)a + (uint64_t)(uint32_t)b) >> 32); - c += (uint64_t)z1 * (uint64_t)y1; - y = C[u] - c; - } - - /* - * The scaling factor must be applied at the end. Since y is now - * in fixed-point notation, we have to convert the factor to the - * same format, and do an extra integer multiplication. - */ - z = (uint64_t)fpr_trunc(fpr_mul(ccs, fpr_ptwo63)) << 1; - z0 = (uint32_t)z; - z1 = (uint32_t)(z >> 32); - y0 = (uint32_t)y; - y1 = (uint32_t)(y >> 32); - a = ((uint64_t)z0 * (uint64_t)y1) - + (((uint64_t)z0 * (uint64_t)y0) >> 32); - b = ((uint64_t)z1 * (uint64_t)y0); - y = (a >> 32) + (b >> 32); - y += (((uint64_t)(uint32_t)a + (uint64_t)(uint32_t)b) >> 32); - y += (uint64_t)z1 * (uint64_t)y1; - - return y; -} - -const fpr fpr_gm_tab[] = { - 0, 0, - 9223372036854775808U, 4607182418800017408U, - 4604544271217802189U, 4604544271217802189U, - 13827916308072577997U, 4604544271217802189U, - 4606496786581982534U, 4600565431771507043U, - 13823937468626282851U, 4606496786581982534U, - 4600565431771507043U, 4606496786581982534U, - 13829868823436758342U, 4600565431771507043U, - 4607009347991985328U, 4596196889902818827U, - 13819568926757594635U, 4607009347991985328U, - 4603179351334086856U, 4605664432017547683U, - 13829036468872323491U, 4603179351334086856U, - 4605664432017547683U, 4603179351334086856U, - 13826551388188862664U, 4605664432017547683U, - 4596196889902818827U, 4607009347991985328U, - 
13830381384846761136U, 4596196889902818827U, - 4607139046673687846U, 4591727299969791020U, - 13815099336824566828U, 4607139046673687846U, - 4603889326261607894U, 4605137878724712257U, - 13828509915579488065U, 4603889326261607894U, - 4606118860100255153U, 4602163548591158843U, - 13825535585445934651U, 4606118860100255153U, - 4598900923775164166U, 4606794571824115162U, - 13830166608678890970U, 4598900923775164166U, - 4606794571824115162U, 4598900923775164166U, - 13822272960629939974U, 4606794571824115162U, - 4602163548591158843U, 4606118860100255153U, - 13829490896955030961U, 4602163548591158843U, - 4605137878724712257U, 4603889326261607894U, - 13827261363116383702U, 4605137878724712257U, - 4591727299969791020U, 4607139046673687846U, - 13830511083528463654U, 4591727299969791020U, - 4607171569234046334U, 4587232218149935124U, - 13810604255004710932U, 4607171569234046334U, - 4604224084862889120U, 4604849113969373103U, - 13828221150824148911U, 4604224084862889120U, - 4606317631232591731U, 4601373767755717824U, - 13824745804610493632U, 4606317631232591731U, - 4599740487990714333U, 4606655894547498725U, - 13830027931402274533U, 4599740487990714333U, - 4606912484326125783U, 4597922303871901467U, - 13821294340726677275U, 4606912484326125783U, - 4602805845399633902U, 4605900952042040894U, - 13829272988896816702U, 4602805845399633902U, - 4605409869824231233U, 4603540801876750389U, - 13826912838731526197U, 4605409869824231233U, - 4594454542771183930U, 4607084929468638487U, - 13830456966323414295U, 4594454542771183930U, - 4607084929468638487U, 4594454542771183930U, - 13817826579625959738U, 4607084929468638487U, - 4603540801876750389U, 4605409869824231233U, - 13828781906679007041U, 4603540801876750389U, - 4605900952042040894U, 4602805845399633902U, - 13826177882254409710U, 4605900952042040894U, - 4597922303871901467U, 4606912484326125783U, - 13830284521180901591U, 4597922303871901467U, - 4606655894547498725U, 4599740487990714333U, - 13823112524845490141U, 4606655894547498725U, - 
4601373767755717824U, 4606317631232591731U, - 13829689668087367539U, 4601373767755717824U, - 4604849113969373103U, 4604224084862889120U, - 13827596121717664928U, 4604849113969373103U, - 4587232218149935124U, 4607171569234046334U, - 13830543606088822142U, 4587232218149935124U, - 4607179706000002317U, 4582730748936808062U, - 13806102785791583870U, 4607179706000002317U, - 4604386048625945823U, 4604698657331085206U, - 13828070694185861014U, 4604386048625945823U, - 4606409688975526202U, 4600971798440897930U, - 13824343835295673738U, 4606409688975526202U, - 4600154912527631775U, 4606578871587619388U, - 13829950908442395196U, 4600154912527631775U, - 4606963563043808649U, 4597061974398750563U, - 13820434011253526371U, 4606963563043808649U, - 4602994049708411683U, 4605784983948558848U, - 13829157020803334656U, 4602994049708411683U, - 4605539368864982914U, 4603361638657888991U, - 13826733675512664799U, 4605539368864982914U, - 4595327571478659014U, 4607049811591515049U, - 13830421848446290857U, 4595327571478659014U, - 4607114680469659603U, 4593485039402578702U, - 13816857076257354510U, 4607114680469659603U, - 4603716733069447353U, 4605276012900672507U, - 13828648049755448315U, 4603716733069447353U, - 4606012266443150634U, 4602550884377336506U, - 13825922921232112314U, 4606012266443150634U, - 4598476289818621559U, 4606856142606846307U, - 13830228179461622115U, 4598476289818621559U, - 4606727809065869586U, 4599322407794599425U, - 13822694444649375233U, 4606727809065869586U, - 4601771097584682078U, 4606220668805321205U, - 13829592705660097013U, 4601771097584682078U, - 4604995550503212910U, 4604058477489546729U, - 13827430514344322537U, 4604995550503212910U, - 4589965306122607094U, 4607158013403433018U, - 13830530050258208826U, 4589965306122607094U, - 4607158013403433018U, 4589965306122607094U, - 13813337342977382902U, 4607158013403433018U, - 4604058477489546729U, 4604995550503212910U, - 13828367587357988718U, 4604058477489546729U, - 4606220668805321205U, 4601771097584682078U, - 
13825143134439457886U, 4606220668805321205U, - 4599322407794599425U, 4606727809065869586U, - 13830099845920645394U, 4599322407794599425U, - 4606856142606846307U, 4598476289818621559U, - 13821848326673397367U, 4606856142606846307U, - 4602550884377336506U, 4606012266443150634U, - 13829384303297926442U, 4602550884377336506U, - 4605276012900672507U, 4603716733069447353U, - 13827088769924223161U, 4605276012900672507U, - 4593485039402578702U, 4607114680469659603U, - 13830486717324435411U, 4593485039402578702U, - 4607049811591515049U, 4595327571478659014U, - 13818699608333434822U, 4607049811591515049U, - 4603361638657888991U, 4605539368864982914U, - 13828911405719758722U, 4603361638657888991U, - 4605784983948558848U, 4602994049708411683U, - 13826366086563187491U, 4605784983948558848U, - 4597061974398750563U, 4606963563043808649U, - 13830335599898584457U, 4597061974398750563U, - 4606578871587619388U, 4600154912527631775U, - 13823526949382407583U, 4606578871587619388U, - 4600971798440897930U, 4606409688975526202U, - 13829781725830302010U, 4600971798440897930U, - 4604698657331085206U, 4604386048625945823U, - 13827758085480721631U, 4604698657331085206U, - 4582730748936808062U, 4607179706000002317U, - 13830551742854778125U, 4582730748936808062U, - 4607181740574479067U, 4578227681973159812U, - 13801599718827935620U, 4607181740574479067U, - 4604465633578481725U, 4604621949701367983U, - 13827993986556143791U, 4604465633578481725U, - 4606453861145241227U, 4600769149537129431U, - 13824141186391905239U, 4606453861145241227U, - 4600360675823176935U, 4606538458821337243U, - 13829910495676113051U, 4600360675823176935U, - 4606987119037722413U, 4596629994023683153U, - 13820002030878458961U, 4606987119037722413U, - 4603087070374583113U, 4605725276488455441U, - 13829097313343231249U, 4603087070374583113U, - 4605602459698789090U, 4603270878689749849U, - 13826642915544525657U, 4605602459698789090U, - 4595762727260045105U, 4607030246558998647U, - 13830402283413774455U, 4595762727260045105U, - 
4607127537664763515U, 4592606767730311893U, - 13815978804585087701U, 4607127537664763515U, - 4603803453461190356U, 4605207475328619533U, - 13828579512183395341U, 4603803453461190356U, - 4606066157444814153U, 4602357870542944470U, - 13825729907397720278U, 4606066157444814153U, - 4598688984595225406U, 4606826008603986804U, - 13830198045458762612U, 4598688984595225406U, - 4606761837001494797U, 4599112075441176914U, - 13822484112295952722U, 4606761837001494797U, - 4601967947786150793U, 4606170366472647579U, - 13829542403327423387U, 4601967947786150793U, - 4605067233569943231U, 4603974338538572089U, - 13827346375393347897U, 4605067233569943231U, - 4590846768565625881U, 4607149205763218185U, - 13830521242617993993U, 4590846768565625881U, - 4607165468267934125U, 4588998070480937184U, - 13812370107335712992U, 4607165468267934125U, - 4604141730443515286U, 4604922840319727473U, - 13828294877174503281U, 4604141730443515286U, - 4606269759522929756U, 4601573027631668967U, - 13824945064486444775U, 4606269759522929756U, - 4599531889160152938U, 4606692493141721470U, - 13830064529996497278U, 4599531889160152938U, - 4606884969294623682U, 4598262871476403630U, - 13821634908331179438U, 4606884969294623682U, - 4602710690099904183U, 4605957195211051218U, - 13829329232065827026U, 4602710690099904183U, - 4605343481119364930U, 4603629178146150899U, - 13827001215000926707U, 4605343481119364930U, - 4594016801320007031U, 4607100477024622401U, - 13830472513879398209U, 4594016801320007031U, - 4607068040143112603U, 4594891488091520602U, - 13818263524946296410U, 4607068040143112603U, - 4603451617570386922U, 4605475169017376660U, - 13828847205872152468U, 4603451617570386922U, - 4605843545406134034U, 4602900303344142735U, - 13826272340198918543U, 4605843545406134034U, - 4597492765973365521U, 4606938683557690074U, - 13830310720412465882U, 4597492765973365521U, - 4606618018794815019U, 4599948172872067014U, - 13823320209726842822U, 4606618018794815019U, - 4601173347964633034U, 4606364276725003740U, - 
13829736313579779548U, 4601173347964633034U, - 4604774382555066977U, 4604305528345395596U, - 13827677565200171404U, 4604774382555066977U, - 4585465300892538317U, 4607176315382986589U, - 13830548352237762397U, 4585465300892538317U, - 4607176315382986589U, 4585465300892538317U, - 13808837337747314125U, 4607176315382986589U, - 4604305528345395596U, 4604774382555066977U, - 13828146419409842785U, 4604305528345395596U, - 4606364276725003740U, 4601173347964633034U, - 13824545384819408842U, 4606364276725003740U, - 4599948172872067014U, 4606618018794815019U, - 13829990055649590827U, 4599948172872067014U, - 4606938683557690074U, 4597492765973365521U, - 13820864802828141329U, 4606938683557690074U, - 4602900303344142735U, 4605843545406134034U, - 13829215582260909842U, 4602900303344142735U, - 4605475169017376660U, 4603451617570386922U, - 13826823654425162730U, 4605475169017376660U, - 4594891488091520602U, 4607068040143112603U, - 13830440076997888411U, 4594891488091520602U, - 4607100477024622401U, 4594016801320007031U, - 13817388838174782839U, 4607100477024622401U, - 4603629178146150899U, 4605343481119364930U, - 13828715517974140738U, 4603629178146150899U, - 4605957195211051218U, 4602710690099904183U, - 13826082726954679991U, 4605957195211051218U, - 4598262871476403630U, 4606884969294623682U, - 13830257006149399490U, 4598262871476403630U, - 4606692493141721470U, 4599531889160152938U, - 13822903926014928746U, 4606692493141721470U, - 4601573027631668967U, 4606269759522929756U, - 13829641796377705564U, 4601573027631668967U, - 4604922840319727473U, 4604141730443515286U, - 13827513767298291094U, 4604922840319727473U, - 4588998070480937184U, 4607165468267934125U, - 13830537505122709933U, 4588998070480937184U, - 4607149205763218185U, 4590846768565625881U, - 13814218805420401689U, 4607149205763218185U, - 4603974338538572089U, 4605067233569943231U, - 13828439270424719039U, 4603974338538572089U, - 4606170366472647579U, 4601967947786150793U, - 13825339984640926601U, 4606170366472647579U, - 
4599112075441176914U, 4606761837001494797U, - 13830133873856270605U, 4599112075441176914U, - 4606826008603986804U, 4598688984595225406U, - 13822061021450001214U, 4606826008603986804U, - 4602357870542944470U, 4606066157444814153U, - 13829438194299589961U, 4602357870542944470U, - 4605207475328619533U, 4603803453461190356U, - 13827175490315966164U, 4605207475328619533U, - 4592606767730311893U, 4607127537664763515U, - 13830499574519539323U, 4592606767730311893U, - 4607030246558998647U, 4595762727260045105U, - 13819134764114820913U, 4607030246558998647U, - 4603270878689749849U, 4605602459698789090U, - 13828974496553564898U, 4603270878689749849U, - 4605725276488455441U, 4603087070374583113U, - 13826459107229358921U, 4605725276488455441U, - 4596629994023683153U, 4606987119037722413U, - 13830359155892498221U, 4596629994023683153U, - 4606538458821337243U, 4600360675823176935U, - 13823732712677952743U, 4606538458821337243U, - 4600769149537129431U, 4606453861145241227U, - 13829825898000017035U, 4600769149537129431U, - 4604621949701367983U, 4604465633578481725U, - 13827837670433257533U, 4604621949701367983U, - 4578227681973159812U, 4607181740574479067U, - 13830553777429254875U, 4578227681973159812U, - 4607182249242036882U, 4573724215515480177U, - 13797096252370255985U, 4607182249242036882U, - 4604505071555817232U, 4604583231088591477U, - 13827955267943367285U, 4604505071555817232U, - 4606475480113671417U, 4600667422348321968U, - 13824039459203097776U, 4606475480113671417U, - 4600463181646572228U, 4606517779747998088U, - 13829889816602773896U, 4600463181646572228U, - 4606998399608725124U, 4596413578358834022U, - 13819785615213609830U, 4606998399608725124U, - 4603133304188877240U, 4605694995810664660U, - 13829067032665440468U, 4603133304188877240U, - 4605633586259814045U, 4603225210076562971U, - 13826597246931338779U, 4605633586259814045U, - 4595979936813835462U, 4607019963775302583U, - 13830392000630078391U, 4595979936813835462U, - 4607133460805585796U, 4592167175087283203U, - 
13815539211942059011U, 4607133460805585796U, - 4603846496621587377U, 4605172808754305228U, - 13828544845609081036U, 4603846496621587377U, - 4606092657816072624U, 4602260871257280788U, - 13825632908112056596U, 4606092657816072624U, - 4598795050632330097U, 4606810452769876110U, - 13830182489624651918U, 4598795050632330097U, - 4606778366364612594U, 4599006600037663623U, - 13822378636892439431U, 4606778366364612594U, - 4602065906208722008U, 4606144763310860551U, - 13829516800165636359U, 4602065906208722008U, - 4605102686554936490U, 4603931940768740167U, - 13827303977623515975U, 4605102686554936490U, - 4591287158938884897U, 4607144295058764886U, - 13830516331913540694U, 4591287158938884897U, - 4607168688050493276U, 4588115294056142819U, - 13811487330910918627U, 4607168688050493276U, - 4604183020748362039U, 4604886103475043762U, - 13828258140329819570U, 4604183020748362039U, - 4606293848208650998U, 4601473544562720001U, - 13824845581417495809U, 4606293848208650998U, - 4599636300858866724U, 4606674353838411301U, - 13830046390693187109U, 4599636300858866724U, - 4606898891031025132U, 4598136582470364665U, - 13821508619325140473U, 4606898891031025132U, - 4602758354025980442U, 4605929219593405673U, - 13829301256448181481U, 4602758354025980442U, - 4605376811039722786U, 4603585091850767959U, - 13826957128705543767U, 4605376811039722786U, - 4594235767444503503U, 4607092871118901179U, - 13830464907973676987U, 4594235767444503503U, - 4607076652372832968U, 4594673119063280916U, - 13818045155918056724U, 4607076652372832968U, - 4603496309891590679U, 4605442656228245717U, - 13828814693083021525U, 4603496309891590679U, - 4605872393621214213U, 4602853162432841185U, - 13826225199287616993U, 4605872393621214213U, - 4597707695679609371U, 4606925748668145757U, - 13830297785522921565U, 4597707695679609371U, - 4606637115963965612U, 4599844446633109139U, - 13823216483487884947U, 4606637115963965612U, - 4601273700967202825U, 4606341107699334546U, - 13829713144554110354U, 4601273700967202825U, - 
4604811873195349477U, 4604264921241055824U, - 13827636958095831632U, 4604811873195349477U, - 4586348876009622851U, 4607174111710118367U, - 13830546148564894175U, 4586348876009622851U, - 4607178180169683960U, 4584498631466405633U, - 13807870668321181441U, 4607178180169683960U, - 4604345904647073908U, 4604736643460027021U, - 13828108680314802829U, 4604345904647073908U, - 4606387137437298591U, 4601072712526242277U, - 13824444749381018085U, 4606387137437298591U, - 4600051662802353687U, 4606598603759044570U, - 13829970640613820378U, 4600051662802353687U, - 4606951288507767453U, 4597277522845151878U, - 13820649559699927686U, 4606951288507767453U, - 4602947266358709886U, 4605814408482919348U, - 13829186445337695156U, 4602947266358709886U, - 4605507406967535927U, 4603406726595779752U, - 13826778763450555560U, 4605507406967535927U, - 4595109641634432498U, 4607059093103722971U, - 13830431129958498779U, 4595109641634432498U, - 4607107746899444102U, 4593797652641645341U, - 13817169689496421149U, 4607107746899444102U, - 4603673059103075106U, 4605309881318010327U, - 13828681918172786135U, 4603673059103075106U, - 4605984877841711338U, 4602646891659203088U, - 13826018928513978896U, 4605984877841711338U, - 4598369669086960528U, 4606870719641066940U, - 13830242756495842748U, 4598369669086960528U, - 4606710311774494716U, 4599427256825614420U, - 13822799293680390228U, 4606710311774494716U, - 4601672213217083403U, 4606245366082353408U, - 13829617402937129216U, 4601672213217083403U, - 4604959323120302796U, 4604100215502905499U, - 13827472252357681307U, 4604959323120302796U, - 4589524267239410099U, 4607161910007591876U, - 13830533946862367684U, 4589524267239410099U, - 4607153778602162496U, 4590406145430462614U, - 13813778182285238422U, 4607153778602162496U, - 4604016517974851588U, 4605031521104517324U, - 13828403557959293132U, 4604016517974851588U, - 4606195668621671667U, 4601869677011524443U, - 13825241713866300251U, 4606195668621671667U, - 4599217346014614711U, 4606744984357082948U, - 
13830117021211858756U, 4599217346014614711U, - 4606841238740778884U, 4598582729657176439U, - 13821954766511952247U, 4606841238740778884U, - 4602454542796181607U, 4606039359984203741U, - 13829411396838979549U, 4602454542796181607U, - 4605241877142478242U, 4603760198400967492U, - 13827132235255743300U, 4605241877142478242U, - 4593046061348462537U, 4607121277474223905U, - 13830493314328999713U, 4593046061348462537U, - 4607040195955932526U, 4595545269419264690U, - 13818917306274040498U, 4607040195955932526U, - 4603316355454250015U, 4605571053506370248U, - 13828943090361146056U, 4603316355454250015U, - 4605755272910869620U, 4603040651631881451U, - 13826412688486657259U, 4605755272910869620U, - 4596846128749438754U, 4606975506703684317U, - 13830347543558460125U, 4596846128749438754U, - 4606558823023444576U, 4600257918160607478U, - 13823629955015383286U, 4606558823023444576U, - 4600870609507958271U, 4606431930490633905U, - 13829803967345409713U, 4600870609507958271U, - 4604660425598397818U, 4604425958770613225U, - 13827797995625389033U, 4604660425598397818U, - 4580962600092897021U, 4607180892816495009U, - 13830552929671270817U, 4580962600092897021U, - 4607180892816495009U, 4580962600092897021U, - 13804334636947672829U, 4607180892816495009U, - 4604425958770613225U, 4604660425598397818U, - 13828032462453173626U, 4604425958770613225U, - 4606431930490633905U, 4600870609507958271U, - 13824242646362734079U, 4606431930490633905U, - 4600257918160607478U, 4606558823023444576U, - 13829930859878220384U, 4600257918160607478U, - 4606975506703684317U, 4596846128749438754U, - 13820218165604214562U, 4606975506703684317U, - 4603040651631881451U, 4605755272910869620U, - 13829127309765645428U, 4603040651631881451U, - 4605571053506370248U, 4603316355454250015U, - 13826688392309025823U, 4605571053506370248U, - 4595545269419264690U, 4607040195955932526U, - 13830412232810708334U, 4595545269419264690U, - 4607121277474223905U, 4593046061348462537U, - 13816418098203238345U, 4607121277474223905U, - 
4603760198400967492U, 4605241877142478242U, - 13828613913997254050U, 4603760198400967492U, - 4606039359984203741U, 4602454542796181607U, - 13825826579650957415U, 4606039359984203741U, - 4598582729657176439U, 4606841238740778884U, - 13830213275595554692U, 4598582729657176439U, - 4606744984357082948U, 4599217346014614711U, - 13822589382869390519U, 4606744984357082948U, - 4601869677011524443U, 4606195668621671667U, - 13829567705476447475U, 4601869677011524443U, - 4605031521104517324U, 4604016517974851588U, - 13827388554829627396U, 4605031521104517324U, - 4590406145430462614U, 4607153778602162496U, - 13830525815456938304U, 4590406145430462614U, - 4607161910007591876U, 4589524267239410099U, - 13812896304094185907U, 4607161910007591876U, - 4604100215502905499U, 4604959323120302796U, - 13828331359975078604U, 4604100215502905499U, - 4606245366082353408U, 4601672213217083403U, - 13825044250071859211U, 4606245366082353408U, - 4599427256825614420U, 4606710311774494716U, - 13830082348629270524U, 4599427256825614420U, - 4606870719641066940U, 4598369669086960528U, - 13821741705941736336U, 4606870719641066940U, - 4602646891659203088U, 4605984877841711338U, - 13829356914696487146U, 4602646891659203088U, - 4605309881318010327U, 4603673059103075106U, - 13827045095957850914U, 4605309881318010327U, - 4593797652641645341U, 4607107746899444102U, - 13830479783754219910U, 4593797652641645341U, - 4607059093103722971U, 4595109641634432498U, - 13818481678489208306U, 4607059093103722971U, - 4603406726595779752U, 4605507406967535927U, - 13828879443822311735U, 4603406726595779752U, - 4605814408482919348U, 4602947266358709886U, - 13826319303213485694U, 4605814408482919348U, - 4597277522845151878U, 4606951288507767453U, - 13830323325362543261U, 4597277522845151878U, - 4606598603759044570U, 4600051662802353687U, - 13823423699657129495U, 4606598603759044570U, - 4601072712526242277U, 4606387137437298591U, - 13829759174292074399U, 4601072712526242277U, - 4604736643460027021U, 4604345904647073908U, - 
13827717941501849716U, 4604736643460027021U, - 4584498631466405633U, 4607178180169683960U, - 13830550217024459768U, 4584498631466405633U, - 4607174111710118367U, 4586348876009622851U, - 13809720912864398659U, 4607174111710118367U, - 4604264921241055824U, 4604811873195349477U, - 13828183910050125285U, 4604264921241055824U, - 4606341107699334546U, 4601273700967202825U, - 13824645737821978633U, 4606341107699334546U, - 4599844446633109139U, 4606637115963965612U, - 13830009152818741420U, 4599844446633109139U, - 4606925748668145757U, 4597707695679609371U, - 13821079732534385179U, 4606925748668145757U, - 4602853162432841185U, 4605872393621214213U, - 13829244430475990021U, 4602853162432841185U, - 4605442656228245717U, 4603496309891590679U, - 13826868346746366487U, 4605442656228245717U, - 4594673119063280916U, 4607076652372832968U, - 13830448689227608776U, 4594673119063280916U, - 4607092871118901179U, 4594235767444503503U, - 13817607804299279311U, 4607092871118901179U, - 4603585091850767959U, 4605376811039722786U, - 13828748847894498594U, 4603585091850767959U, - 4605929219593405673U, 4602758354025980442U, - 13826130390880756250U, 4605929219593405673U, - 4598136582470364665U, 4606898891031025132U, - 13830270927885800940U, 4598136582470364665U, - 4606674353838411301U, 4599636300858866724U, - 13823008337713642532U, 4606674353838411301U, - 4601473544562720001U, 4606293848208650998U, - 13829665885063426806U, 4601473544562720001U, - 4604886103475043762U, 4604183020748362039U, - 13827555057603137847U, 4604886103475043762U, - 4588115294056142819U, 4607168688050493276U, - 13830540724905269084U, 4588115294056142819U, - 4607144295058764886U, 4591287158938884897U, - 13814659195793660705U, 4607144295058764886U, - 4603931940768740167U, 4605102686554936490U, - 13828474723409712298U, 4603931940768740167U, - 4606144763310860551U, 4602065906208722008U, - 13825437943063497816U, 4606144763310860551U, - 4599006600037663623U, 4606778366364612594U, - 13830150403219388402U, 4599006600037663623U, - 
4606810452769876110U, 4598795050632330097U, - 13822167087487105905U, 4606810452769876110U, - 4602260871257280788U, 4606092657816072624U, - 13829464694670848432U, 4602260871257280788U, - 4605172808754305228U, 4603846496621587377U, - 13827218533476363185U, 4605172808754305228U, - 4592167175087283203U, 4607133460805585796U, - 13830505497660361604U, 4592167175087283203U, - 4607019963775302583U, 4595979936813835462U, - 13819351973668611270U, 4607019963775302583U, - 4603225210076562971U, 4605633586259814045U, - 13829005623114589853U, 4603225210076562971U, - 4605694995810664660U, 4603133304188877240U, - 13826505341043653048U, 4605694995810664660U, - 4596413578358834022U, 4606998399608725124U, - 13830370436463500932U, 4596413578358834022U, - 4606517779747998088U, 4600463181646572228U, - 13823835218501348036U, 4606517779747998088U, - 4600667422348321968U, 4606475480113671417U, - 13829847516968447225U, 4600667422348321968U, - 4604583231088591477U, 4604505071555817232U, - 13827877108410593040U, 4604583231088591477U, - 4573724215515480177U, 4607182249242036882U, - 13830554286096812690U, 4573724215515480177U, - 4607182376410422530U, 4569220649180767418U, - 13792592686035543226U, 4607182376410422530U, - 4604524701268679793U, 4604563781218984604U, - 13827935818073760412U, 4604524701268679793U, - 4606486172460753999U, 4600616459743653188U, - 13823988496598428996U, 4606486172460753999U, - 4600514338912178239U, 4606507322377452870U, - 13829879359232228678U, 4600514338912178239U, - 4607003915349878877U, 4596305267720071930U, - 13819677304574847738U, 4607003915349878877U, - 4603156351203636159U, 4605679749231851918U, - 13829051786086627726U, 4603156351203636159U, - 4605649044311923410U, 4603202304363743346U, - 13826574341218519154U, 4605649044311923410U, - 4596088445927168004U, 4607014697483910382U, - 13830386734338686190U, 4596088445927168004U, - 4607136295912168606U, 4591947271803021404U, - 13815319308657797212U, 4607136295912168606U, - 4603867938232615808U, 4605155376589456981U, - 
13828527413444232789U, 4603867938232615808U, - 4606105796280968177U, 4602212250118051877U, - 13825584286972827685U, 4606105796280968177U, - 4598848011564831930U, 4606802552898869248U, - 13830174589753645056U, 4598848011564831930U, - 4606786509620734768U, 4598953786765296928U, - 13822325823620072736U, 4606786509620734768U, - 4602114767134999006U, 4606131849150971908U, - 13829503886005747716U, 4602114767134999006U, - 4605120315324767624U, 4603910660507251362U, - 13827282697362027170U, 4605120315324767624U, - 4591507261658050721U, 4607141713064252300U, - 13830513749919028108U, 4591507261658050721U, - 4607170170974224083U, 4587673791460508439U, - 13811045828315284247U, 4607170170974224083U, - 4604203581176243359U, 4604867640218014515U, - 13828239677072790323U, 4604203581176243359U, - 4606305777984577632U, 4601423692641949331U, - 13824795729496725139U, 4606305777984577632U, - 4599688422741010356U, 4606665164148251002U, - 13830037201003026810U, 4599688422741010356U, - 4606905728766014348U, 4598029484874872834U, - 13821401521729648642U, 4606905728766014348U, - 4602782121393764535U, 4605915122243179241U, - 13829287159097955049U, 4602782121393764535U, - 4605393374401988274U, 4603562972219549215U, - 13826935009074325023U, 4605393374401988274U, - 4594345179472540681U, 4607088942243446236U, - 13830460979098222044U, 4594345179472540681U, - 4607080832832247697U, 4594563856311064231U, - 13817935893165840039U, 4607080832832247697U, - 4603518581031047189U, 4605426297151190466U, - 13828798334005966274U, 4603518581031047189U, - 4605886709123365959U, 4602829525820289164U, - 13826201562675064972U, 4605886709123365959U, - 4597815040470278984U, 4606919157647773535U, - 13830291194502549343U, 4597815040470278984U, - 4606646545123403481U, 4599792496117920694U, - 13823164532972696502U, 4606646545123403481U, - 4601323770373937522U, 4606329407841126011U, - 13829701444695901819U, 4601323770373937522U, - 4604830524903495634U, 4604244531615310815U, - 13827616568470086623U, 4604830524903495634U, - 
4586790578280679046U, 4607172882816799076U, - 13830544919671574884U, 4586790578280679046U, - 4607178985458280057U, 4583614727651146525U, - 13806986764505922333U, 4607178985458280057U, - 4604366005771528720U, 4604717681185626434U, - 13828089718040402242U, 4604366005771528720U, - 4606398451906509788U, 4601022290077223616U, - 13824394326931999424U, 4606398451906509788U, - 4600103317933788342U, 4606588777269136769U, - 13829960814123912577U, 4600103317933788342U, - 4606957467106717424U, 4597169786279785693U, - 13820541823134561501U, 4606957467106717424U, - 4602970680601913687U, 4605799732098147061U, - 13829171768952922869U, 4602970680601913687U, - 4605523422498301790U, 4603384207141321914U, - 13826756243996097722U, 4605523422498301790U, - 4595218635031890910U, 4607054494135176056U, - 13830426530989951864U, 4595218635031890910U, - 4607111255739239816U, 4593688012422887515U, - 13817060049277663323U, 4607111255739239816U, - 4603694922063032361U, 4605292980606880364U, - 13828665017461656172U, 4603694922063032361U, - 4605998608960791335U, 4602598930031891166U, - 13825970966886666974U, 4605998608960791335U, - 4598423001813699022U, 4606863472012527185U, - 13830235508867302993U, 4598423001813699022U, - 4606719100629313491U, 4599374859150636784U, - 13822746896005412592U, 4606719100629313491U, - 4601721693286060937U, 4606233055365547081U, - 13829605092220322889U, 4601721693286060937U, - 4604977468824438271U, 4604079374282302598U, - 13827451411137078406U, 4604977468824438271U, - 4589744810590291021U, 4607160003989618959U, - 13830532040844394767U, 4589744810590291021U, - 4607155938267770208U, 4590185751760970393U, - 13813557788615746201U, 4607155938267770208U, - 4604037525321326463U, 4605013567986435066U, - 13828385604841210874U, 4604037525321326463U, - 4606208206518262803U, 4601820425647934753U, - 13825192462502710561U, 4606208206518262803U, - 4599269903251194481U, 4606736437002195879U, - 13830108473856971687U, 4599269903251194481U, - 4606848731493011465U, 4598529532600161144U, - 
13821901569454936952U, 4606848731493011465U, - 4602502755147763107U, 4606025850160239809U, - 13829397887015015617U, 4602502755147763107U, - 4605258978359093269U, 4603738491917026584U, - 13827110528771802392U, 4605258978359093269U, - 4593265590854265407U, 4607118021058468598U, - 13830490057913244406U, 4593265590854265407U, - 4607045045516813836U, 4595436449949385485U, - 13818808486804161293U, 4607045045516813836U, - 4603339021357904144U, 4605555245917486022U, - 13828927282772261830U, 4603339021357904144U, - 4605770164172969910U, 4603017373458244943U, - 13826389410313020751U, 4605770164172969910U, - 4596954088216812973U, 4606969576261663845U, - 13830341613116439653U, 4596954088216812973U, - 4606568886807728474U, 4600206446098256018U, - 13823578482953031826U, 4606568886807728474U, - 4600921238092511730U, 4606420848538580260U, - 13829792885393356068U, 4600921238092511730U, - 4604679572075463103U, 4604406033021674239U, - 13827778069876450047U, 4604679572075463103U, - 4581846703643734566U, 4607180341788068727U, - 13830552378642844535U, 4581846703643734566U, - 4607181359080094673U, 4579996072175835083U, - 13803368109030610891U, 4607181359080094673U, - 4604445825685214043U, 4604641218080103285U, - 13828013254934879093U, 4604445825685214043U, - 4606442934727379583U, 4600819913163773071U, - 13824191950018548879U, 4606442934727379583U, - 4600309328230211502U, 4606548680329491866U, - 13829920717184267674U, 4600309328230211502U, - 4606981354314050484U, 4596738097012783531U, - 13820110133867559339U, 4606981354314050484U, - 4603063884010218172U, 4605740310302420207U, - 13829112347157196015U, 4603063884010218172U, - 4605586791482848547U, 4603293641160266722U, - 13826665678015042530U, 4605586791482848547U, - 4595654028864046335U, 4607035262954517034U, - 13830407299809292842U, 4595654028864046335U, - 4607124449686274900U, 4592826452951465409U, - 13816198489806241217U, 4607124449686274900U, - 4603781852316960384U, 4605224709411790590U, - 13828596746266566398U, 4603781852316960384U, - 
4606052795787882823U, 4602406247776385022U, - 13825778284631160830U, 4606052795787882823U, - 4598635880488956483U, 4606833664420673202U, - 13830205701275449010U, 4598635880488956483U, - 4606753451050079834U, 4599164736579548843U, - 13822536773434324651U, 4606753451050079834U, - 4601918851211878557U, 4606183055233559255U, - 13829555092088335063U, 4601918851211878557U, - 4605049409688478101U, 4603995455647851249U, - 13827367492502627057U, 4605049409688478101U, - 4590626485056654602U, 4607151534426937478U, - 13830523571281713286U, 4590626485056654602U, - 4607163731439411601U, 4589303678145802340U, - 13812675715000578148U, 4607163731439411601U, - 4604121000955189926U, 4604941113561600762U, - 13828313150416376570U, 4604121000955189926U, - 4606257600839867033U, 4601622657843474729U, - 13824994694698250537U, 4606257600839867033U, - 4599479600326345459U, 4606701442584137310U, - 13830073479438913118U, 4599479600326345459U, - 4606877885424248132U, 4598316292140394014U, - 13821688328995169822U, 4606877885424248132U, - 4602686793990243041U, 4605971073215153165U, - 13829343110069928973U, 4602686793990243041U, - 4605326714874986465U, 4603651144395358093U, - 13827023181250133901U, 4605326714874986465U, - 4593907249284540294U, 4607104153983298999U, - 13830476190838074807U, 4593907249284540294U, - 4607063608453868552U, 4595000592312171144U, - 13818372629166946952U, 4607063608453868552U, - 4603429196809300824U, 4605491322423429598U, - 13828863359278205406U, 4603429196809300824U, - 4605829012964735987U, 4602923807199184054U, - 13826295844053959862U, 4605829012964735987U, - 4597385183080791534U, 4606945027305114062U, - 13830317064159889870U, 4597385183080791534U, - 4606608350964852124U, 4599999947619525579U, - 13823371984474301387U, 4606608350964852124U, - 4601123065313358619U, 4606375745674388705U, - 13829747782529164513U, 4601123065313358619U, - 4604755543975806820U, 4604325745441780828U, - 13827697782296556636U, 4604755543975806820U, - 4585023436363055487U, 4607177290141793710U, - 
13830549326996569518U, 4585023436363055487U, - 4607175255902437396U, 4585907115494236537U, - 13809279152349012345U, 4607175255902437396U, - 4604285253548209224U, 4604793159020491611U, - 13828165195875267419U, 4604285253548209224U, - 4606352730697093817U, 4601223560006786057U, - 13824595596861561865U, 4606352730697093817U, - 4599896339047301634U, 4606627607157935956U, - 13829999644012711764U, 4599896339047301634U, - 4606932257325205256U, 4597600270510262682U, - 13820972307365038490U, 4606932257325205256U, - 4602876755014813164U, 4605858005670328613U, - 13829230042525104421U, 4602876755014813164U, - 4605458946901419122U, 4603473988668005304U, - 13826846025522781112U, 4605458946901419122U, - 4594782329999411347U, 4607072388129742377U, - 13830444424984518185U, 4594782329999411347U, - 4607096716058023245U, 4594126307716900071U, - 13817498344571675879U, 4607096716058023245U, - 4603607160562208225U, 4605360179893335444U, - 13828732216748111252U, 4603607160562208225U, - 4605943243960030558U, 4602734543519989142U, - 13826106580374764950U, 4605943243960030558U, - 4598209407597805010U, 4606891971185517504U, - 13830264008040293312U, 4598209407597805010U, - 4606683463531482757U, 4599584122834874440U, - 13822956159689650248U, 4606683463531482757U, - 4601523323048804569U, 4606281842017099424U, - 13829653878871875232U, 4601523323048804569U, - 4604904503566677638U, 4604162403772767740U, - 13827534440627543548U, 4604904503566677638U, - 4588556721781247689U, 4607167120476811757U, - 13830539157331587565U, 4588556721781247689U, - 4607146792632922887U, 4591066993883984169U, - 13814439030738759977U, 4607146792632922887U, - 4603953166845776383U, 4605084992581147553U, - 13828457029435923361U, 4603953166845776383U, - 4606157602458368090U, 4602016966272225497U, - 13825389003127001305U, 4606157602458368090U, - 4599059363095165615U, 4606770142132396069U, - 13830142178987171877U, 4599059363095165615U, - 4606818271362779153U, 4598742041476147134U, - 13822114078330922942U, 4606818271362779153U, - 
4602309411551204896U, 4606079444829232727U, - 13829451481684008535U, 4602309411551204896U, - 4605190175055178825U, 4603825001630339212U, - 13827197038485115020U, 4605190175055178825U, - 4592387007752762956U, 4607130541380624519U, - 13830502578235400327U, 4592387007752762956U, - 4607025146816593591U, 4595871363584150300U, - 13819243400438926108U, 4607025146816593591U, - 4603248068256948438U, 4605618058006716661U, - 13828990094861492469U, 4603248068256948438U, - 4605710171610479304U, 4603110210506737381U, - 13826482247361513189U, 4605710171610479304U, - 4596521820799644122U, 4606992800820440327U, - 13830364837675216135U, 4596521820799644122U, - 4606528158595189433U, 4600411960456200676U, - 13823783997310976484U, 4606528158595189433U, - 4600718319105833937U, 4606464709641375231U, - 13829836746496151039U, 4600718319105833937U, - 4604602620643553229U, 4604485382263976838U, - 13827857419118752646U, 4604602620643553229U, - 4576459225186735875U, 4607182037296057423U, - 13830554074150833231U, 4576459225186735875U, - 4607182037296057423U, 4576459225186735875U, - 13799831262041511683U, 4607182037296057423U, - 4604485382263976838U, 4604602620643553229U, - 13827974657498329037U, 4604485382263976838U, - 4606464709641375231U, 4600718319105833937U, - 13824090355960609745U, 4606464709641375231U, - 4600411960456200676U, 4606528158595189433U, - 13829900195449965241U, 4600411960456200676U, - 4606992800820440327U, 4596521820799644122U, - 13819893857654419930U, 4606992800820440327U, - 4603110210506737381U, 4605710171610479304U, - 13829082208465255112U, 4603110210506737381U, - 4605618058006716661U, 4603248068256948438U, - 13826620105111724246U, 4605618058006716661U, - 4595871363584150300U, 4607025146816593591U, - 13830397183671369399U, 4595871363584150300U, - 4607130541380624519U, 4592387007752762956U, - 13815759044607538764U, 4607130541380624519U, - 4603825001630339212U, 4605190175055178825U, - 13828562211909954633U, 4603825001630339212U, - 4606079444829232727U, 4602309411551204896U, - 
13825681448405980704U, 4606079444829232727U, - 4598742041476147134U, 4606818271362779153U, - 13830190308217554961U, 4598742041476147134U, - 4606770142132396069U, 4599059363095165615U, - 13822431399949941423U, 4606770142132396069U, - 4602016966272225497U, 4606157602458368090U, - 13829529639313143898U, 4602016966272225497U, - 4605084992581147553U, 4603953166845776383U, - 13827325203700552191U, 4605084992581147553U, - 4591066993883984169U, 4607146792632922887U, - 13830518829487698695U, 4591066993883984169U, - 4607167120476811757U, 4588556721781247689U, - 13811928758636023497U, 4607167120476811757U, - 4604162403772767740U, 4604904503566677638U, - 13828276540421453446U, 4604162403772767740U, - 4606281842017099424U, 4601523323048804569U, - 13824895359903580377U, 4606281842017099424U, - 4599584122834874440U, 4606683463531482757U, - 13830055500386258565U, 4599584122834874440U, - 4606891971185517504U, 4598209407597805010U, - 13821581444452580818U, 4606891971185517504U, - 4602734543519989142U, 4605943243960030558U, - 13829315280814806366U, 4602734543519989142U, - 4605360179893335444U, 4603607160562208225U, - 13826979197416984033U, 4605360179893335444U, - 4594126307716900071U, 4607096716058023245U, - 13830468752912799053U, 4594126307716900071U, - 4607072388129742377U, 4594782329999411347U, - 13818154366854187155U, 4607072388129742377U, - 4603473988668005304U, 4605458946901419122U, - 13828830983756194930U, 4603473988668005304U, - 4605858005670328613U, 4602876755014813164U, - 13826248791869588972U, 4605858005670328613U, - 4597600270510262682U, 4606932257325205256U, - 13830304294179981064U, 4597600270510262682U, - 4606627607157935956U, 4599896339047301634U, - 13823268375902077442U, 4606627607157935956U, - 4601223560006786057U, 4606352730697093817U, - 13829724767551869625U, 4601223560006786057U, - 4604793159020491611U, 4604285253548209224U, - 13827657290402985032U, 4604793159020491611U, - 4585907115494236537U, 4607175255902437396U, - 13830547292757213204U, 4585907115494236537U, - 
4607177290141793710U, 4585023436363055487U, - 13808395473217831295U, 4607177290141793710U, - 4604325745441780828U, 4604755543975806820U, - 13828127580830582628U, 4604325745441780828U, - 4606375745674388705U, 4601123065313358619U, - 13824495102168134427U, 4606375745674388705U, - 4599999947619525579U, 4606608350964852124U, - 13829980387819627932U, 4599999947619525579U, - 4606945027305114062U, 4597385183080791534U, - 13820757219935567342U, 4606945027305114062U, - 4602923807199184054U, 4605829012964735987U, - 13829201049819511795U, 4602923807199184054U, - 4605491322423429598U, 4603429196809300824U, - 13826801233664076632U, 4605491322423429598U, - 4595000592312171144U, 4607063608453868552U, - 13830435645308644360U, 4595000592312171144U, - 4607104153983298999U, 4593907249284540294U, - 13817279286139316102U, 4607104153983298999U, - 4603651144395358093U, 4605326714874986465U, - 13828698751729762273U, 4603651144395358093U, - 4605971073215153165U, 4602686793990243041U, - 13826058830845018849U, 4605971073215153165U, - 4598316292140394014U, 4606877885424248132U, - 13830249922279023940U, 4598316292140394014U, - 4606701442584137310U, 4599479600326345459U, - 13822851637181121267U, 4606701442584137310U, - 4601622657843474729U, 4606257600839867033U, - 13829629637694642841U, 4601622657843474729U, - 4604941113561600762U, 4604121000955189926U, - 13827493037809965734U, 4604941113561600762U, - 4589303678145802340U, 4607163731439411601U, - 13830535768294187409U, 4589303678145802340U, - 4607151534426937478U, 4590626485056654602U, - 13813998521911430410U, 4607151534426937478U, - 4603995455647851249U, 4605049409688478101U, - 13828421446543253909U, 4603995455647851249U, - 4606183055233559255U, 4601918851211878557U, - 13825290888066654365U, 4606183055233559255U, - 4599164736579548843U, 4606753451050079834U, - 13830125487904855642U, 4599164736579548843U, - 4606833664420673202U, 4598635880488956483U, - 13822007917343732291U, 4606833664420673202U, - 4602406247776385022U, 4606052795787882823U, - 
13829424832642658631U, 4602406247776385022U, - 4605224709411790590U, 4603781852316960384U, - 13827153889171736192U, 4605224709411790590U, - 4592826452951465409U, 4607124449686274900U, - 13830496486541050708U, 4592826452951465409U, - 4607035262954517034U, 4595654028864046335U, - 13819026065718822143U, 4607035262954517034U, - 4603293641160266722U, 4605586791482848547U, - 13828958828337624355U, 4603293641160266722U, - 4605740310302420207U, 4603063884010218172U, - 13826435920864993980U, 4605740310302420207U, - 4596738097012783531U, 4606981354314050484U, - 13830353391168826292U, 4596738097012783531U, - 4606548680329491866U, 4600309328230211502U, - 13823681365084987310U, 4606548680329491866U, - 4600819913163773071U, 4606442934727379583U, - 13829814971582155391U, 4600819913163773071U, - 4604641218080103285U, 4604445825685214043U, - 13827817862539989851U, 4604641218080103285U, - 4579996072175835083U, 4607181359080094673U, - 13830553395934870481U, 4579996072175835083U, - 4607180341788068727U, 4581846703643734566U, - 13805218740498510374U, 4607180341788068727U, - 4604406033021674239U, 4604679572075463103U, - 13828051608930238911U, 4604406033021674239U, - 4606420848538580260U, 4600921238092511730U, - 13824293274947287538U, 4606420848538580260U, - 4600206446098256018U, 4606568886807728474U, - 13829940923662504282U, 4600206446098256018U, - 4606969576261663845U, 4596954088216812973U, - 13820326125071588781U, 4606969576261663845U, - 4603017373458244943U, 4605770164172969910U, - 13829142201027745718U, 4603017373458244943U, - 4605555245917486022U, 4603339021357904144U, - 13826711058212679952U, 4605555245917486022U, - 4595436449949385485U, 4607045045516813836U, - 13830417082371589644U, 4595436449949385485U, - 4607118021058468598U, 4593265590854265407U, - 13816637627709041215U, 4607118021058468598U, - 4603738491917026584U, 4605258978359093269U, - 13828631015213869077U, 4603738491917026584U, - 4606025850160239809U, 4602502755147763107U, - 13825874792002538915U, 4606025850160239809U, - 
4598529532600161144U, 4606848731493011465U, - 13830220768347787273U, 4598529532600161144U, - 4606736437002195879U, 4599269903251194481U, - 13822641940105970289U, 4606736437002195879U, - 4601820425647934753U, 4606208206518262803U, - 13829580243373038611U, 4601820425647934753U, - 4605013567986435066U, 4604037525321326463U, - 13827409562176102271U, 4605013567986435066U, - 4590185751760970393U, 4607155938267770208U, - 13830527975122546016U, 4590185751760970393U, - 4607160003989618959U, 4589744810590291021U, - 13813116847445066829U, 4607160003989618959U, - 4604079374282302598U, 4604977468824438271U, - 13828349505679214079U, 4604079374282302598U, - 4606233055365547081U, 4601721693286060937U, - 13825093730140836745U, 4606233055365547081U, - 4599374859150636784U, 4606719100629313491U, - 13830091137484089299U, 4599374859150636784U, - 4606863472012527185U, 4598423001813699022U, - 13821795038668474830U, 4606863472012527185U, - 4602598930031891166U, 4605998608960791335U, - 13829370645815567143U, 4602598930031891166U, - 4605292980606880364U, 4603694922063032361U, - 13827066958917808169U, 4605292980606880364U, - 4593688012422887515U, 4607111255739239816U, - 13830483292594015624U, 4593688012422887515U, - 4607054494135176056U, 4595218635031890910U, - 13818590671886666718U, 4607054494135176056U, - 4603384207141321914U, 4605523422498301790U, - 13828895459353077598U, 4603384207141321914U, - 4605799732098147061U, 4602970680601913687U, - 13826342717456689495U, 4605799732098147061U, - 4597169786279785693U, 4606957467106717424U, - 13830329503961493232U, 4597169786279785693U, - 4606588777269136769U, 4600103317933788342U, - 13823475354788564150U, 4606588777269136769U, - 4601022290077223616U, 4606398451906509788U, - 13829770488761285596U, 4601022290077223616U, - 4604717681185626434U, 4604366005771528720U, - 13827738042626304528U, 4604717681185626434U, - 4583614727651146525U, 4607178985458280057U, - 13830551022313055865U, 4583614727651146525U, - 4607172882816799076U, 4586790578280679046U, - 
13810162615135454854U, 4607172882816799076U, - 4604244531615310815U, 4604830524903495634U, - 13828202561758271442U, 4604244531615310815U, - 4606329407841126011U, 4601323770373937522U, - 13824695807228713330U, 4606329407841126011U, - 4599792496117920694U, 4606646545123403481U, - 13830018581978179289U, 4599792496117920694U, - 4606919157647773535U, 4597815040470278984U, - 13821187077325054792U, 4606919157647773535U, - 4602829525820289164U, 4605886709123365959U, - 13829258745978141767U, 4602829525820289164U, - 4605426297151190466U, 4603518581031047189U, - 13826890617885822997U, 4605426297151190466U, - 4594563856311064231U, 4607080832832247697U, - 13830452869687023505U, 4594563856311064231U, - 4607088942243446236U, 4594345179472540681U, - 13817717216327316489U, 4607088942243446236U, - 4603562972219549215U, 4605393374401988274U, - 13828765411256764082U, 4603562972219549215U, - 4605915122243179241U, 4602782121393764535U, - 13826154158248540343U, 4605915122243179241U, - 4598029484874872834U, 4606905728766014348U, - 13830277765620790156U, 4598029484874872834U, - 4606665164148251002U, 4599688422741010356U, - 13823060459595786164U, 4606665164148251002U, - 4601423692641949331U, 4606305777984577632U, - 13829677814839353440U, 4601423692641949331U, - 4604867640218014515U, 4604203581176243359U, - 13827575618031019167U, 4604867640218014515U, - 4587673791460508439U, 4607170170974224083U, - 13830542207828999891U, 4587673791460508439U, - 4607141713064252300U, 4591507261658050721U, - 13814879298512826529U, 4607141713064252300U, - 4603910660507251362U, 4605120315324767624U, - 13828492352179543432U, 4603910660507251362U, - 4606131849150971908U, 4602114767134999006U, - 13825486803989774814U, 4606131849150971908U, - 4598953786765296928U, 4606786509620734768U, - 13830158546475510576U, 4598953786765296928U, - 4606802552898869248U, 4598848011564831930U, - 13822220048419607738U, 4606802552898869248U, - 4602212250118051877U, 4606105796280968177U, - 13829477833135743985U, 4602212250118051877U, - 
4605155376589456981U, 4603867938232615808U, - 13827239975087391616U, 4605155376589456981U, - 4591947271803021404U, 4607136295912168606U, - 13830508332766944414U, 4591947271803021404U, - 4607014697483910382U, 4596088445927168004U, - 13819460482781943812U, 4607014697483910382U, - 4603202304363743346U, 4605649044311923410U, - 13829021081166699218U, 4603202304363743346U, - 4605679749231851918U, 4603156351203636159U, - 13826528388058411967U, 4605679749231851918U, - 4596305267720071930U, 4607003915349878877U, - 13830375952204654685U, 4596305267720071930U, - 4606507322377452870U, 4600514338912178239U, - 13823886375766954047U, 4606507322377452870U, - 4600616459743653188U, 4606486172460753999U, - 13829858209315529807U, 4600616459743653188U, - 4604563781218984604U, 4604524701268679793U, - 13827896738123455601U, 4604563781218984604U, - 4569220649180767418U, 4607182376410422530U, - 13830554413265198338U, 4569220649180767418U -}; - -const fpr fpr_p2_tab[] = { - 4611686018427387904U, - 4607182418800017408U, - 4602678819172646912U, - 4598175219545276416U, - 4593671619917905920U, - 4589168020290535424U, - 4584664420663164928U, - 4580160821035794432U, - 4575657221408423936U, - 4571153621781053440U, - 4566650022153682944U -}; - -#elif FALCON_FPNATIVE // yyyFPEMU+0 yyyFPNATIVE+1 - -const fpr fpr_gm_tab[] = { - {0}, {0}, /* unused */ - {-0.000000000000000000000000000}, { 1.000000000000000000000000000}, - { 0.707106781186547524400844362}, { 0.707106781186547524400844362}, - {-0.707106781186547524400844362}, { 0.707106781186547524400844362}, - { 0.923879532511286756128183189}, { 0.382683432365089771728459984}, - {-0.382683432365089771728459984}, { 0.923879532511286756128183189}, - { 0.382683432365089771728459984}, { 0.923879532511286756128183189}, - {-0.923879532511286756128183189}, { 0.382683432365089771728459984}, - { 0.980785280403230449126182236}, { 0.195090322016128267848284868}, - {-0.195090322016128267848284868}, { 0.980785280403230449126182236}, - { 
0.555570233019602224742830814}, { 0.831469612302545237078788378}, - {-0.831469612302545237078788378}, { 0.555570233019602224742830814}, - { 0.831469612302545237078788378}, { 0.555570233019602224742830814}, - {-0.555570233019602224742830814}, { 0.831469612302545237078788378}, - { 0.195090322016128267848284868}, { 0.980785280403230449126182236}, - {-0.980785280403230449126182236}, { 0.195090322016128267848284868}, - { 0.995184726672196886244836953}, { 0.098017140329560601994195564}, - {-0.098017140329560601994195564}, { 0.995184726672196886244836953}, - { 0.634393284163645498215171613}, { 0.773010453362736960810906610}, - {-0.773010453362736960810906610}, { 0.634393284163645498215171613}, - { 0.881921264348355029712756864}, { 0.471396736825997648556387626}, - {-0.471396736825997648556387626}, { 0.881921264348355029712756864}, - { 0.290284677254462367636192376}, { 0.956940335732208864935797887}, - {-0.956940335732208864935797887}, { 0.290284677254462367636192376}, - { 0.956940335732208864935797887}, { 0.290284677254462367636192376}, - {-0.290284677254462367636192376}, { 0.956940335732208864935797887}, - { 0.471396736825997648556387626}, { 0.881921264348355029712756864}, - {-0.881921264348355029712756864}, { 0.471396736825997648556387626}, - { 0.773010453362736960810906610}, { 0.634393284163645498215171613}, - {-0.634393284163645498215171613}, { 0.773010453362736960810906610}, - { 0.098017140329560601994195564}, { 0.995184726672196886244836953}, - {-0.995184726672196886244836953}, { 0.098017140329560601994195564}, - { 0.998795456205172392714771605}, { 0.049067674327418014254954977}, - {-0.049067674327418014254954977}, { 0.998795456205172392714771605}, - { 0.671558954847018400625376850}, { 0.740951125354959091175616897}, - {-0.740951125354959091175616897}, { 0.671558954847018400625376850}, - { 0.903989293123443331586200297}, { 0.427555093430282094320966857}, - {-0.427555093430282094320966857}, { 0.903989293123443331586200297}, - { 0.336889853392220050689253213}, { 
0.941544065183020778412509403}, - {-0.941544065183020778412509403}, { 0.336889853392220050689253213}, - { 0.970031253194543992603984207}, { 0.242980179903263889948274162}, - {-0.242980179903263889948274162}, { 0.970031253194543992603984207}, - { 0.514102744193221726593693839}, { 0.857728610000272069902269984}, - {-0.857728610000272069902269984}, { 0.514102744193221726593693839}, - { 0.803207531480644909806676513}, { 0.595699304492433343467036529}, - {-0.595699304492433343467036529}, { 0.803207531480644909806676513}, - { 0.146730474455361751658850130}, { 0.989176509964780973451673738}, - {-0.989176509964780973451673738}, { 0.146730474455361751658850130}, - { 0.989176509964780973451673738}, { 0.146730474455361751658850130}, - {-0.146730474455361751658850130}, { 0.989176509964780973451673738}, - { 0.595699304492433343467036529}, { 0.803207531480644909806676513}, - {-0.803207531480644909806676513}, { 0.595699304492433343467036529}, - { 0.857728610000272069902269984}, { 0.514102744193221726593693839}, - {-0.514102744193221726593693839}, { 0.857728610000272069902269984}, - { 0.242980179903263889948274162}, { 0.970031253194543992603984207}, - {-0.970031253194543992603984207}, { 0.242980179903263889948274162}, - { 0.941544065183020778412509403}, { 0.336889853392220050689253213}, - {-0.336889853392220050689253213}, { 0.941544065183020778412509403}, - { 0.427555093430282094320966857}, { 0.903989293123443331586200297}, - {-0.903989293123443331586200297}, { 0.427555093430282094320966857}, - { 0.740951125354959091175616897}, { 0.671558954847018400625376850}, - {-0.671558954847018400625376850}, { 0.740951125354959091175616897}, - { 0.049067674327418014254954977}, { 0.998795456205172392714771605}, - {-0.998795456205172392714771605}, { 0.049067674327418014254954977}, - { 0.999698818696204220115765650}, { 0.024541228522912288031734529}, - {-0.024541228522912288031734529}, { 0.999698818696204220115765650}, - { 0.689540544737066924616730630}, { 0.724247082951466920941069243}, - 
{-0.724247082951466920941069243}, { 0.689540544737066924616730630}, - { 0.914209755703530654635014829}, { 0.405241314004989870908481306}, - {-0.405241314004989870908481306}, { 0.914209755703530654635014829}, - { 0.359895036534988148775104572}, { 0.932992798834738887711660256}, - {-0.932992798834738887711660256}, { 0.359895036534988148775104572}, - { 0.975702130038528544460395766}, { 0.219101240156869797227737547}, - {-0.219101240156869797227737547}, { 0.975702130038528544460395766}, - { 0.534997619887097210663076905}, { 0.844853565249707073259571205}, - {-0.844853565249707073259571205}, { 0.534997619887097210663076905}, - { 0.817584813151583696504920884}, { 0.575808191417845300745972454}, - {-0.575808191417845300745972454}, { 0.817584813151583696504920884}, - { 0.170961888760301226363642357}, { 0.985277642388941244774018433}, - {-0.985277642388941244774018433}, { 0.170961888760301226363642357}, - { 0.992479534598709998156767252}, { 0.122410675199216198498704474}, - {-0.122410675199216198498704474}, { 0.992479534598709998156767252}, - { 0.615231590580626845484913563}, { 0.788346427626606262009164705}, - {-0.788346427626606262009164705}, { 0.615231590580626845484913563}, - { 0.870086991108711418652292404}, { 0.492898192229784036873026689}, - {-0.492898192229784036873026689}, { 0.870086991108711418652292404}, - { 0.266712757474898386325286515}, { 0.963776065795439866686464356}, - {-0.963776065795439866686464356}, { 0.266712757474898386325286515}, - { 0.949528180593036667195936074}, { 0.313681740398891476656478846}, - {-0.313681740398891476656478846}, { 0.949528180593036667195936074}, - { 0.449611329654606600046294579}, { 0.893224301195515320342416447}, - {-0.893224301195515320342416447}, { 0.449611329654606600046294579}, - { 0.757208846506484547575464054}, { 0.653172842953776764084203014}, - {-0.653172842953776764084203014}, { 0.757208846506484547575464054}, - { 0.073564563599667423529465622}, { 0.997290456678690216135597140}, - {-0.997290456678690216135597140}, { 
0.073564563599667423529465622}, - { 0.997290456678690216135597140}, { 0.073564563599667423529465622}, - {-0.073564563599667423529465622}, { 0.997290456678690216135597140}, - { 0.653172842953776764084203014}, { 0.757208846506484547575464054}, - {-0.757208846506484547575464054}, { 0.653172842953776764084203014}, - { 0.893224301195515320342416447}, { 0.449611329654606600046294579}, - {-0.449611329654606600046294579}, { 0.893224301195515320342416447}, - { 0.313681740398891476656478846}, { 0.949528180593036667195936074}, - {-0.949528180593036667195936074}, { 0.313681740398891476656478846}, - { 0.963776065795439866686464356}, { 0.266712757474898386325286515}, - {-0.266712757474898386325286515}, { 0.963776065795439866686464356}, - { 0.492898192229784036873026689}, { 0.870086991108711418652292404}, - {-0.870086991108711418652292404}, { 0.492898192229784036873026689}, - { 0.788346427626606262009164705}, { 0.615231590580626845484913563}, - {-0.615231590580626845484913563}, { 0.788346427626606262009164705}, - { 0.122410675199216198498704474}, { 0.992479534598709998156767252}, - {-0.992479534598709998156767252}, { 0.122410675199216198498704474}, - { 0.985277642388941244774018433}, { 0.170961888760301226363642357}, - {-0.170961888760301226363642357}, { 0.985277642388941244774018433}, - { 0.575808191417845300745972454}, { 0.817584813151583696504920884}, - {-0.817584813151583696504920884}, { 0.575808191417845300745972454}, - { 0.844853565249707073259571205}, { 0.534997619887097210663076905}, - {-0.534997619887097210663076905}, { 0.844853565249707073259571205}, - { 0.219101240156869797227737547}, { 0.975702130038528544460395766}, - {-0.975702130038528544460395766}, { 0.219101240156869797227737547}, - { 0.932992798834738887711660256}, { 0.359895036534988148775104572}, - {-0.359895036534988148775104572}, { 0.932992798834738887711660256}, - { 0.405241314004989870908481306}, { 0.914209755703530654635014829}, - {-0.914209755703530654635014829}, { 0.405241314004989870908481306}, - { 
0.724247082951466920941069243}, { 0.689540544737066924616730630}, - {-0.689540544737066924616730630}, { 0.724247082951466920941069243}, - { 0.024541228522912288031734529}, { 0.999698818696204220115765650}, - {-0.999698818696204220115765650}, { 0.024541228522912288031734529}, - { 0.999924701839144540921646491}, { 0.012271538285719926079408262}, - {-0.012271538285719926079408262}, { 0.999924701839144540921646491}, - { 0.698376249408972853554813503}, { 0.715730825283818654125532623}, - {-0.715730825283818654125532623}, { 0.698376249408972853554813503}, - { 0.919113851690057743908477789}, { 0.393992040061048108596188661}, - {-0.393992040061048108596188661}, { 0.919113851690057743908477789}, - { 0.371317193951837543411934967}, { 0.928506080473215565937167396}, - {-0.928506080473215565937167396}, { 0.371317193951837543411934967}, - { 0.978317370719627633106240097}, { 0.207111376192218549708116020}, - {-0.207111376192218549708116020}, { 0.978317370719627633106240097}, - { 0.545324988422046422313987347}, { 0.838224705554838043186996856}, - {-0.838224705554838043186996856}, { 0.545324988422046422313987347}, - { 0.824589302785025264474803737}, { 0.565731810783613197389765011}, - {-0.565731810783613197389765011}, { 0.824589302785025264474803737}, - { 0.183039887955140958516532578}, { 0.983105487431216327180301155}, - {-0.983105487431216327180301155}, { 0.183039887955140958516532578}, - { 0.993906970002356041546922813}, { 0.110222207293883058807899140}, - {-0.110222207293883058807899140}, { 0.993906970002356041546922813}, - { 0.624859488142386377084072816}, { 0.780737228572094478301588484}, - {-0.780737228572094478301588484}, { 0.624859488142386377084072816}, - { 0.876070094195406607095844268}, { 0.482183772079122748517344481}, - {-0.482183772079122748517344481}, { 0.876070094195406607095844268}, - { 0.278519689385053105207848526}, { 0.960430519415565811199035138}, - {-0.960430519415565811199035138}, { 0.278519689385053105207848526}, - { 0.953306040354193836916740383}, { 
0.302005949319228067003463232}, - {-0.302005949319228067003463232}, { 0.953306040354193836916740383}, - { 0.460538710958240023633181487}, { 0.887639620402853947760181617}, - {-0.887639620402853947760181617}, { 0.460538710958240023633181487}, - { 0.765167265622458925888815999}, { 0.643831542889791465068086063}, - {-0.643831542889791465068086063}, { 0.765167265622458925888815999}, - { 0.085797312344439890461556332}, { 0.996312612182778012627226190}, - {-0.996312612182778012627226190}, { 0.085797312344439890461556332}, - { 0.998118112900149207125155861}, { 0.061320736302208577782614593}, - {-0.061320736302208577782614593}, { 0.998118112900149207125155861}, - { 0.662415777590171761113069817}, { 0.749136394523459325469203257}, - {-0.749136394523459325469203257}, { 0.662415777590171761113069817}, - { 0.898674465693953843041976744}, { 0.438616238538527637647025738}, - {-0.438616238538527637647025738}, { 0.898674465693953843041976744}, - { 0.325310292162262934135954708}, { 0.945607325380521325730945387}, - {-0.945607325380521325730945387}, { 0.325310292162262934135954708}, - { 0.966976471044852109087220226}, { 0.254865659604514571553980779}, - {-0.254865659604514571553980779}, { 0.966976471044852109087220226}, - { 0.503538383725717558691867071}, { 0.863972856121586737918147054}, - {-0.863972856121586737918147054}, { 0.503538383725717558691867071}, - { 0.795836904608883536262791915}, { 0.605511041404325513920626941}, - {-0.605511041404325513920626941}, { 0.795836904608883536262791915}, - { 0.134580708507126186316358409}, { 0.990902635427780025108237011}, - {-0.990902635427780025108237011}, { 0.134580708507126186316358409}, - { 0.987301418157858382399815802}, { 0.158858143333861441684385360}, - {-0.158858143333861441684385360}, { 0.987301418157858382399815802}, - { 0.585797857456438860328080838}, { 0.810457198252594791726703434}, - {-0.810457198252594791726703434}, { 0.585797857456438860328080838}, - { 0.851355193105265142261290312}, { 0.524589682678468906215098464}, - 
{-0.524589682678468906215098464}, { 0.851355193105265142261290312}, - { 0.231058108280671119643236018}, { 0.972939952205560145467720114}, - {-0.972939952205560145467720114}, { 0.231058108280671119643236018}, - { 0.937339011912574923201899593}, { 0.348418680249434568419308588}, - {-0.348418680249434568419308588}, { 0.937339011912574923201899593}, - { 0.416429560097637182562598911}, { 0.909167983090522376563884788}, - {-0.909167983090522376563884788}, { 0.416429560097637182562598911}, - { 0.732654271672412834615546649}, { 0.680600997795453050594430464}, - {-0.680600997795453050594430464}, { 0.732654271672412834615546649}, - { 0.036807222941358832324332691}, { 0.999322384588349500896221011}, - {-0.999322384588349500896221011}, { 0.036807222941358832324332691}, - { 0.999322384588349500896221011}, { 0.036807222941358832324332691}, - {-0.036807222941358832324332691}, { 0.999322384588349500896221011}, - { 0.680600997795453050594430464}, { 0.732654271672412834615546649}, - {-0.732654271672412834615546649}, { 0.680600997795453050594430464}, - { 0.909167983090522376563884788}, { 0.416429560097637182562598911}, - {-0.416429560097637182562598911}, { 0.909167983090522376563884788}, - { 0.348418680249434568419308588}, { 0.937339011912574923201899593}, - {-0.937339011912574923201899593}, { 0.348418680249434568419308588}, - { 0.972939952205560145467720114}, { 0.231058108280671119643236018}, - {-0.231058108280671119643236018}, { 0.972939952205560145467720114}, - { 0.524589682678468906215098464}, { 0.851355193105265142261290312}, - {-0.851355193105265142261290312}, { 0.524589682678468906215098464}, - { 0.810457198252594791726703434}, { 0.585797857456438860328080838}, - {-0.585797857456438860328080838}, { 0.810457198252594791726703434}, - { 0.158858143333861441684385360}, { 0.987301418157858382399815802}, - {-0.987301418157858382399815802}, { 0.158858143333861441684385360}, - { 0.990902635427780025108237011}, { 0.134580708507126186316358409}, - {-0.134580708507126186316358409}, { 
0.990902635427780025108237011}, - { 0.605511041404325513920626941}, { 0.795836904608883536262791915}, - {-0.795836904608883536262791915}, { 0.605511041404325513920626941}, - { 0.863972856121586737918147054}, { 0.503538383725717558691867071}, - {-0.503538383725717558691867071}, { 0.863972856121586737918147054}, - { 0.254865659604514571553980779}, { 0.966976471044852109087220226}, - {-0.966976471044852109087220226}, { 0.254865659604514571553980779}, - { 0.945607325380521325730945387}, { 0.325310292162262934135954708}, - {-0.325310292162262934135954708}, { 0.945607325380521325730945387}, - { 0.438616238538527637647025738}, { 0.898674465693953843041976744}, - {-0.898674465693953843041976744}, { 0.438616238538527637647025738}, - { 0.749136394523459325469203257}, { 0.662415777590171761113069817}, - {-0.662415777590171761113069817}, { 0.749136394523459325469203257}, - { 0.061320736302208577782614593}, { 0.998118112900149207125155861}, - {-0.998118112900149207125155861}, { 0.061320736302208577782614593}, - { 0.996312612182778012627226190}, { 0.085797312344439890461556332}, - {-0.085797312344439890461556332}, { 0.996312612182778012627226190}, - { 0.643831542889791465068086063}, { 0.765167265622458925888815999}, - {-0.765167265622458925888815999}, { 0.643831542889791465068086063}, - { 0.887639620402853947760181617}, { 0.460538710958240023633181487}, - {-0.460538710958240023633181487}, { 0.887639620402853947760181617}, - { 0.302005949319228067003463232}, { 0.953306040354193836916740383}, - {-0.953306040354193836916740383}, { 0.302005949319228067003463232}, - { 0.960430519415565811199035138}, { 0.278519689385053105207848526}, - {-0.278519689385053105207848526}, { 0.960430519415565811199035138}, - { 0.482183772079122748517344481}, { 0.876070094195406607095844268}, - {-0.876070094195406607095844268}, { 0.482183772079122748517344481}, - { 0.780737228572094478301588484}, { 0.624859488142386377084072816}, - {-0.624859488142386377084072816}, { 0.780737228572094478301588484}, - { 
0.110222207293883058807899140}, { 0.993906970002356041546922813}, - {-0.993906970002356041546922813}, { 0.110222207293883058807899140}, - { 0.983105487431216327180301155}, { 0.183039887955140958516532578}, - {-0.183039887955140958516532578}, { 0.983105487431216327180301155}, - { 0.565731810783613197389765011}, { 0.824589302785025264474803737}, - {-0.824589302785025264474803737}, { 0.565731810783613197389765011}, - { 0.838224705554838043186996856}, { 0.545324988422046422313987347}, - {-0.545324988422046422313987347}, { 0.838224705554838043186996856}, - { 0.207111376192218549708116020}, { 0.978317370719627633106240097}, - {-0.978317370719627633106240097}, { 0.207111376192218549708116020}, - { 0.928506080473215565937167396}, { 0.371317193951837543411934967}, - {-0.371317193951837543411934967}, { 0.928506080473215565937167396}, - { 0.393992040061048108596188661}, { 0.919113851690057743908477789}, - {-0.919113851690057743908477789}, { 0.393992040061048108596188661}, - { 0.715730825283818654125532623}, { 0.698376249408972853554813503}, - {-0.698376249408972853554813503}, { 0.715730825283818654125532623}, - { 0.012271538285719926079408262}, { 0.999924701839144540921646491}, - {-0.999924701839144540921646491}, { 0.012271538285719926079408262}, - { 0.999981175282601142656990438}, { 0.006135884649154475359640235}, - {-0.006135884649154475359640235}, { 0.999981175282601142656990438}, - { 0.702754744457225302452914421}, { 0.711432195745216441522130290}, - {-0.711432195745216441522130290}, { 0.702754744457225302452914421}, - { 0.921514039342041943465396332}, { 0.388345046698826291624993541}, - {-0.388345046698826291624993541}, { 0.921514039342041943465396332}, - { 0.377007410216418256726567823}, { 0.926210242138311341974793388}, - {-0.926210242138311341974793388}, { 0.377007410216418256726567823}, - { 0.979569765685440534439326110}, { 0.201104634842091911558443546}, - {-0.201104634842091911558443546}, { 0.979569765685440534439326110}, - { 0.550457972936604802977289893}, { 
0.834862874986380056304401383}, - {-0.834862874986380056304401383}, { 0.550457972936604802977289893}, - { 0.828045045257755752067527592}, { 0.560661576197336023839710223}, - {-0.560661576197336023839710223}, { 0.828045045257755752067527592}, - { 0.189068664149806212754997837}, { 0.981963869109555264072848154}, - {-0.981963869109555264072848154}, { 0.189068664149806212754997837}, - { 0.994564570734255452119106243}, { 0.104121633872054579120943880}, - {-0.104121633872054579120943880}, { 0.994564570734255452119106243}, - { 0.629638238914927025372981341}, { 0.776888465673232450040827983}, - {-0.776888465673232450040827983}, { 0.629638238914927025372981341}, - { 0.879012226428633477831323711}, { 0.476799230063322133342158117}, - {-0.476799230063322133342158117}, { 0.879012226428633477831323711}, - { 0.284407537211271843618310615}, { 0.958703474895871555374645792}, - {-0.958703474895871555374645792}, { 0.284407537211271843618310615}, - { 0.955141168305770721498157712}, { 0.296150888243623824121786128}, - {-0.296150888243623824121786128}, { 0.955141168305770721498157712}, - { 0.465976495767966177902756065}, { 0.884797098430937780104007041}, - {-0.884797098430937780104007041}, { 0.465976495767966177902756065}, - { 0.769103337645579639346626069}, { 0.639124444863775743801488193}, - {-0.639124444863775743801488193}, { 0.769103337645579639346626069}, - { 0.091908956497132728624990979}, { 0.995767414467659793982495643}, - {-0.995767414467659793982495643}, { 0.091908956497132728624990979}, - { 0.998475580573294752208559038}, { 0.055195244349689939809447526}, - {-0.055195244349689939809447526}, { 0.998475580573294752208559038}, - { 0.666999922303637506650154222}, { 0.745057785441465962407907310}, - {-0.745057785441465962407907310}, { 0.666999922303637506650154222}, - { 0.901348847046022014570746093}, { 0.433093818853151968484222638}, - {-0.433093818853151968484222638}, { 0.901348847046022014570746093}, - { 0.331106305759876401737190737}, { 0.943593458161960361495301445}, - 
{-0.943593458161960361495301445}, { 0.331106305759876401737190737}, - { 0.968522094274417316221088329}, { 0.248927605745720168110682816}, - {-0.248927605745720168110682816}, { 0.968522094274417316221088329}, - { 0.508830142543107036931749324}, { 0.860866938637767279344583877}, - {-0.860866938637767279344583877}, { 0.508830142543107036931749324}, - { 0.799537269107905033500246232}, { 0.600616479383868926653875896}, - {-0.600616479383868926653875896}, { 0.799537269107905033500246232}, - { 0.140658239332849230714788846}, { 0.990058210262297105505906464}, - {-0.990058210262297105505906464}, { 0.140658239332849230714788846}, - { 0.988257567730749491404792538}, { 0.152797185258443427720336613}, - {-0.152797185258443427720336613}, { 0.988257567730749491404792538}, - { 0.590759701858874228423887908}, { 0.806847553543799272206514313}, - {-0.806847553543799272206514313}, { 0.590759701858874228423887908}, - { 0.854557988365400520767862276}, { 0.519355990165589587361829932}, - {-0.519355990165589587361829932}, { 0.854557988365400520767862276}, - { 0.237023605994367206867735915}, { 0.971503890986251775537099622}, - {-0.971503890986251775537099622}, { 0.237023605994367206867735915}, - { 0.939459223602189911962669246}, { 0.342660717311994397592781983}, - {-0.342660717311994397592781983}, { 0.939459223602189911962669246}, - { 0.422000270799799685941287941}, { 0.906595704514915365332960588}, - {-0.906595704514915365332960588}, { 0.422000270799799685941287941}, - { 0.736816568877369875090132520}, { 0.676092703575315960360419228}, - {-0.676092703575315960360419228}, { 0.736816568877369875090132520}, - { 0.042938256934940823077124540}, { 0.999077727752645382888781997}, - {-0.999077727752645382888781997}, { 0.042938256934940823077124540}, - { 0.999529417501093163079703322}, { 0.030674803176636625934021028}, - {-0.030674803176636625934021028}, { 0.999529417501093163079703322}, - { 0.685083667772700381362052545}, { 0.728464390448225196492035438}, - {-0.728464390448225196492035438}, { 
0.685083667772700381362052545}, - { 0.911706032005429851404397325}, { 0.410843171057903942183466675}, - {-0.410843171057903942183466675}, { 0.911706032005429851404397325}, - { 0.354163525420490382357395796}, { 0.935183509938947577642207480}, - {-0.935183509938947577642207480}, { 0.354163525420490382357395796}, - { 0.974339382785575860518721668}, { 0.225083911359792835991642120}, - {-0.225083911359792835991642120}, { 0.974339382785575860518721668}, - { 0.529803624686294668216054671}, { 0.848120344803297251279133563}, - {-0.848120344803297251279133563}, { 0.529803624686294668216054671}, - { 0.814036329705948361654516690}, { 0.580813958095764545075595272}, - {-0.580813958095764545075595272}, { 0.814036329705948361654516690}, - { 0.164913120489969921418189113}, { 0.986308097244598647863297524}, - {-0.986308097244598647863297524}, { 0.164913120489969921418189113}, - { 0.991709753669099522860049931}, { 0.128498110793793172624415589}, - {-0.128498110793793172624415589}, { 0.991709753669099522860049931}, - { 0.610382806276309452716352152}, { 0.792106577300212351782342879}, - {-0.792106577300212351782342879}, { 0.610382806276309452716352152}, - { 0.867046245515692651480195629}, { 0.498227666972781852410983869}, - {-0.498227666972781852410983869}, { 0.867046245515692651480195629}, - { 0.260794117915275518280186509}, { 0.965394441697689374550843858}, - {-0.965394441697689374550843858}, { 0.260794117915275518280186509}, - { 0.947585591017741134653387321}, { 0.319502030816015677901518272}, - {-0.319502030816015677901518272}, { 0.947585591017741134653387321}, - { 0.444122144570429231642069418}, { 0.895966249756185155914560282}, - {-0.895966249756185155914560282}, { 0.444122144570429231642069418}, - { 0.753186799043612482483430486}, { 0.657806693297078656931182264}, - {-0.657806693297078656931182264}, { 0.753186799043612482483430486}, - { 0.067443919563664057897972422}, { 0.997723066644191609848546728}, - {-0.997723066644191609848546728}, { 0.067443919563664057897972422}, - { 
0.996820299291165714972629398}, { 0.079682437971430121147120656}, - {-0.079682437971430121147120656}, { 0.996820299291165714972629398}, - { 0.648514401022112445084560551}, { 0.761202385484261814029709836}, - {-0.761202385484261814029709836}, { 0.648514401022112445084560551}, - { 0.890448723244757889952150560}, { 0.455083587126343823535869268}, - {-0.455083587126343823535869268}, { 0.890448723244757889952150560}, - { 0.307849640041534893682063646}, { 0.951435020969008369549175569}, - {-0.951435020969008369549175569}, { 0.307849640041534893682063646}, - { 0.962121404269041595429604316}, { 0.272621355449948984493347477}, - {-0.272621355449948984493347477}, { 0.962121404269041595429604316}, - { 0.487550160148435954641485027}, { 0.873094978418290098636085973}, - {-0.873094978418290098636085973}, { 0.487550160148435954641485027}, - { 0.784556597155575233023892575}, { 0.620057211763289178646268191}, - {-0.620057211763289178646268191}, { 0.784556597155575233023892575}, - { 0.116318630911904767252544319}, { 0.993211949234794533104601012}, - {-0.993211949234794533104601012}, { 0.116318630911904767252544319}, - { 0.984210092386929073193874387}, { 0.177004220412148756196839844}, - {-0.177004220412148756196839844}, { 0.984210092386929073193874387}, - { 0.570780745886967280232652864}, { 0.821102514991104679060430820}, - {-0.821102514991104679060430820}, { 0.570780745886967280232652864}, - { 0.841554977436898409603499520}, { 0.540171472729892881297845480}, - {-0.540171472729892881297845480}, { 0.841554977436898409603499520}, - { 0.213110319916091373967757518}, { 0.977028142657754351485866211}, - {-0.977028142657754351485866211}, { 0.213110319916091373967757518}, - { 0.930766961078983731944872340}, { 0.365612997804773870011745909}, - {-0.365612997804773870011745909}, { 0.930766961078983731944872340}, - { 0.399624199845646828544117031}, { 0.916679059921042663116457013}, - {-0.916679059921042663116457013}, { 0.399624199845646828544117031}, - { 0.720002507961381629076682999}, { 
0.693971460889654009003734389}, - {-0.693971460889654009003734389}, { 0.720002507961381629076682999}, - { 0.018406729905804820927366313}, { 0.999830581795823422015722275}, - {-0.999830581795823422015722275}, { 0.018406729905804820927366313}, - { 0.999830581795823422015722275}, { 0.018406729905804820927366313}, - {-0.018406729905804820927366313}, { 0.999830581795823422015722275}, - { 0.693971460889654009003734389}, { 0.720002507961381629076682999}, - {-0.720002507961381629076682999}, { 0.693971460889654009003734389}, - { 0.916679059921042663116457013}, { 0.399624199845646828544117031}, - {-0.399624199845646828544117031}, { 0.916679059921042663116457013}, - { 0.365612997804773870011745909}, { 0.930766961078983731944872340}, - {-0.930766961078983731944872340}, { 0.365612997804773870011745909}, - { 0.977028142657754351485866211}, { 0.213110319916091373967757518}, - {-0.213110319916091373967757518}, { 0.977028142657754351485866211}, - { 0.540171472729892881297845480}, { 0.841554977436898409603499520}, - {-0.841554977436898409603499520}, { 0.540171472729892881297845480}, - { 0.821102514991104679060430820}, { 0.570780745886967280232652864}, - {-0.570780745886967280232652864}, { 0.821102514991104679060430820}, - { 0.177004220412148756196839844}, { 0.984210092386929073193874387}, - {-0.984210092386929073193874387}, { 0.177004220412148756196839844}, - { 0.993211949234794533104601012}, { 0.116318630911904767252544319}, - {-0.116318630911904767252544319}, { 0.993211949234794533104601012}, - { 0.620057211763289178646268191}, { 0.784556597155575233023892575}, - {-0.784556597155575233023892575}, { 0.620057211763289178646268191}, - { 0.873094978418290098636085973}, { 0.487550160148435954641485027}, - {-0.487550160148435954641485027}, { 0.873094978418290098636085973}, - { 0.272621355449948984493347477}, { 0.962121404269041595429604316}, - {-0.962121404269041595429604316}, { 0.272621355449948984493347477}, - { 0.951435020969008369549175569}, { 0.307849640041534893682063646}, - 
{-0.307849640041534893682063646}, { 0.951435020969008369549175569}, - { 0.455083587126343823535869268}, { 0.890448723244757889952150560}, - {-0.890448723244757889952150560}, { 0.455083587126343823535869268}, - { 0.761202385484261814029709836}, { 0.648514401022112445084560551}, - {-0.648514401022112445084560551}, { 0.761202385484261814029709836}, - { 0.079682437971430121147120656}, { 0.996820299291165714972629398}, - {-0.996820299291165714972629398}, { 0.079682437971430121147120656}, - { 0.997723066644191609848546728}, { 0.067443919563664057897972422}, - {-0.067443919563664057897972422}, { 0.997723066644191609848546728}, - { 0.657806693297078656931182264}, { 0.753186799043612482483430486}, - {-0.753186799043612482483430486}, { 0.657806693297078656931182264}, - { 0.895966249756185155914560282}, { 0.444122144570429231642069418}, - {-0.444122144570429231642069418}, { 0.895966249756185155914560282}, - { 0.319502030816015677901518272}, { 0.947585591017741134653387321}, - {-0.947585591017741134653387321}, { 0.319502030816015677901518272}, - { 0.965394441697689374550843858}, { 0.260794117915275518280186509}, - {-0.260794117915275518280186509}, { 0.965394441697689374550843858}, - { 0.498227666972781852410983869}, { 0.867046245515692651480195629}, - {-0.867046245515692651480195629}, { 0.498227666972781852410983869}, - { 0.792106577300212351782342879}, { 0.610382806276309452716352152}, - {-0.610382806276309452716352152}, { 0.792106577300212351782342879}, - { 0.128498110793793172624415589}, { 0.991709753669099522860049931}, - {-0.991709753669099522860049931}, { 0.128498110793793172624415589}, - { 0.986308097244598647863297524}, { 0.164913120489969921418189113}, - {-0.164913120489969921418189113}, { 0.986308097244598647863297524}, - { 0.580813958095764545075595272}, { 0.814036329705948361654516690}, - {-0.814036329705948361654516690}, { 0.580813958095764545075595272}, - { 0.848120344803297251279133563}, { 0.529803624686294668216054671}, - {-0.529803624686294668216054671}, { 
0.848120344803297251279133563}, - { 0.225083911359792835991642120}, { 0.974339382785575860518721668}, - {-0.974339382785575860518721668}, { 0.225083911359792835991642120}, - { 0.935183509938947577642207480}, { 0.354163525420490382357395796}, - {-0.354163525420490382357395796}, { 0.935183509938947577642207480}, - { 0.410843171057903942183466675}, { 0.911706032005429851404397325}, - {-0.911706032005429851404397325}, { 0.410843171057903942183466675}, - { 0.728464390448225196492035438}, { 0.685083667772700381362052545}, - {-0.685083667772700381362052545}, { 0.728464390448225196492035438}, - { 0.030674803176636625934021028}, { 0.999529417501093163079703322}, - {-0.999529417501093163079703322}, { 0.030674803176636625934021028}, - { 0.999077727752645382888781997}, { 0.042938256934940823077124540}, - {-0.042938256934940823077124540}, { 0.999077727752645382888781997}, - { 0.676092703575315960360419228}, { 0.736816568877369875090132520}, - {-0.736816568877369875090132520}, { 0.676092703575315960360419228}, - { 0.906595704514915365332960588}, { 0.422000270799799685941287941}, - {-0.422000270799799685941287941}, { 0.906595704514915365332960588}, - { 0.342660717311994397592781983}, { 0.939459223602189911962669246}, - {-0.939459223602189911962669246}, { 0.342660717311994397592781983}, - { 0.971503890986251775537099622}, { 0.237023605994367206867735915}, - {-0.237023605994367206867735915}, { 0.971503890986251775537099622}, - { 0.519355990165589587361829932}, { 0.854557988365400520767862276}, - {-0.854557988365400520767862276}, { 0.519355990165589587361829932}, - { 0.806847553543799272206514313}, { 0.590759701858874228423887908}, - {-0.590759701858874228423887908}, { 0.806847553543799272206514313}, - { 0.152797185258443427720336613}, { 0.988257567730749491404792538}, - {-0.988257567730749491404792538}, { 0.152797185258443427720336613}, - { 0.990058210262297105505906464}, { 0.140658239332849230714788846}, - {-0.140658239332849230714788846}, { 0.990058210262297105505906464}, - { 
0.600616479383868926653875896}, { 0.799537269107905033500246232}, - {-0.799537269107905033500246232}, { 0.600616479383868926653875896}, - { 0.860866938637767279344583877}, { 0.508830142543107036931749324}, - {-0.508830142543107036931749324}, { 0.860866938637767279344583877}, - { 0.248927605745720168110682816}, { 0.968522094274417316221088329}, - {-0.968522094274417316221088329}, { 0.248927605745720168110682816}, - { 0.943593458161960361495301445}, { 0.331106305759876401737190737}, - {-0.331106305759876401737190737}, { 0.943593458161960361495301445}, - { 0.433093818853151968484222638}, { 0.901348847046022014570746093}, - {-0.901348847046022014570746093}, { 0.433093818853151968484222638}, - { 0.745057785441465962407907310}, { 0.666999922303637506650154222}, - {-0.666999922303637506650154222}, { 0.745057785441465962407907310}, - { 0.055195244349689939809447526}, { 0.998475580573294752208559038}, - {-0.998475580573294752208559038}, { 0.055195244349689939809447526}, - { 0.995767414467659793982495643}, { 0.091908956497132728624990979}, - {-0.091908956497132728624990979}, { 0.995767414467659793982495643}, - { 0.639124444863775743801488193}, { 0.769103337645579639346626069}, - {-0.769103337645579639346626069}, { 0.639124444863775743801488193}, - { 0.884797098430937780104007041}, { 0.465976495767966177902756065}, - {-0.465976495767966177902756065}, { 0.884797098430937780104007041}, - { 0.296150888243623824121786128}, { 0.955141168305770721498157712}, - {-0.955141168305770721498157712}, { 0.296150888243623824121786128}, - { 0.958703474895871555374645792}, { 0.284407537211271843618310615}, - {-0.284407537211271843618310615}, { 0.958703474895871555374645792}, - { 0.476799230063322133342158117}, { 0.879012226428633477831323711}, - {-0.879012226428633477831323711}, { 0.476799230063322133342158117}, - { 0.776888465673232450040827983}, { 0.629638238914927025372981341}, - {-0.629638238914927025372981341}, { 0.776888465673232450040827983}, - { 0.104121633872054579120943880}, { 
0.994564570734255452119106243}, - {-0.994564570734255452119106243}, { 0.104121633872054579120943880}, - { 0.981963869109555264072848154}, { 0.189068664149806212754997837}, - {-0.189068664149806212754997837}, { 0.981963869109555264072848154}, - { 0.560661576197336023839710223}, { 0.828045045257755752067527592}, - {-0.828045045257755752067527592}, { 0.560661576197336023839710223}, - { 0.834862874986380056304401383}, { 0.550457972936604802977289893}, - {-0.550457972936604802977289893}, { 0.834862874986380056304401383}, - { 0.201104634842091911558443546}, { 0.979569765685440534439326110}, - {-0.979569765685440534439326110}, { 0.201104634842091911558443546}, - { 0.926210242138311341974793388}, { 0.377007410216418256726567823}, - {-0.377007410216418256726567823}, { 0.926210242138311341974793388}, - { 0.388345046698826291624993541}, { 0.921514039342041943465396332}, - {-0.921514039342041943465396332}, { 0.388345046698826291624993541}, - { 0.711432195745216441522130290}, { 0.702754744457225302452914421}, - {-0.702754744457225302452914421}, { 0.711432195745216441522130290}, - { 0.006135884649154475359640235}, { 0.999981175282601142656990438}, - {-0.999981175282601142656990438}, { 0.006135884649154475359640235}, - { 0.999995293809576171511580126}, { 0.003067956762965976270145365}, - {-0.003067956762965976270145365}, { 0.999995293809576171511580126}, - { 0.704934080375904908852523758}, { 0.709272826438865651316533772}, - {-0.709272826438865651316533772}, { 0.704934080375904908852523758}, - { 0.922701128333878570437264227}, { 0.385516053843918864075607949}, - {-0.385516053843918864075607949}, { 0.922701128333878570437264227}, - { 0.379847208924051170576281147}, { 0.925049240782677590302371869}, - {-0.925049240782677590302371869}, { 0.379847208924051170576281147}, - { 0.980182135968117392690210009}, { 0.198098410717953586179324918}, - {-0.198098410717953586179324918}, { 0.980182135968117392690210009}, - { 0.553016705580027531764226988}, { 0.833170164701913186439915922}, - 
{-0.833170164701913186439915922}, { 0.553016705580027531764226988}, - { 0.829761233794523042469023765}, { 0.558118531220556115693702964}, - {-0.558118531220556115693702964}, { 0.829761233794523042469023765}, - { 0.192080397049892441679288205}, { 0.981379193313754574318224190}, - {-0.981379193313754574318224190}, { 0.192080397049892441679288205}, - { 0.994879330794805620591166107}, { 0.101069862754827824987887585}, - {-0.101069862754827824987887585}, { 0.994879330794805620591166107}, - { 0.632018735939809021909403706}, { 0.774953106594873878359129282}, - {-0.774953106594873878359129282}, { 0.632018735939809021909403706}, - { 0.880470889052160770806542929}, { 0.474100214650550014398580015}, - {-0.474100214650550014398580015}, { 0.880470889052160770806542929}, - { 0.287347459544729526477331841}, { 0.957826413027532890321037029}, - {-0.957826413027532890321037029}, { 0.287347459544729526477331841}, - { 0.956045251349996443270479823}, { 0.293219162694258650606608599}, - {-0.293219162694258650606608599}, { 0.956045251349996443270479823}, - { 0.468688822035827933697617870}, { 0.883363338665731594736308015}, - {-0.883363338665731594736308015}, { 0.468688822035827933697617870}, - { 0.771060524261813773200605759}, { 0.636761861236284230413943435}, - {-0.636761861236284230413943435}, { 0.771060524261813773200605759}, - { 0.094963495329638998938034312}, { 0.995480755491926941769171600}, - {-0.995480755491926941769171600}, { 0.094963495329638998938034312}, - { 0.998640218180265222418199049}, { 0.052131704680283321236358216}, - {-0.052131704680283321236358216}, { 0.998640218180265222418199049}, - { 0.669282588346636065720696366}, { 0.743007952135121693517362293}, - {-0.743007952135121693517362293}, { 0.669282588346636065720696366}, - { 0.902673318237258806751502391}, { 0.430326481340082633908199031}, - {-0.430326481340082633908199031}, { 0.902673318237258806751502391}, - { 0.333999651442009404650865481}, { 0.942573197601446879280758735}, - {-0.942573197601446879280758735}, { 
0.333999651442009404650865481}, - { 0.969281235356548486048290738}, { 0.245955050335794611599924709}, - {-0.245955050335794611599924709}, { 0.969281235356548486048290738}, - { 0.511468850437970399504391001}, { 0.859301818357008404783582139}, - {-0.859301818357008404783582139}, { 0.511468850437970399504391001}, - { 0.801376171723140219430247777}, { 0.598160706996342311724958652}, - {-0.598160706996342311724958652}, { 0.801376171723140219430247777}, - { 0.143695033150294454819773349}, { 0.989622017463200834623694454}, - {-0.989622017463200834623694454}, { 0.143695033150294454819773349}, - { 0.988721691960323767604516485}, { 0.149764534677321517229695737}, - {-0.149764534677321517229695737}, { 0.988721691960323767604516485}, - { 0.593232295039799808047809426}, { 0.805031331142963597922659282}, - {-0.805031331142963597922659282}, { 0.593232295039799808047809426}, - { 0.856147328375194481019630732}, { 0.516731799017649881508753876}, - {-0.516731799017649881508753876}, { 0.856147328375194481019630732}, - { 0.240003022448741486568922365}, { 0.970772140728950302138169611}, - {-0.970772140728950302138169611}, { 0.240003022448741486568922365}, - { 0.940506070593268323787291309}, { 0.339776884406826857828825803}, - {-0.339776884406826857828825803}, { 0.940506070593268323787291309}, - { 0.424779681209108833357226189}, { 0.905296759318118774354048329}, - {-0.905296759318118774354048329}, { 0.424779681209108833357226189}, - { 0.738887324460615147933116508}, { 0.673829000378756060917568372}, - {-0.673829000378756060917568372}, { 0.738887324460615147933116508}, - { 0.046003182130914628814301788}, { 0.998941293186856850633930266}, - {-0.998941293186856850633930266}, { 0.046003182130914628814301788}, - { 0.999618822495178597116830637}, { 0.027608145778965741612354872}, - {-0.027608145778965741612354872}, { 0.999618822495178597116830637}, - { 0.687315340891759108199186948}, { 0.726359155084345976817494315}, - {-0.726359155084345976817494315}, { 0.687315340891759108199186948}, - { 
0.912962190428398164628018233}, { 0.408044162864978680820747499}, - {-0.408044162864978680820747499}, { 0.912962190428398164628018233}, - { 0.357030961233430032614954036}, { 0.934092550404258914729877883}, - {-0.934092550404258914729877883}, { 0.357030961233430032614954036}, - { 0.975025345066994146844913468}, { 0.222093620973203534094094721}, - {-0.222093620973203534094094721}, { 0.975025345066994146844913468}, - { 0.532403127877197971442805218}, { 0.846490938774052078300544488}, - {-0.846490938774052078300544488}, { 0.532403127877197971442805218}, - { 0.815814410806733789010772660}, { 0.578313796411655563342245019}, - {-0.578313796411655563342245019}, { 0.815814410806733789010772660}, - { 0.167938294974731178054745536}, { 0.985797509167567424700995000}, - {-0.985797509167567424700995000}, { 0.167938294974731178054745536}, - { 0.992099313142191757112085445}, { 0.125454983411546238542336453}, - {-0.125454983411546238542336453}, { 0.992099313142191757112085445}, - { 0.612810082429409703935211936}, { 0.790230221437310055030217152}, - {-0.790230221437310055030217152}, { 0.612810082429409703935211936}, - { 0.868570705971340895340449876}, { 0.495565261825772531150266670}, - {-0.495565261825772531150266670}, { 0.868570705971340895340449876}, - { 0.263754678974831383611349322}, { 0.964589793289812723836432159}, - {-0.964589793289812723836432159}, { 0.263754678974831383611349322}, - { 0.948561349915730288158494826}, { 0.316593375556165867243047035}, - {-0.316593375556165867243047035}, { 0.948561349915730288158494826}, - { 0.446868840162374195353044389}, { 0.894599485631382678433072126}, - {-0.894599485631382678433072126}, { 0.446868840162374195353044389}, - { 0.755201376896536527598710756}, { 0.655492852999615385312679701}, - {-0.655492852999615385312679701}, { 0.755201376896536527598710756}, - { 0.070504573389613863027351471}, { 0.997511456140303459699448390}, - {-0.997511456140303459699448390}, { 0.070504573389613863027351471}, - { 0.997060070339482978987989949}, { 
0.076623861392031492278332463}, - {-0.076623861392031492278332463}, { 0.997060070339482978987989949}, - { 0.650846684996380915068975573}, { 0.759209188978388033485525443}, - {-0.759209188978388033485525443}, { 0.650846684996380915068975573}, - { 0.891840709392342727796478697}, { 0.452349587233770874133026703}, - {-0.452349587233770874133026703}, { 0.891840709392342727796478697}, - { 0.310767152749611495835997250}, { 0.950486073949481721759926101}, - {-0.950486073949481721759926101}, { 0.310767152749611495835997250}, - { 0.962953266873683886347921481}, { 0.269668325572915106525464462}, - {-0.269668325572915106525464462}, { 0.962953266873683886347921481}, - { 0.490226483288291154229598449}, { 0.871595086655951034842481435}, - {-0.871595086655951034842481435}, { 0.490226483288291154229598449}, - { 0.786455213599085757522319464}, { 0.617647307937803932403979402}, - {-0.617647307937803932403979402}, { 0.786455213599085757522319464}, - { 0.119365214810991364593637790}, { 0.992850414459865090793563344}, - {-0.992850414459865090793563344}, { 0.119365214810991364593637790}, - { 0.984748501801904218556553176}, { 0.173983873387463827950700807}, - {-0.173983873387463827950700807}, { 0.984748501801904218556553176}, - { 0.573297166698042212820171239}, { 0.819347520076796960824689637}, - {-0.819347520076796960824689637}, { 0.573297166698042212820171239}, - { 0.843208239641845437161743865}, { 0.537587076295645482502214932}, - {-0.537587076295645482502214932}, { 0.843208239641845437161743865}, - { 0.216106797076219509948385131}, { 0.976369731330021149312732194}, - {-0.976369731330021149312732194}, { 0.216106797076219509948385131}, - { 0.931884265581668106718557199}, { 0.362755724367397216204854462}, - {-0.362755724367397216204854462}, { 0.931884265581668106718557199}, - { 0.402434650859418441082533934}, { 0.915448716088267819566431292}, - {-0.915448716088267819566431292}, { 0.402434650859418441082533934}, - { 0.722128193929215321243607198}, { 0.691759258364157774906734132}, - 
{-0.691759258364157774906734132}, { 0.722128193929215321243607198}, - { 0.021474080275469507418374898}, { 0.999769405351215321657617036}, - {-0.999769405351215321657617036}, { 0.021474080275469507418374898}, - { 0.999882347454212525633049627}, { 0.015339206284988101044151868}, - {-0.015339206284988101044151868}, { 0.999882347454212525633049627}, - { 0.696177131491462944788582591}, { 0.717870045055731736211325329}, - {-0.717870045055731736211325329}, { 0.696177131491462944788582591}, - { 0.917900775621390457642276297}, { 0.396809987416710328595290911}, - {-0.396809987416710328595290911}, { 0.917900775621390457642276297}, - { 0.368466829953372331712746222}, { 0.929640895843181265457918066}, - {-0.929640895843181265457918066}, { 0.368466829953372331712746222}, - { 0.977677357824509979943404762}, { 0.210111836880469621717489972}, - {-0.210111836880469621717489972}, { 0.977677357824509979943404762}, - { 0.542750784864515906586768661}, { 0.839893794195999504583383987}, - {-0.839893794195999504583383987}, { 0.542750784864515906586768661}, - { 0.822849781375826332046780034}, { 0.568258952670131549790548489}, - {-0.568258952670131549790548489}, { 0.822849781375826332046780034}, - { 0.180022901405699522679906590}, { 0.983662419211730274396237776}, - {-0.983662419211730274396237776}, { 0.180022901405699522679906590}, - { 0.993564135520595333782021697}, { 0.113270952177564349018228733}, - {-0.113270952177564349018228733}, { 0.993564135520595333782021697}, - { 0.622461279374149972519166721}, { 0.782650596166575738458949301}, - {-0.782650596166575738458949301}, { 0.622461279374149972519166721}, - { 0.874586652278176112634431897}, { 0.484869248000791101822951699}, - {-0.484869248000791101822951699}, { 0.874586652278176112634431897}, - { 0.275571819310958163076425168}, { 0.961280485811320641748659653}, - {-0.961280485811320641748659653}, { 0.275571819310958163076425168}, - { 0.952375012719765858529893608}, { 0.304929229735402406490728633}, - {-0.304929229735402406490728633}, { 
0.952375012719765858529893608}, - { 0.457813303598877221904961155}, { 0.889048355854664562540777729}, - {-0.889048355854664562540777729}, { 0.457813303598877221904961155}, - { 0.763188417263381271704838297}, { 0.646176012983316364832802220}, - {-0.646176012983316364832802220}, { 0.763188417263381271704838297}, - { 0.082740264549375693111987083}, { 0.996571145790554847093566910}, - {-0.996571145790554847093566910}, { 0.082740264549375693111987083}, - { 0.997925286198596012623025462}, { 0.064382630929857460819324537}, - {-0.064382630929857460819324537}, { 0.997925286198596012623025462}, - { 0.660114342067420478559490747}, { 0.751165131909686411205819422}, - {-0.751165131909686411205819422}, { 0.660114342067420478559490747}, - { 0.897324580705418281231391836}, { 0.441371268731716692879988968}, - {-0.441371268731716692879988968}, { 0.897324580705418281231391836}, - { 0.322407678801069848384807478}, { 0.946600913083283570044599823}, - {-0.946600913083283570044599823}, { 0.322407678801069848384807478}, - { 0.966190003445412555433832961}, { 0.257831102162159005614471295}, - {-0.257831102162159005614471295}, { 0.966190003445412555433832961}, - { 0.500885382611240786241285004}, { 0.865513624090569082825488358}, - {-0.865513624090569082825488358}, { 0.500885382611240786241285004}, - { 0.793975477554337164895083757}, { 0.607949784967773667243642671}, - {-0.607949784967773667243642671}, { 0.793975477554337164895083757}, - { 0.131540028702883111103387493}, { 0.991310859846115418957349799}, - {-0.991310859846115418957349799}, { 0.131540028702883111103387493}, - { 0.986809401814185476970235952}, { 0.161886393780111837641387995}, - {-0.161886393780111837641387995}, { 0.986809401814185476970235952}, - { 0.583308652937698294392830961}, { 0.812250586585203913049744181}, - {-0.812250586585203913049744181}, { 0.583308652937698294392830961}, - { 0.849741768000852489471268395}, { 0.527199134781901348464274575}, - {-0.527199134781901348464274575}, { 0.849741768000852489471268395}, - { 
0.228072083170885739254457379}, { 0.973644249650811925318383912}, - {-0.973644249650811925318383912}, { 0.228072083170885739254457379}, - { 0.936265667170278246576310996}, { 0.351292756085567125601307623}, - {-0.351292756085567125601307623}, { 0.936265667170278246576310996}, - { 0.413638312238434547471944324}, { 0.910441292258067196934095369}, - {-0.910441292258067196934095369}, { 0.413638312238434547471944324}, - { 0.730562769227827561177758850}, { 0.682845546385248068164596123}, - {-0.682845546385248068164596123}, { 0.730562769227827561177758850}, - { 0.033741171851377584833716112}, { 0.999430604555461772019008327}, - {-0.999430604555461772019008327}, { 0.033741171851377584833716112}, - { 0.999204758618363895492950001}, { 0.039872927587739811128578738}, - {-0.039872927587739811128578738}, { 0.999204758618363895492950001}, - { 0.678350043129861486873655042}, { 0.734738878095963464563223604}, - {-0.734738878095963464563223604}, { 0.678350043129861486873655042}, - { 0.907886116487666212038681480}, { 0.419216888363223956433010020}, - {-0.419216888363223956433010020}, { 0.907886116487666212038681480}, - { 0.345541324963989065539191723}, { 0.938403534063108112192420774}, - {-0.938403534063108112192420774}, { 0.345541324963989065539191723}, - { 0.972226497078936305708321144}, { 0.234041958583543423191242045}, - {-0.234041958583543423191242045}, { 0.972226497078936305708321144}, - { 0.521975292937154342694258318}, { 0.852960604930363657746588082}, - {-0.852960604930363657746588082}, { 0.521975292937154342694258318}, - { 0.808656181588174991946968128}, { 0.588281548222645304786439813}, - {-0.588281548222645304786439813}, { 0.808656181588174991946968128}, - { 0.155828397654265235743101486}, { 0.987784141644572154230969032}, - {-0.987784141644572154230969032}, { 0.155828397654265235743101486}, - { 0.990485084256457037998682243}, { 0.137620121586486044948441663}, - {-0.137620121586486044948441663}, { 0.990485084256457037998682243}, - { 0.603066598540348201693430617}, { 
0.797690840943391108362662755}, - {-0.797690840943391108362662755}, { 0.603066598540348201693430617}, - { 0.862423956111040538690933878}, { 0.506186645345155291048942344}, - {-0.506186645345155291048942344}, { 0.862423956111040538690933878}, - { 0.251897818154216950498106628}, { 0.967753837093475465243391912}, - {-0.967753837093475465243391912}, { 0.251897818154216950498106628}, - { 0.944604837261480265659265493}, { 0.328209843579092526107916817}, - {-0.328209843579092526107916817}, { 0.944604837261480265659265493}, - { 0.435857079922255491032544080}, { 0.900015892016160228714535267}, - {-0.900015892016160228714535267}, { 0.435857079922255491032544080}, - { 0.747100605980180144323078847}, { 0.664710978203344868130324985}, - {-0.664710978203344868130324985}, { 0.747100605980180144323078847}, - { 0.058258264500435759613979782}, { 0.998301544933892840738782163}, - {-0.998301544933892840738782163}, { 0.058258264500435759613979782}, - { 0.996044700901251989887944810}, { 0.088853552582524596561586535}, - {-0.088853552582524596561586535}, { 0.996044700901251989887944810}, - { 0.641481012808583151988739898}, { 0.767138911935820381181694573}, - {-0.767138911935820381181694573}, { 0.641481012808583151988739898}, - { 0.886222530148880631647990821}, { 0.463259783551860197390719637}, - {-0.463259783551860197390719637}, { 0.886222530148880631647990821}, - { 0.299079826308040476750336973}, { 0.954228095109105629780430732}, - {-0.954228095109105629780430732}, { 0.299079826308040476750336973}, - { 0.959571513081984528335528181}, { 0.281464937925757984095231007}, - {-0.281464937925757984095231007}, { 0.959571513081984528335528181}, - { 0.479493757660153026679839798}, { 0.877545290207261291668470750}, - {-0.877545290207261291668470750}, { 0.479493757660153026679839798}, - { 0.778816512381475953374724325}, { 0.627251815495144113509622565}, - {-0.627251815495144113509622565}, { 0.778816512381475953374724325}, - { 0.107172424956808849175529148}, { 0.994240449453187946358413442}, - 
{-0.994240449453187946358413442}, { 0.107172424956808849175529148}, - { 0.982539302287441255907040396}, { 0.186055151663446648105438304}, - {-0.186055151663446648105438304}, { 0.982539302287441255907040396}, - { 0.563199344013834115007363772}, { 0.826321062845663480311195452}, - {-0.826321062845663480311195452}, { 0.563199344013834115007363772}, - { 0.836547727223511984524285790}, { 0.547894059173100165608820571}, - {-0.547894059173100165608820571}, { 0.836547727223511984524285790}, - { 0.204108966092816874181696950}, { 0.978948175319062194715480124}, - {-0.978948175319062194715480124}, { 0.204108966092816874181696950}, - { 0.927362525650401087274536959}, { 0.374164062971457997104393020}, - {-0.374164062971457997104393020}, { 0.927362525650401087274536959}, - { 0.391170384302253888687512949}, { 0.920318276709110566440076541}, - {-0.920318276709110566440076541}, { 0.391170384302253888687512949}, - { 0.713584868780793592903125099}, { 0.700568793943248366792866380}, - {-0.700568793943248366792866380}, { 0.713584868780793592903125099}, - { 0.009203754782059819315102378}, { 0.999957644551963866333120920}, - {-0.999957644551963866333120920}, { 0.009203754782059819315102378}, - { 0.999957644551963866333120920}, { 0.009203754782059819315102378}, - {-0.009203754782059819315102378}, { 0.999957644551963866333120920}, - { 0.700568793943248366792866380}, { 0.713584868780793592903125099}, - {-0.713584868780793592903125099}, { 0.700568793943248366792866380}, - { 0.920318276709110566440076541}, { 0.391170384302253888687512949}, - {-0.391170384302253888687512949}, { 0.920318276709110566440076541}, - { 0.374164062971457997104393020}, { 0.927362525650401087274536959}, - {-0.927362525650401087274536959}, { 0.374164062971457997104393020}, - { 0.978948175319062194715480124}, { 0.204108966092816874181696950}, - {-0.204108966092816874181696950}, { 0.978948175319062194715480124}, - { 0.547894059173100165608820571}, { 0.836547727223511984524285790}, - {-0.836547727223511984524285790}, { 
0.547894059173100165608820571}, - { 0.826321062845663480311195452}, { 0.563199344013834115007363772}, - {-0.563199344013834115007363772}, { 0.826321062845663480311195452}, - { 0.186055151663446648105438304}, { 0.982539302287441255907040396}, - {-0.982539302287441255907040396}, { 0.186055151663446648105438304}, - { 0.994240449453187946358413442}, { 0.107172424956808849175529148}, - {-0.107172424956808849175529148}, { 0.994240449453187946358413442}, - { 0.627251815495144113509622565}, { 0.778816512381475953374724325}, - {-0.778816512381475953374724325}, { 0.627251815495144113509622565}, - { 0.877545290207261291668470750}, { 0.479493757660153026679839798}, - {-0.479493757660153026679839798}, { 0.877545290207261291668470750}, - { 0.281464937925757984095231007}, { 0.959571513081984528335528181}, - {-0.959571513081984528335528181}, { 0.281464937925757984095231007}, - { 0.954228095109105629780430732}, { 0.299079826308040476750336973}, - {-0.299079826308040476750336973}, { 0.954228095109105629780430732}, - { 0.463259783551860197390719637}, { 0.886222530148880631647990821}, - {-0.886222530148880631647990821}, { 0.463259783551860197390719637}, - { 0.767138911935820381181694573}, { 0.641481012808583151988739898}, - {-0.641481012808583151988739898}, { 0.767138911935820381181694573}, - { 0.088853552582524596561586535}, { 0.996044700901251989887944810}, - {-0.996044700901251989887944810}, { 0.088853552582524596561586535}, - { 0.998301544933892840738782163}, { 0.058258264500435759613979782}, - {-0.058258264500435759613979782}, { 0.998301544933892840738782163}, - { 0.664710978203344868130324985}, { 0.747100605980180144323078847}, - {-0.747100605980180144323078847}, { 0.664710978203344868130324985}, - { 0.900015892016160228714535267}, { 0.435857079922255491032544080}, - {-0.435857079922255491032544080}, { 0.900015892016160228714535267}, - { 0.328209843579092526107916817}, { 0.944604837261480265659265493}, - {-0.944604837261480265659265493}, { 0.328209843579092526107916817}, - { 
0.967753837093475465243391912}, { 0.251897818154216950498106628}, - {-0.251897818154216950498106628}, { 0.967753837093475465243391912}, - { 0.506186645345155291048942344}, { 0.862423956111040538690933878}, - {-0.862423956111040538690933878}, { 0.506186645345155291048942344}, - { 0.797690840943391108362662755}, { 0.603066598540348201693430617}, - {-0.603066598540348201693430617}, { 0.797690840943391108362662755}, - { 0.137620121586486044948441663}, { 0.990485084256457037998682243}, - {-0.990485084256457037998682243}, { 0.137620121586486044948441663}, - { 0.987784141644572154230969032}, { 0.155828397654265235743101486}, - {-0.155828397654265235743101486}, { 0.987784141644572154230969032}, - { 0.588281548222645304786439813}, { 0.808656181588174991946968128}, - {-0.808656181588174991946968128}, { 0.588281548222645304786439813}, - { 0.852960604930363657746588082}, { 0.521975292937154342694258318}, - {-0.521975292937154342694258318}, { 0.852960604930363657746588082}, - { 0.234041958583543423191242045}, { 0.972226497078936305708321144}, - {-0.972226497078936305708321144}, { 0.234041958583543423191242045}, - { 0.938403534063108112192420774}, { 0.345541324963989065539191723}, - {-0.345541324963989065539191723}, { 0.938403534063108112192420774}, - { 0.419216888363223956433010020}, { 0.907886116487666212038681480}, - {-0.907886116487666212038681480}, { 0.419216888363223956433010020}, - { 0.734738878095963464563223604}, { 0.678350043129861486873655042}, - {-0.678350043129861486873655042}, { 0.734738878095963464563223604}, - { 0.039872927587739811128578738}, { 0.999204758618363895492950001}, - {-0.999204758618363895492950001}, { 0.039872927587739811128578738}, - { 0.999430604555461772019008327}, { 0.033741171851377584833716112}, - {-0.033741171851377584833716112}, { 0.999430604555461772019008327}, - { 0.682845546385248068164596123}, { 0.730562769227827561177758850}, - {-0.730562769227827561177758850}, { 0.682845546385248068164596123}, - { 0.910441292258067196934095369}, { 
0.413638312238434547471944324}, - {-0.413638312238434547471944324}, { 0.910441292258067196934095369}, - { 0.351292756085567125601307623}, { 0.936265667170278246576310996}, - {-0.936265667170278246576310996}, { 0.351292756085567125601307623}, - { 0.973644249650811925318383912}, { 0.228072083170885739254457379}, - {-0.228072083170885739254457379}, { 0.973644249650811925318383912}, - { 0.527199134781901348464274575}, { 0.849741768000852489471268395}, - {-0.849741768000852489471268395}, { 0.527199134781901348464274575}, - { 0.812250586585203913049744181}, { 0.583308652937698294392830961}, - {-0.583308652937698294392830961}, { 0.812250586585203913049744181}, - { 0.161886393780111837641387995}, { 0.986809401814185476970235952}, - {-0.986809401814185476970235952}, { 0.161886393780111837641387995}, - { 0.991310859846115418957349799}, { 0.131540028702883111103387493}, - {-0.131540028702883111103387493}, { 0.991310859846115418957349799}, - { 0.607949784967773667243642671}, { 0.793975477554337164895083757}, - {-0.793975477554337164895083757}, { 0.607949784967773667243642671}, - { 0.865513624090569082825488358}, { 0.500885382611240786241285004}, - {-0.500885382611240786241285004}, { 0.865513624090569082825488358}, - { 0.257831102162159005614471295}, { 0.966190003445412555433832961}, - {-0.966190003445412555433832961}, { 0.257831102162159005614471295}, - { 0.946600913083283570044599823}, { 0.322407678801069848384807478}, - {-0.322407678801069848384807478}, { 0.946600913083283570044599823}, - { 0.441371268731716692879988968}, { 0.897324580705418281231391836}, - {-0.897324580705418281231391836}, { 0.441371268731716692879988968}, - { 0.751165131909686411205819422}, { 0.660114342067420478559490747}, - {-0.660114342067420478559490747}, { 0.751165131909686411205819422}, - { 0.064382630929857460819324537}, { 0.997925286198596012623025462}, - {-0.997925286198596012623025462}, { 0.064382630929857460819324537}, - { 0.996571145790554847093566910}, { 0.082740264549375693111987083}, - 
{-0.082740264549375693111987083}, { 0.996571145790554847093566910}, - { 0.646176012983316364832802220}, { 0.763188417263381271704838297}, - {-0.763188417263381271704838297}, { 0.646176012983316364832802220}, - { 0.889048355854664562540777729}, { 0.457813303598877221904961155}, - {-0.457813303598877221904961155}, { 0.889048355854664562540777729}, - { 0.304929229735402406490728633}, { 0.952375012719765858529893608}, - {-0.952375012719765858529893608}, { 0.304929229735402406490728633}, - { 0.961280485811320641748659653}, { 0.275571819310958163076425168}, - {-0.275571819310958163076425168}, { 0.961280485811320641748659653}, - { 0.484869248000791101822951699}, { 0.874586652278176112634431897}, - {-0.874586652278176112634431897}, { 0.484869248000791101822951699}, - { 0.782650596166575738458949301}, { 0.622461279374149972519166721}, - {-0.622461279374149972519166721}, { 0.782650596166575738458949301}, - { 0.113270952177564349018228733}, { 0.993564135520595333782021697}, - {-0.993564135520595333782021697}, { 0.113270952177564349018228733}, - { 0.983662419211730274396237776}, { 0.180022901405699522679906590}, - {-0.180022901405699522679906590}, { 0.983662419211730274396237776}, - { 0.568258952670131549790548489}, { 0.822849781375826332046780034}, - {-0.822849781375826332046780034}, { 0.568258952670131549790548489}, - { 0.839893794195999504583383987}, { 0.542750784864515906586768661}, - {-0.542750784864515906586768661}, { 0.839893794195999504583383987}, - { 0.210111836880469621717489972}, { 0.977677357824509979943404762}, - {-0.977677357824509979943404762}, { 0.210111836880469621717489972}, - { 0.929640895843181265457918066}, { 0.368466829953372331712746222}, - {-0.368466829953372331712746222}, { 0.929640895843181265457918066}, - { 0.396809987416710328595290911}, { 0.917900775621390457642276297}, - {-0.917900775621390457642276297}, { 0.396809987416710328595290911}, - { 0.717870045055731736211325329}, { 0.696177131491462944788582591}, - {-0.696177131491462944788582591}, { 
0.717870045055731736211325329}, - { 0.015339206284988101044151868}, { 0.999882347454212525633049627}, - {-0.999882347454212525633049627}, { 0.015339206284988101044151868}, - { 0.999769405351215321657617036}, { 0.021474080275469507418374898}, - {-0.021474080275469507418374898}, { 0.999769405351215321657617036}, - { 0.691759258364157774906734132}, { 0.722128193929215321243607198}, - {-0.722128193929215321243607198}, { 0.691759258364157774906734132}, - { 0.915448716088267819566431292}, { 0.402434650859418441082533934}, - {-0.402434650859418441082533934}, { 0.915448716088267819566431292}, - { 0.362755724367397216204854462}, { 0.931884265581668106718557199}, - {-0.931884265581668106718557199}, { 0.362755724367397216204854462}, - { 0.976369731330021149312732194}, { 0.216106797076219509948385131}, - {-0.216106797076219509948385131}, { 0.976369731330021149312732194}, - { 0.537587076295645482502214932}, { 0.843208239641845437161743865}, - {-0.843208239641845437161743865}, { 0.537587076295645482502214932}, - { 0.819347520076796960824689637}, { 0.573297166698042212820171239}, - {-0.573297166698042212820171239}, { 0.819347520076796960824689637}, - { 0.173983873387463827950700807}, { 0.984748501801904218556553176}, - {-0.984748501801904218556553176}, { 0.173983873387463827950700807}, - { 0.992850414459865090793563344}, { 0.119365214810991364593637790}, - {-0.119365214810991364593637790}, { 0.992850414459865090793563344}, - { 0.617647307937803932403979402}, { 0.786455213599085757522319464}, - {-0.786455213599085757522319464}, { 0.617647307937803932403979402}, - { 0.871595086655951034842481435}, { 0.490226483288291154229598449}, - {-0.490226483288291154229598449}, { 0.871595086655951034842481435}, - { 0.269668325572915106525464462}, { 0.962953266873683886347921481}, - {-0.962953266873683886347921481}, { 0.269668325572915106525464462}, - { 0.950486073949481721759926101}, { 0.310767152749611495835997250}, - {-0.310767152749611495835997250}, { 0.950486073949481721759926101}, - { 
0.452349587233770874133026703}, { 0.891840709392342727796478697}, - {-0.891840709392342727796478697}, { 0.452349587233770874133026703}, - { 0.759209188978388033485525443}, { 0.650846684996380915068975573}, - {-0.650846684996380915068975573}, { 0.759209188978388033485525443}, - { 0.076623861392031492278332463}, { 0.997060070339482978987989949}, - {-0.997060070339482978987989949}, { 0.076623861392031492278332463}, - { 0.997511456140303459699448390}, { 0.070504573389613863027351471}, - {-0.070504573389613863027351471}, { 0.997511456140303459699448390}, - { 0.655492852999615385312679701}, { 0.755201376896536527598710756}, - {-0.755201376896536527598710756}, { 0.655492852999615385312679701}, - { 0.894599485631382678433072126}, { 0.446868840162374195353044389}, - {-0.446868840162374195353044389}, { 0.894599485631382678433072126}, - { 0.316593375556165867243047035}, { 0.948561349915730288158494826}, - {-0.948561349915730288158494826}, { 0.316593375556165867243047035}, - { 0.964589793289812723836432159}, { 0.263754678974831383611349322}, - {-0.263754678974831383611349322}, { 0.964589793289812723836432159}, - { 0.495565261825772531150266670}, { 0.868570705971340895340449876}, - {-0.868570705971340895340449876}, { 0.495565261825772531150266670}, - { 0.790230221437310055030217152}, { 0.612810082429409703935211936}, - {-0.612810082429409703935211936}, { 0.790230221437310055030217152}, - { 0.125454983411546238542336453}, { 0.992099313142191757112085445}, - {-0.992099313142191757112085445}, { 0.125454983411546238542336453}, - { 0.985797509167567424700995000}, { 0.167938294974731178054745536}, - {-0.167938294974731178054745536}, { 0.985797509167567424700995000}, - { 0.578313796411655563342245019}, { 0.815814410806733789010772660}, - {-0.815814410806733789010772660}, { 0.578313796411655563342245019}, - { 0.846490938774052078300544488}, { 0.532403127877197971442805218}, - {-0.532403127877197971442805218}, { 0.846490938774052078300544488}, - { 0.222093620973203534094094721}, { 
0.975025345066994146844913468}, - {-0.975025345066994146844913468}, { 0.222093620973203534094094721}, - { 0.934092550404258914729877883}, { 0.357030961233430032614954036}, - {-0.357030961233430032614954036}, { 0.934092550404258914729877883}, - { 0.408044162864978680820747499}, { 0.912962190428398164628018233}, - {-0.912962190428398164628018233}, { 0.408044162864978680820747499}, - { 0.726359155084345976817494315}, { 0.687315340891759108199186948}, - {-0.687315340891759108199186948}, { 0.726359155084345976817494315}, - { 0.027608145778965741612354872}, { 0.999618822495178597116830637}, - {-0.999618822495178597116830637}, { 0.027608145778965741612354872}, - { 0.998941293186856850633930266}, { 0.046003182130914628814301788}, - {-0.046003182130914628814301788}, { 0.998941293186856850633930266}, - { 0.673829000378756060917568372}, { 0.738887324460615147933116508}, - {-0.738887324460615147933116508}, { 0.673829000378756060917568372}, - { 0.905296759318118774354048329}, { 0.424779681209108833357226189}, - {-0.424779681209108833357226189}, { 0.905296759318118774354048329}, - { 0.339776884406826857828825803}, { 0.940506070593268323787291309}, - {-0.940506070593268323787291309}, { 0.339776884406826857828825803}, - { 0.970772140728950302138169611}, { 0.240003022448741486568922365}, - {-0.240003022448741486568922365}, { 0.970772140728950302138169611}, - { 0.516731799017649881508753876}, { 0.856147328375194481019630732}, - {-0.856147328375194481019630732}, { 0.516731799017649881508753876}, - { 0.805031331142963597922659282}, { 0.593232295039799808047809426}, - {-0.593232295039799808047809426}, { 0.805031331142963597922659282}, - { 0.149764534677321517229695737}, { 0.988721691960323767604516485}, - {-0.988721691960323767604516485}, { 0.149764534677321517229695737}, - { 0.989622017463200834623694454}, { 0.143695033150294454819773349}, - {-0.143695033150294454819773349}, { 0.989622017463200834623694454}, - { 0.598160706996342311724958652}, { 0.801376171723140219430247777}, - 
{-0.801376171723140219430247777}, { 0.598160706996342311724958652}, - { 0.859301818357008404783582139}, { 0.511468850437970399504391001}, - {-0.511468850437970399504391001}, { 0.859301818357008404783582139}, - { 0.245955050335794611599924709}, { 0.969281235356548486048290738}, - {-0.969281235356548486048290738}, { 0.245955050335794611599924709}, - { 0.942573197601446879280758735}, { 0.333999651442009404650865481}, - {-0.333999651442009404650865481}, { 0.942573197601446879280758735}, - { 0.430326481340082633908199031}, { 0.902673318237258806751502391}, - {-0.902673318237258806751502391}, { 0.430326481340082633908199031}, - { 0.743007952135121693517362293}, { 0.669282588346636065720696366}, - {-0.669282588346636065720696366}, { 0.743007952135121693517362293}, - { 0.052131704680283321236358216}, { 0.998640218180265222418199049}, - {-0.998640218180265222418199049}, { 0.052131704680283321236358216}, - { 0.995480755491926941769171600}, { 0.094963495329638998938034312}, - {-0.094963495329638998938034312}, { 0.995480755491926941769171600}, - { 0.636761861236284230413943435}, { 0.771060524261813773200605759}, - {-0.771060524261813773200605759}, { 0.636761861236284230413943435}, - { 0.883363338665731594736308015}, { 0.468688822035827933697617870}, - {-0.468688822035827933697617870}, { 0.883363338665731594736308015}, - { 0.293219162694258650606608599}, { 0.956045251349996443270479823}, - {-0.956045251349996443270479823}, { 0.293219162694258650606608599}, - { 0.957826413027532890321037029}, { 0.287347459544729526477331841}, - {-0.287347459544729526477331841}, { 0.957826413027532890321037029}, - { 0.474100214650550014398580015}, { 0.880470889052160770806542929}, - {-0.880470889052160770806542929}, { 0.474100214650550014398580015}, - { 0.774953106594873878359129282}, { 0.632018735939809021909403706}, - {-0.632018735939809021909403706}, { 0.774953106594873878359129282}, - { 0.101069862754827824987887585}, { 0.994879330794805620591166107}, - {-0.994879330794805620591166107}, { 
0.101069862754827824987887585}, - { 0.981379193313754574318224190}, { 0.192080397049892441679288205}, - {-0.192080397049892441679288205}, { 0.981379193313754574318224190}, - { 0.558118531220556115693702964}, { 0.829761233794523042469023765}, - {-0.829761233794523042469023765}, { 0.558118531220556115693702964}, - { 0.833170164701913186439915922}, { 0.553016705580027531764226988}, - {-0.553016705580027531764226988}, { 0.833170164701913186439915922}, - { 0.198098410717953586179324918}, { 0.980182135968117392690210009}, - {-0.980182135968117392690210009}, { 0.198098410717953586179324918}, - { 0.925049240782677590302371869}, { 0.379847208924051170576281147}, - {-0.379847208924051170576281147}, { 0.925049240782677590302371869}, - { 0.385516053843918864075607949}, { 0.922701128333878570437264227}, - {-0.922701128333878570437264227}, { 0.385516053843918864075607949}, - { 0.709272826438865651316533772}, { 0.704934080375904908852523758}, - {-0.704934080375904908852523758}, { 0.709272826438865651316533772}, - { 0.003067956762965976270145365}, { 0.999995293809576171511580126}, - {-0.999995293809576171511580126}, { 0.003067956762965976270145365} -}; - -const fpr fpr_p2_tab[] = { - { 2.00000000000 }, - { 1.00000000000 }, - { 0.50000000000 }, - { 0.25000000000 }, - { 0.12500000000 }, - { 0.06250000000 }, - { 0.03125000000 }, - { 0.01562500000 }, - { 0.00781250000 }, - { 0.00390625000 }, - { 0.00195312500 } -}; - -#else // yyyFPNATIVE+0 yyyFPEMU+0 - -#error No FP implementation selected - -#endif // yyyFPNATIVE- yyyFPEMU- diff --git a/crypto_sign/falcon-512-tree/m4-ct/fpr.h b/crypto_sign/falcon-512-tree/m4-ct/fpr.h deleted file mode 100644 index 8176212d..00000000 --- a/crypto_sign/falcon-512-tree/m4-ct/fpr.h +++ /dev/null @@ -1,893 +0,0 @@ -/* - * Floating-point operations. 
- * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#if FALCON_FPEMU // yyyFPEMU+1 yyyFPNATIVE+0 - -/* ====================================================================== */ -/* - * Custom floating-point implementation with integer arithmetics. We - * use IEEE-754 "binary64" format, with some simplifications: - * - * - Top bit is s = 1 for negative, 0 for positive. - * - * - Exponent e uses the next 11 bits (bits 52 to 62, inclusive). - * - * - Mantissa m uses the 52 low bits. - * - * Encoded value is, in general: (-1)^s * 2^(e-1023) * (1 + m*2^(-52)) - * i.e. 
the mantissa really is a 53-bit number (less than 2.0, but not - * less than 1.0), but the top bit (equal to 1 by definition) is omitted - * in the encoding. - * - * In IEEE-754, there are some special values: - * - * - If e = 2047, then the value is either an infinite (m = 0) or - * a NaN (m != 0). - * - * - If e = 0, then the value is either a zero (m = 0) or a subnormal, - * aka "denormalized number" (m != 0). - * - * Of these, we only need the zeros. The caller is responsible for not - * providing operands that would lead to infinites, NaNs or subnormals. - * If inputs are such that values go out of range, then indeterminate - * values are returned (it would still be deterministic, but no specific - * value may be relied upon). - * - * At the C level, the three parts are stored in a 64-bit unsigned - * word. - * - * One may note that a property of the IEEE-754 format is that order - * is preserved for positive values: if two positive floating-point - * values x and y are such that x < y, then their respective encodings - * as _signed_ 64-bit integers i64(x) and i64(y) will be such that - * i64(x) < i64(y). For negative values, order is reversed: if x < 0, - * y < 0, and x < y, then ia64(x) > ia64(y). - * - * IMPORTANT ASSUMPTIONS: - * ====================== - * - * For proper computations, and constant-time behaviour, we assume the - * following: - * - * - 32x32->64 multiplication (unsigned) has an execution time that - * is independent of its operands. This is true of most modern - * x86 and ARM cores. Notable exceptions are the ARM Cortex M0, M0+ - * and M3 (in the M0 and M0+, this is done in software, so it depends - * on that routine), and the PowerPC cores from the G3/G4 lines. - * For more info, see: https://www.bearssl.org/ctmul.html - * - * - Left-shifts and right-shifts of 32-bit values have an execution - * time which does not depend on the shifted value nor on the - * shift count. 
An historical exception is the Pentium IV, but most - * modern CPU have barrel shifters. Some small microcontrollers - * might have varying-time shifts (not the ARM Cortex M*, though). - * - * - Right-shift of a signed negative value performs a sign extension. - * As per the C standard, this operation returns an - * implementation-defined result (this is NOT an "undefined - * behaviour"). On most/all systems, an arithmetic shift is - * performed, because this is what makes most sense. - */ - -/* - * Normally we should declare the 'fpr' type to be a struct or union - * around the internal 64-bit value; however, we want to use the - * direct 64-bit integer type to enable a lighter call convention on - * ARM platforms. This means that direct (invalid) use of operators - * such as '*' or '+' will not be caught by the compiler. We rely on - * the "normal" (non-emulated) code to detect such instances. - */ -typedef uint64_t fpr; - -/* - * For computations, we split values into an integral mantissa in the - * 2^54..2^55 range, and an (adjusted) exponent. The lowest bit is - * "sticky" (it is set to 1 if any of the bits below it is 1); when - * re-encoding, the low two bits are dropped, but may induce an - * increment in the value for proper rounding. - */ - -/* - * Right-shift a 64-bit unsigned value by a possibly secret shift count. - * We assumed that the underlying architecture had a barrel shifter for - * 32-bit shifts, but for 64-bit shifts on a 32-bit system, this will - * typically invoke a software routine that is not necessarily - * constant-time; hence the function below. - * - * Shift count n MUST be in the 0..63 range. - */ -static inline uint64_t -fpr_ursh(uint64_t x, int n) -{ - x ^= (x ^ (x >> 32)) & -(uint64_t)(n >> 5); - return x >> (n & 31); -} - -/* - * Right-shift a 64-bit signed value by a possibly secret shift count - * (see fpr_ursh() for the rationale). - * - * Shift count n MUST be in the 0..63 range. 
- */ -static inline int64_t -fpr_irsh(int64_t x, int n) -{ - x ^= (x ^ (x >> 32)) & -(int64_t)(n >> 5); - return x >> (n & 31); -} - -/* - * Left-shift a 64-bit unsigned value by a possibly secret shift count - * (see fpr_ursh() for the rationale). - * - * Shift count n MUST be in the 0..63 range. - */ -static inline uint64_t -fpr_ulsh(uint64_t x, int n) -{ - x ^= (x ^ (x << 32)) & -(uint64_t)(n >> 5); - return x << (n & 31); -} - -/* - * Expectations: - * s = 0 or 1 - * exponent e is "arbitrary" and unbiased - * 2^54 <= m < 2^55 - * Numerical value is (-1)^2 * m * 2^e - * - * Exponents which are too low lead to value zero. If the exponent is - * too large, the returned value is indeterminate. - * - * If m = 0, then a zero is returned (using the provided sign). - * If e < -1076, then a zero is returned (regardless of the value of m). - * If e >= -1076 and e != 0, m must be within the expected range - * (2^54 to 2^55-1). - */ -static inline fpr -FPR(int s, int e, uint64_t m) -{ - fpr x; - uint32_t t; - unsigned f; - - /* - * If e >= -1076, then the value is "normal"; otherwise, it - * should be a subnormal, which we clamp down to zero. - */ - e += 1076; - t = (uint32_t)e >> 31; - m &= (uint64_t)t - 1; - - /* - * If m = 0 then we want a zero; make e = 0 too, but conserve - * the sign. - */ - t = (uint32_t)(m >> 54); - e &= -(int)t; - - /* - * The 52 mantissa bits come from m. Value m has its top bit set - * (unless it is a zero); we leave it "as is": the top bit will - * increment the exponent by 1, except when m = 0, which is - * exactly what we want. - */ - x = (((uint64_t)s << 63) | (m >> 2)) + ((uint64_t)(uint32_t)e << 52); - - /* - * Rounding: if the low three bits of m are 011, 110 or 111, - * then the value should be incremented to get the next - * representable value. This implements the usual - * round-to-nearest rule (with preference to even values in case - * of a tie). 
Note that the increment may make a carry spill - * into the exponent field, which is again exactly what we want - * in that case. - */ - f = (unsigned)m & 7U; - x += (0xC8U >> f) & 1; - return x; -} - -#define fpr_scaled Zf(fpr_scaled) -fpr fpr_scaled(int64_t i, int sc); - -static inline fpr -fpr_of(int64_t i) -{ - return fpr_scaled(i, 0); -} - -static const fpr fpr_q = 4667981563525332992; -static const fpr fpr_inverse_of_q = 4545632735260551042; -static const fpr fpr_inv_2sqrsigma0 = 4594603506513722306; -static const fpr fpr_inv_sigma = 4573359825155195350; -static const fpr fpr_sigma_min_9 = 4608495221497168882; -static const fpr fpr_sigma_min_10 = 4608586345619182117; -static const fpr fpr_log2 = 4604418534313441775; -static const fpr fpr_inv_log2 = 4609176140021203710; -static const fpr fpr_bnorm_max = 4670353323383631276; -static const fpr fpr_zero = 0; -static const fpr fpr_one = 4607182418800017408; -static const fpr fpr_two = 4611686018427387904; -static const fpr fpr_onehalf = 4602678819172646912; -static const fpr fpr_invsqrt2 = 4604544271217802189; -static const fpr fpr_invsqrt8 = 4600040671590431693; -static const fpr fpr_ptwo31 = 4746794007248502784; -static const fpr fpr_ptwo31m1 = 4746794007244308480; -static const fpr fpr_mtwo31m1 = 13970166044099084288U; -static const fpr fpr_ptwo63m1 = 4890909195324358656; -static const fpr fpr_mtwo63m1 = 14114281232179134464U; -static const fpr fpr_ptwo63 = 4890909195324358656; - -static inline int64_t -fpr_rint(fpr x) -{ - uint64_t m, d; - int e; - uint32_t s, dd, f; - - /* - * We assume that the value fits in -(2^63-1)..+(2^63-1). We can - * thus extract the mantissa as a 63-bit integer, then right-shift - * it as needed. - */ - m = ((x << 10) | ((uint64_t)1 << 62)) & (((uint64_t)1 << 63) - 1); - e = 1085 - ((int)(x >> 52) & 0x7FF); - - /* - * If a shift of more than 63 bits is needed, then simply set m - * to zero. This also covers the case of an input operand equal - * to zero. 
- */ - m &= -(uint64_t)((uint32_t)(e - 64) >> 31); - e &= 63; - - /* - * Right-shift m as needed. Shift count is e. Proper rounding - * mandates that: - * - If the highest dropped bit is zero, then round low. - * - If the highest dropped bit is one, and at least one of the - * other dropped bits is one, then round up. - * - If the highest dropped bit is one, and all other dropped - * bits are zero, then round up if the lowest kept bit is 1, - * or low otherwise (i.e. ties are broken by "rounding to even"). - * - * We thus first extract a word consisting of all the dropped bit - * AND the lowest kept bit; then we shrink it down to three bits, - * the lowest being "sticky". - */ - d = fpr_ulsh(m, 63 - e); - dd = (uint32_t)d | ((uint32_t)(d >> 32) & 0x1FFFFFFF); - f = (uint32_t)(d >> 61) | ((dd | -dd) >> 31); - m = fpr_ursh(m, e) + (uint64_t)((0xC8U >> f) & 1U); - - /* - * Apply the sign bit. - */ - s = (uint32_t)(x >> 63); - return ((int64_t)m ^ -(int64_t)s) + (int64_t)s; -} - -static inline int64_t -fpr_floor(fpr x) -{ - uint64_t t; - int64_t xi; - int e, cc; - - /* - * We extract the integer as a _signed_ 64-bit integer with - * a scaling factor. Since we assume that the value fits - * in the -(2^63-1)..+(2^63-1) range, we can left-shift the - * absolute value to make it in the 2^62..2^63-1 range: we - * will only need a right-shift afterwards. - */ - e = (int)(x >> 52) & 0x7FF; - t = x >> 63; - xi = (int64_t)(((x << 10) | ((uint64_t)1 << 62)) - & (((uint64_t)1 << 63) - 1)); - xi = (xi ^ -(int64_t)t) + (int64_t)t; - cc = 1085 - e; - - /* - * We perform an arithmetic right-shift on the value. This - * applies floor() semantics on both positive and negative values - * (rounding toward minus infinity). - */ - xi = fpr_irsh(xi, cc & 63); - - /* - * If the true shift count was 64 or more, then we should instead - * replace xi with 0 (if nonnegative) or -1 (if negative). 
Edge - * case: -0 will be floored to -1, not 0 (whether this is correct - * is debatable; in any case, the other functions normalize zero - * to +0). - * - * For an input of zero, the non-shifted xi was incorrect (we used - * a top implicit bit of value 1, not 0), but this does not matter - * since this operation will clamp it down. - */ - xi ^= (xi ^ -(int64_t)t) & -(int64_t)((uint32_t)(63 - cc) >> 31); - return xi; -} - -static inline int64_t -fpr_trunc(fpr x) -{ - uint64_t t, xu; - int e, cc; - - /* - * Extract the absolute value. Since we assume that the value - * fits in the -(2^63-1)..+(2^63-1) range, we can left-shift - * the absolute value into the 2^62..2^63-1 range, and then - * do a right shift afterwards. - */ - e = (int)(x >> 52) & 0x7FF; - xu = ((x << 10) | ((uint64_t)1 << 62)) & (((uint64_t)1 << 63) - 1); - cc = 1085 - e; - xu = fpr_ursh(xu, cc & 63); - - /* - * If the exponent is too low (cc > 63), then the shift was wrong - * and we must clamp the value to 0. This also covers the case - * of an input equal to zero. - */ - xu &= -(uint64_t)((uint32_t)(cc - 64) >> 31); - - /* - * Apply back the sign, if the source value is negative. - */ - t = x >> 63; - xu = (xu ^ -t) + t; - return *(int64_t *)&xu; -} - -#define fpr_add Zf(fpr_add) -fpr fpr_add(fpr x, fpr y); - -static inline fpr -fpr_sub(fpr x, fpr y) -{ - y ^= (uint64_t)1 << 63; - return fpr_add(x, y); -} - -static inline fpr -fpr_neg(fpr x) -{ - x ^= (uint64_t)1 << 63; - return x; -} - -static inline fpr -fpr_half(fpr x) -{ - /* - * To divide a value by 2, we just have to subtract 1 from its - * exponent, but we have to take care of zero. - */ - uint32_t t; - - x -= (uint64_t)1 << 52; - t = (((uint32_t)(x >> 52) & 0x7FF) + 1) >> 11; - x &= (uint64_t)t - 1; - return x; -} - -static inline fpr -fpr_double(fpr x) -{ - /* - * To double a value, we just increment by one the exponent. We - * don't care about infinites or NaNs; however, 0 is a - * special case. 
- */ - x += (uint64_t)((((unsigned)(x >> 52) & 0x7FFU) + 0x7FFU) >> 11) << 52; - return x; -} - -#define fpr_mul Zf(fpr_mul) -fpr fpr_mul(fpr x, fpr y); - -static inline fpr -fpr_sqr(fpr x) -{ - return fpr_mul(x, x); -} - -#define fpr_div Zf(fpr_div) -fpr fpr_div(fpr x, fpr y); - -static inline fpr -fpr_inv(fpr x) -{ - return fpr_div(4607182418800017408u, x); -} - -#define fpr_sqrt Zf(fpr_sqrt) -fpr fpr_sqrt(fpr x); - -static inline int -fpr_lt(fpr x, fpr y) -{ - /* - * If x >= 0 or y >= 0, a signed comparison yields the proper - * result: - * - For positive values, the order is preserved. - * - The sign bit is at the same place as in integers, so - * sign is preserved. - * - * If both x and y are negative, then the order is reversed. - * We cannot simply invert the comparison result in that case - * because it would not handle the edge case x = y properly. - */ - int cc0, cc1; - - cc0 = *(int64_t *)&x < *(int64_t *)&y; - cc1 = *(int64_t *)&x > *(int64_t *)&y; - return cc0 ^ ((cc0 ^ cc1) & (int)((x & y) >> 63)); -} - -/* - * Compute exp(x) for x such that |x| <= ln 2. We want a precision of 50 - * bits or so. - */ -#define fpr_expm_p63 Zf(fpr_expm_p63) -uint64_t fpr_expm_p63(fpr x, fpr ccs); - -#define fpr_gm_tab Zf(fpr_gm_tab) -extern const fpr fpr_gm_tab[]; - -#define fpr_p2_tab Zf(fpr_p2_tab) -extern const fpr fpr_p2_tab[]; - -/* ====================================================================== */ - -#elif FALCON_FPNATIVE // yyyFPEMU+0 yyyFPNATIVE+1 - -/* ====================================================================== */ - -#include - -/* - * We wrap the native 'double' type into a structure so that the C compiler - * complains if we inadvertently use raw arithmetic operators on the 'fpr' - * type instead of using the inline functions below. This should have no - * extra runtime cost, since all the functions below are 'inline'. 
- */ -typedef struct { double v; } fpr; - -static inline fpr -FPR(double v) -{ - fpr x; - - x.v = v; - return x; -} - -static inline fpr -fpr_of(int64_t i) -{ - return FPR((double)i); -} - -static const fpr fpr_q = { 12289.0 }; -static const fpr fpr_inverse_of_q = { 1.0 / 12289.0 }; -static const fpr fpr_inv_2sqrsigma0 = { .150865048875372721532312163019 }; -static const fpr fpr_inv_sigma = { .005819826392951607426919370871 }; -static const fpr fpr_sigma_min_9 = { 1.291500756233514568549480827642 }; -static const fpr fpr_sigma_min_10 = { 1.311734375905083682667395805765 }; -static const fpr fpr_log2 = { 0.69314718055994530941723212146 }; -static const fpr fpr_inv_log2 = { 1.4426950408889634073599246810 }; -static const fpr fpr_bnorm_max = { 16822.4121 }; -static const fpr fpr_zero = { 0.0 }; -static const fpr fpr_one = { 1.0 }; -static const fpr fpr_two = { 2.0 }; -static const fpr fpr_onehalf = { 0.5 }; -static const fpr fpr_invsqrt2 = { 0.707106781186547524400844362105 }; -static const fpr fpr_invsqrt8 = { 0.353553390593273762200422181052 }; -static const fpr fpr_ptwo31 = { 2147483648.0 }; -static const fpr fpr_ptwo31m1 = { 2147483647.0 }; -static const fpr fpr_mtwo31m1 = { -2147483647.0 }; -static const fpr fpr_ptwo63m1 = { 9223372036854775807.0 }; -static const fpr fpr_mtwo63m1 = { -9223372036854775807.0 }; -static const fpr fpr_ptwo63 = { 9223372036854775808.0 }; - -static inline int64_t -fpr_rint(fpr x) -{ - /* - * We do not want to use llrint() since it might be not - * constant-time. - * - * Suppose that x >= 0. If x >= 2^52, then it is already an - * integer. Otherwise, if x < 2^52, then computing x+2^52 will - * yield a value that will be rounded to the nearest integer - * with exactly the right rules (round-to-nearest-even). - * - * In order to have constant-time processing, we must do the - * computation for both x >= 0 and x < 0 cases, and use a - * cast to an integer to access the sign and select the proper - * value. 
Such casts also allow us to find out if |x| < 2^52. - */ - int64_t sx, tx, rp, rn, m; - uint32_t ub; - - sx = (int64_t)(x.v - 1.0); - tx = (int64_t)x.v; - rp = (int64_t)(x.v + 4503599627370496.0) - 4503599627370496; - rn = (int64_t)(x.v - 4503599627370496.0) + 4503599627370496; - - /* - * If tx >= 2^52 or tx < -2^52, then result is tx. - * Otherwise, if sx >= 0, then result is rp. - * Otherwise, result is rn. We use the fact that when x is - * close to 0 (|x| <= 0.25) then both rp and rn are correct; - * and if x is not close to 0, then trunc(x-1.0) yields the - * appropriate sign. - */ - - /* - * Clamp rp to zero if tx < 0. - * Clamp rn to zero if tx >= 0. - */ - m = sx >> 63; - rn &= m; - rp &= ~m; - - /* - * Get the 12 upper bits of tx; if they are not all zeros or - * all ones, then tx >= 2^52 or tx < -2^52, and we clamp both - * rp and rn to zero. Otherwise, we clamp tx to zero. - */ - ub = (uint32_t)((uint64_t)tx >> 52); - m = -(int64_t)((((ub + 1) & 0xFFF) - 2) >> 31); - rp &= m; - rn &= m; - tx &= ~m; - - /* - * Only one of tx, rn or rp (at most) can be non-zero at this - * point. - */ - return tx | rn | rp; -} - -static inline int64_t -fpr_floor(fpr x) -{ - int64_t r; - - /* - * The cast performs a trunc() (rounding toward 0) and thus is - * wrong by 1 for most negative values. The correction below is - * constant-time as long as the compiler turns the - * floating-point conversion result into a 0/1 integer without a - * conditional branch or another non-constant-time construction. - * This should hold on all modern architectures with an FPU (and - * if it is false on a given arch, then chances are that the FPU - * itself is not constant-time, making the point moot). 
- */ - r = (int64_t)x.v; - return r - (x.v < (double)r); -} - -static inline int64_t -fpr_trunc(fpr x) -{ - return (int64_t)x.v; -} - -static inline fpr -fpr_add(fpr x, fpr y) -{ - return FPR(x.v + y.v); -} - -static inline fpr -fpr_sub(fpr x, fpr y) -{ - return FPR(x.v - y.v); -} - -static inline fpr -fpr_neg(fpr x) -{ - return FPR(-x.v); -} - -static inline fpr -fpr_half(fpr x) -{ - return FPR(x.v * 0.5); -} - -static inline fpr -fpr_double(fpr x) -{ - return FPR(x.v + x.v); -} - -static inline fpr -fpr_mul(fpr x, fpr y) -{ - return FPR(x.v * y.v); -} - -static inline fpr -fpr_sqr(fpr x) -{ - return FPR(x.v * x.v); -} - -static inline fpr -fpr_inv(fpr x) -{ - return FPR(1.0 / x.v); -} - -static inline fpr -fpr_div(fpr x, fpr y) -{ - return FPR(x.v / y.v); -} - -#if FALCON_AVX2 // yyyAVX2+1 -TARGET_AVX2 -static inline void -fpr_sqrt_avx2(double *t) -{ - __m128d x; - - x = _mm_load1_pd(t); - x = _mm_sqrt_pd(x); - _mm_storel_pd(t, x); -} -#endif // yyyAVX2- - -static inline fpr -fpr_sqrt(fpr x) -{ - /* - * We prefer not to have a dependency on libm when it can be - * avoided. On x86, calling the sqrt() libm function inlines - * the relevant opcode (fsqrt or sqrtsd, depending on whether - * the 387 FPU or SSE2 is used for floating-point operations) - * but then makes an optional call to the library function - * for proper error handling, in case the operand is negative. - * - * To avoid this dependency, we use intrinsics or inline assembly - * on recognized platforms: - * - * - If AVX2 is explicitly enabled, then we use SSE2 intrinsics. - * - * - On GCC/Clang with SSE maths, we use SSE2 intrinsics. - * - * - On GCC/Clang on i386, or MSVC on i386, we use inline assembly - * to call the 387 FPU fsqrt opcode. - * - * - On GCC/Clang/XLC on PowerPC, we use inline assembly to call - * the fsqrt opcode (Clang needs a special hack). - * - * - On GCC/Clang on ARM with hardware floating-point, we use - * inline assembly to call the vqsrt.f64 opcode. 
Due to a - * complex ecosystem of compilers and assembly syntaxes, we - * have to call it "fsqrt" or "fsqrtd", depending on case. - * - * If the platform is not recognized, a call to the system - * library function sqrt() is performed. On some compilers, this - * may actually inline the relevant opcode, and call the library - * function only when the input is invalid (e.g. negative); - * Falcon never actually calls sqrt() on a negative value, but - * the dependency to libm will still be there. - */ - -#if FALCON_AVX2 // yyyAVX2+1 - fpr_sqrt_avx2(&x.v); - return x; -#else // yyyAVX2+0 -#if defined __GNUC__ && defined __SSE2_MATH__ - return FPR(_mm_cvtsd_f64(_mm_sqrt_pd(_mm_set1_pd(x.v)))); -#elif defined __GNUC__ && defined __i386__ - __asm__ __volatile__ ( - "fldl %0\n\t" - "fsqrt\n\t" - "fstpl %0\n\t" - : "+m" (x.v) : : ); - return x; -#elif defined _M_IX86 - __asm { - fld x.v - fsqrt - fstp x.v - } - return x; -#elif defined __PPC__ && defined __GNUC__ - fpr y; - -#if defined __clang__ - /* - * Normally we should use a 'd' constraint (register that contains - * a 'double' value) but Clang 3.8.1 chokes on it. Instead we use - * an 'f' constraint, counting on the fact that 'float' values - * are managed in double-precision registers anyway, and the - * compiler will not add extra rounding steps. - */ - __asm__ ( "fsqrt %0, %1" : "=f" (y.v) : "f" (x.v) : ); -#else - __asm__ ( "fsqrt %0, %1" : "=d" (y.v) : "d" (x.v) : ); -#endif - return y; -#elif (defined __ARM_FP && ((__ARM_FP & 0x08) == 0x08)) \ - || (!defined __ARM_FP && defined __ARM_VFPV2__) - /* - * On ARM, assembly syntaxes are a bit of a mess, depending on - * whether GCC or Clang is used, and the binutils version, and - * whether this is 32-bit or 64-bit mode. 
The code below appears - * to work on: - * 32-bit GCC-4.9.2 Clang-3.5 Binutils-2.25 - * 64-bit GCC-6.3.0 Clang-3.9 Binutils-2.28 - */ -#if defined __aarch64__ && __aarch64__ - __asm__ ( "fsqrt %d0, %d0" : "+w" (x.v) : : ); -#else - __asm__ ( "fsqrtd %P0, %P0" : "+w" (x.v) : : ); -#endif - return x; -#else - return FPR(sqrt(x.v)); -#endif -#endif // yyyAVX2- -} - -static inline int -fpr_lt(fpr x, fpr y) -{ - return x.v < y.v; -} - -TARGET_AVX2 -static inline uint64_t -fpr_expm_p63(fpr x, fpr ccs) -{ - /* - * Polynomial approximation of exp(-x) is taken from FACCT: - * https://eprint.iacr.org/2018/1234 - * Specifically, values are extracted from the implementation - * referenced from the FACCT article, and available at: - * https://github.com/raykzhao/gaussian - * Tests over more than 24 billions of random inputs in the - * 0..log(2) range have never shown a deviation larger than - * 2^(-50) from the true mathematical value. - */ - -#if FALCON_AVX2 // yyyAVX2+1 - - /* - * AVX2 implementation uses more operations than Horner's method, - * but with a lower expression tree depth. This helps because - * additions and multiplications have a latency of 4 cycles on - * a Skylake, but the CPU can issue two of them per cycle. 
- */ - - static const union { - double d[12]; - __m256d v[3]; - } c = { - { - 0.999999999999994892974086724280, - 0.500000000000019206858326015208, - 0.166666666666984014666397229121, - 0.041666666666110491190622155955, - 0.008333333327800835146903501993, - 0.001388888894063186997887560103, - 0.000198412739277311890541063977, - 0.000024801566833585381209939524, - 0.000002755586350219122514855659, - 0.000000275607356160477811864927, - 0.000000025299506379442070029551, - 0.000000002073772366009083061987 - } - }; - - double d1, d2, d4, d8, y; - __m256d d14, d58, d9c; - - d1 = -x.v; - d2 = d1 * d1; - d4 = d2 * d2; - d8 = d4 * d4; - d14 = _mm256_set_pd(d4, d2 * d1, d2, d1); - d58 = _mm256_mul_pd(d14, _mm256_set1_pd(d4)); - d9c = _mm256_mul_pd(d14, _mm256_set1_pd(d8)); - d14 = _mm256_mul_pd(d14, _mm256_loadu_pd(&c.d[0])); - d58 = FMADD(d58, _mm256_loadu_pd(&c.d[4]), d14); - d9c = FMADD(d9c, _mm256_loadu_pd(&c.d[8]), d58); - d9c = _mm256_hadd_pd(d9c, d9c); - y = 1.0 + _mm_cvtsd_f64(_mm256_castpd256_pd128(d9c)) // _mm256_cvtsd_f64(d9c) - + _mm_cvtsd_f64(_mm256_extractf128_pd(d9c, 1)); - y *= ccs.v; - - /* - * Final conversion goes through int64_t first, because that's what - * the underlying opcode (vcvttsd2si) will do, and we know that the - * result will fit, since x >= 0 and ccs < 1. If we did the - * conversion directly to uint64_t, then the compiler would add some - * extra code to cover the case of a source value of 2^63 or more, - * and though the alternate path would never be exercised, the - * extra comparison would cost us some cycles. - */ - return (uint64_t)(int64_t)(y * fpr_ptwo63.v); - -#else // yyyAVX2+0 - - /* - * Normal implementation uses Horner's method, which minimizes - * the number of operations. 
- */ - - double d, y; - - d = x.v; - y = 0.000000002073772366009083061987; - y = 0.000000025299506379442070029551 - y * d; - y = 0.000000275607356160477811864927 - y * d; - y = 0.000002755586350219122514855659 - y * d; - y = 0.000024801566833585381209939524 - y * d; - y = 0.000198412739277311890541063977 - y * d; - y = 0.001388888894063186997887560103 - y * d; - y = 0.008333333327800835146903501993 - y * d; - y = 0.041666666666110491190622155955 - y * d; - y = 0.166666666666984014666397229121 - y * d; - y = 0.500000000000019206858326015208 - y * d; - y = 0.999999999999994892974086724280 - y * d; - y = 1.000000000000000000000000000000 - y * d; - y *= ccs.v; - return (uint64_t)(y * fpr_ptwo63.v); - -#endif // yyyAVX2- -} - -#define fpr_gm_tab Zf(fpr_gm_tab) -extern const fpr fpr_gm_tab[]; - -#define fpr_p2_tab Zf(fpr_p2_tab) -extern const fpr fpr_p2_tab[]; - -/* ====================================================================== */ - -#else // yyyFPEMU+0 yyyFPNATIVE+0 - -#error No FP implementation selected - -#endif // yyyFPEMU- yyyFPNATIVE- diff --git a/crypto_sign/falcon-512-tree/m4-ct/inner.h b/crypto_sign/falcon-512-tree/m4-ct/inner.h deleted file mode 100644 index 1f7d0819..00000000 --- a/crypto_sign/falcon-512-tree/m4-ct/inner.h +++ /dev/null @@ -1,1168 +0,0 @@ -#ifndef FALCON_INNER_H__ -#define FALCON_INNER_H__ - -/* - * Internal functions for Falcon. This is not the API intended to be - * used by applications; instead, this internal API provides all the - * primitives on which wrappers build to provide external APIs. 
- * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -/* - * IMPORTANT API RULES - * ------------------- - * - * This API has some non-trivial usage rules: - * - * - * - All public functions (i.e. the non-static ones) must be referenced - * with the Zf() macro (e.g. Zf(verify_raw) for the verify_raw() - * function). That macro adds a prefix to the name, which is - * configurable with the FALCON_PREFIX macro. This allows compiling - * the code into a specific "namespace" and potentially including - * several versions of this code into a single application (e.g. to - * have an AVX2 and a non-AVX2 variants and select the one to use at - * runtime based on availability of AVX2 opcodes). 
- * - * - Functions that need temporary buffers expects them as a final - * tmp[] array of type uint8_t*, with a size which is documented for - * each function. However, most have some alignment requirements, - * because they will use the array to store 16-bit, 32-bit or 64-bit - * values (e.g. uint64_t or double). The caller must ensure proper - * alignment. What happens on unaligned access depends on the - * underlying architecture, ranging from a slight time penalty - * to immediate termination of the process. - * - * - Some functions rely on specific rounding rules and precision for - * floating-point numbers. On some systems (in particular 32-bit x86 - * with the 387 FPU), this requires setting an hardware control - * word. The caller MUST use set_fpu_cw() to ensure proper precision: - * - * oldcw = set_fpu_cw(2); - * Zf(sign_dyn)(...); - * set_fpu_cw(oldcw); - * - * On systems where the native floating-point precision is already - * proper, or integer-based emulation is used, the set_fpu_cw() - * function does nothing, so it can be called systematically. - */ - -// yyyPQCLEAN+0 yyyNIST+0 yyySUPERCOP+0 -#include "config.h" -// yyyPQCLEAN- yyyNIST- yyySUPERCOP- -// yyySUPERCOP+1 -// yyyCONF* -// yyySUPERCOP- - -#include -#include -#include - -#if defined FALCON_AVX2 && FALCON_AVX2 // yyyAVX2+1 -/* - * This implementation uses AVX2 and optionally FMA intrinsics. 
- */ -#include -#ifndef FALCON_LE -#define FALCON_LE 1 -#endif -#ifndef FALCON_UNALIGNED -#define FALCON_UNALIGNED 1 -#endif -#if defined __GNUC__ -#if defined FALCON_FMA && FALCON_FMA -#define TARGET_AVX2 __attribute__((target("avx2,fma"))) -#else -#define TARGET_AVX2 __attribute__((target("avx2"))) -#endif -#elif defined _MSC_VER && _MSC_VER -#pragma warning( disable : 4752 ) -#endif -#if defined FALCON_FMA && FALCON_FMA -#define FMADD(a, b, c) _mm256_fmadd_pd(a, b, c) -#define FMSUB(a, b, c) _mm256_fmsub_pd(a, b, c) -#else -#define FMADD(a, b, c) _mm256_add_pd(_mm256_mul_pd(a, b), c) -#define FMSUB(a, b, c) _mm256_sub_pd(_mm256_mul_pd(a, b), c) -#endif -#endif // yyyAVX2- - -// yyyNIST+0 yyyPQCLEAN+0 -/* - * On MSVC, disable warning about applying unary minus on an unsigned - * type: this is perfectly defined standard behaviour and we do it - * quite often. - */ -#if defined _MSC_VER && _MSC_VER -#pragma warning( disable : 4146 ) -#endif - -// yyySUPERCOP+0 -/* - * Enable ARM assembly on any ARMv7m platform (if it was not done before). 
- */ -#ifndef FALCON_ASM_CORTEXM4 -#if (defined __ARM_ARCH_7EM__ && __ARM_ARCH_7EM__) \ - && (defined __ARM_FEATURE_DSP && __ARM_FEATURE_DSP) -#define FALCON_ASM_CORTEXM4 1 -#else -#define FALCON_ASM_CORTEXM4 0 -#endif -#endif -// yyySUPERCOP- - -#if defined __i386__ || defined _M_IX86 \ - || defined __x86_64__ || defined _M_X64 || \ - (defined _ARCH_PWR8 && \ - (defined __LITTLE_ENDIAN || defined __LITTLE_ENDIAN__)) - -#ifndef FALCON_LE -#define FALCON_LE 1 -#endif -#ifndef FALCON_UNALIGNED -#define FALCON_UNALIGNED 1 -#endif - -#elif defined FALCON_ASM_CORTEXM4 && FALCON_ASM_CORTEXM4 - -#ifndef FALCON_LE -#define FALCON_LE 1 -#endif -#ifndef FALCON_UNALIGNED -#define FALCON_UNALIGNED 0 -#endif - -#elif (defined __LITTLE_ENDIAN__ && __LITTLE_ENDIAN__) \ - || (defined __BYTE_ORDER__ && defined __ORDER_LITTLE_ENDIAN__ \ - && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) - -#ifndef FALCON_LE -#define FALCON_LE 1 -#endif -#ifndef FALCON_UNALIGNED -#define FALCON_UNALIGNED 0 -#endif - -#else - -#ifndef FALCON_LE -#define FALCON_LE 0 -#endif -#ifndef FALCON_UNALIGNED -#define FALCON_UNALIGNED 0 -#endif - -#endif - -/* - * We ensure that both FALCON_FPEMU and FALCON_FPNATIVE are defined, - * with compatible values (exactly one of them must be non-zero). - * If none is defined, then default FP implementation is 'native' - * except on ARM Cortex M4. 
- */ -#if !defined FALCON_FPEMU && !defined FALCON_FPNATIVE - -#if (defined __ARM_FP && ((__ARM_FP & 0x08) == 0x08)) \ - || (!defined __ARM_FP && defined __ARM_VFPV2__) -#define FALCON_FPEMU 0 -#define FALCON_FPNATIVE 1 -#elif defined FALCON_ASM_CORTEXM4 && FALCON_ASM_CORTEXM4 -#define FALCON_FPEMU 1 -#define FALCON_FPNATIVE 0 -#else -#define FALCON_FPEMU 0 -#define FALCON_FPNATIVE 1 -#endif - -#elif defined FALCON_FPEMU && !defined FALCON_FPNATIVE - -#if FALCON_FPEMU -#define FALCON_FPNATIVE 0 -#else -#define FALCON_FPNATIVE 1 -#endif - -#elif defined FALCON_FPNATIVE && !defined FALCON_FPEMU - -#if FALCON_FPNATIVE -#define FALCON_FPEMU 0 -#else -#define FALCON_FPEMU 1 -#endif - -#endif - -#if (FALCON_FPEMU && FALCON_FPNATIVE) || (!FALCON_FPEMU && !FALCON_FPNATIVE) -#error Exactly one of FALCON_FPEMU and FALCON_FPNATIVE must be selected -#endif - -// yyySUPERCOP+0 -/* - * For seed generation from the operating system: - * - On Linux and glibc-2.25+, FreeBSD 12+ and OpenBSD, use getentropy(). - * - On Unix-like systems, use /dev/urandom (including as a fallback - * for failed getentropy() calls). - * - On Windows, use CryptGenRandom(). 
- */ - -#ifndef FALCON_RAND_GETENTROPY -#if (defined __linux__ && defined __GLIBC__ \ - && (__GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 25))) \ - || (defined __FreeBSD__ && __FreeBSD__ >= 12) \ - || defined __OpenBSD__ -#define FALCON_RAND_GETENTROPY 1 -#else -#define FALCON_RAND_GETENTROPY 0 -#endif -#endif - -#ifndef FALCON_RAND_URANDOM -#if defined _AIX \ - || defined __ANDROID__ \ - || defined __FreeBSD__ \ - || defined __NetBSD__ \ - || defined __OpenBSD__ \ - || defined __DragonFly__ \ - || defined __linux__ \ - || (defined __sun && (defined __SVR4 || defined __svr4__)) \ - || (defined __APPLE__ && defined __MACH__) -#define FALCON_RAND_URANDOM 1 -#else -#define FALCON_RAND_URANDOM 0 -#endif -#endif - -#ifndef FALCON_RAND_WIN32 -#if defined _WIN32 || defined _WIN64 -#define FALCON_RAND_WIN32 1 -#else -#define FALCON_RAND_WIN32 0 -#endif -#endif -// yyySUPERCOP- - -/* - * For still undefined compile-time macros, define them to 0 to avoid - * warnings with -Wundef. - */ -#ifndef FALCON_AVX2 -#define FALCON_AVX2 0 -#endif -#ifndef FALCON_FMA -#define FALCON_FMA 0 -#endif -#ifndef FALCON_KG_CHACHA20 -#define FALCON_KG_CHACHA20 0 -#endif -// yyyNIST- yyyPQCLEAN- - -// yyyPQCLEAN+0 yyySUPERCOP+0 -/* - * "Naming" macro used to apply a consistent prefix over all global - * symbols. - */ -#ifndef FALCON_PREFIX -#define FALCON_PREFIX falcon_inner -#endif -#define Zf(name) Zf_(FALCON_PREFIX, name) -#define Zf_(prefix, name) Zf__(prefix, name) -#define Zf__(prefix, name) prefix ## _ ## name -// yyyPQCLEAN- yyySUPERCOP- - -// yyyAVX2+1 -/* - * We use the TARGET_AVX2 macro to tag some functions which, in some - * configurations, may use AVX2 and FMA intrinsics; this depends on - * the compiler. In all other cases, we just define it to emptiness - * (i.e. it will have no effect). 
- */ -#ifndef TARGET_AVX2 -#define TARGET_AVX2 -#endif -// yyyAVX2- - -/* - * Some computations with floating-point elements, in particular - * rounding to the nearest integer, rely on operations using _exactly_ - * the precision of IEEE-754 binary64 type (i.e. 52 bits). On 32-bit - * x86, the 387 FPU may be used (depending on the target OS) and, in - * that case, may use more precision bits (i.e. 64 bits, for an 80-bit - * total type length); to prevent miscomputations, we define an explicit - * function that modifies the precision in the FPU control word. - * - * set_fpu_cw() sets the precision to the provided value, and returns - * the previously set precision; callers are supposed to restore the - * previous precision on exit. The correct (52-bit) precision is - * configured with the value "2". On unsupported compilers, or on - * targets other than 32-bit x86, or when the native 'double' type is - * not used, the set_fpu_cw() function does nothing at all. - */ -#if FALCON_FPNATIVE // yyyFPNATIVE+1 -#if defined __GNUC__ && defined __i386__ -static inline unsigned -set_fpu_cw(unsigned x) -{ - unsigned short t; - unsigned old; - - __asm__ __volatile__ ("fstcw %0" : "=m" (t) : : ); - old = (t & 0x0300u) >> 8; - t = (unsigned short)((t & ~0x0300u) | (x << 8)); - __asm__ __volatile__ ("fldcw %0" : : "m" (t) : ); - return old; -} -#elif defined _M_IX86 -static inline unsigned -set_fpu_cw(unsigned x) -{ - unsigned short t; - unsigned old; - - __asm { fstcw t } - old = (t & 0x0300u) >> 8; - t = (unsigned short)((t & ~0x0300u) | (x << 8)); - __asm { fldcw t } - return old; -} -#else -static inline unsigned -set_fpu_cw(unsigned x) -{ - return x; -} -#endif -#else // yyyFPNATIVE+0 -static inline unsigned -set_fpu_cw(unsigned x) -{ - return x; -} -#endif // yyyFPNATIVE- - -#if FALCON_FPNATIVE && !FALCON_AVX2 // yyyFPNATIVE+1 yyyAVX2+0 -/* - * If using the native 'double' type but not AVX2 code, on an x86 - * machine with SSE2 activated for maths, then we will use the - * 
SSE2 intrinsics. - */ -#if defined __GNUC__ && defined __SSE2_MATH__ -#include -#endif -#endif // yyyFPNATIVE- yyyAVX2- - -#if FALCON_FPNATIVE // yyyFPNATIVE+1 -/* - * For optimal reproducibility of values, we need to disable contraction - * of floating-point expressions; otherwise, on some architectures (e.g. - * PowerPC), the compiler may generate fused-multiply-add opcodes that - * may round differently than two successive separate opcodes. C99 defines - * a standard pragma for that, but GCC-6.2.2 appears to ignore it, - * hence the GCC-specific pragma (that Clang does not support). - */ -#if defined __clang__ -#pragma STDC FP_CONTRACT OFF -#elif defined __GNUC__ -#pragma GCC optimize ("fp-contract=off") -#endif -#endif // yyyFPNATIVE- - -// yyyPQCLEAN+0 -/* - * MSVC 2015 does not know the C99 keyword 'restrict'. - */ -#if defined _MSC_VER && _MSC_VER -#ifndef restrict -#define restrict __restrict -#endif -#endif -// yyyPQCLEAN- - -/* ==================================================================== */ -/* - * SHAKE256 implementation (shake.c). - * - * API is defined to be easily replaced with the fips202.h API defined - * as part of PQClean. 
- */ - -// yyyPQCLEAN+0 -/* -typedef struct { - union { - uint64_t A[25]; - uint8_t dbuf[200]; - } st; - uint64_t dptr; -} inner_shake256_context; - -#define inner_shake256_init Zf(i_shake256_init) -#define inner_shake256_inject Zf(i_shake256_inject) -#define inner_shake256_flip Zf(i_shake256_flip) -#define inner_shake256_extract Zf(i_shake256_extract) - -void Zf(i_shake256_init)( - inner_shake256_context *sc); -void Zf(i_shake256_inject)( - inner_shake256_context *sc, const uint8_t *in, size_t len); -void Zf(i_shake256_flip)( - inner_shake256_context *sc); -void Zf(i_shake256_extract)( - inner_shake256_context *sc, uint8_t *out, size_t len); -*/ - -// yyyPQCLEAN+1 - -#include "fips202.h" - -#define inner_shake256_context shake256incctx -#define inner_shake256_init(sc) shake256_inc_init(sc) -#define inner_shake256_inject(sc, in, len) shake256_inc_absorb(sc, in, len) -#define inner_shake256_flip(sc) shake256_inc_finalize(sc) -#define inner_shake256_extract(sc, out, len) shake256_inc_squeeze(out, len, sc) - -// yyyPQCLEAN+0 - -// yyyPQCLEAN- - -/* ==================================================================== */ -/* - * Encoding/decoding functions (codec.c). - * - * Encoding functions take as parameters an output buffer (out) with - * a given maximum length (max_out_len); returned value is the actual - * number of bytes which have been written. If the output buffer is - * not large enough, then 0 is returned (some bytes may have been - * written to the buffer). If 'out' is NULL, then 'max_out_len' is - * ignored; instead, the function computes and returns the actual - * required output length (in bytes). - * - * Decoding functions take as parameters an input buffer (in) with - * its maximum length (max_in_len); returned value is the actual number - * of bytes that have been read from the buffer. If the provided length - * is too short, then 0 is returned. - * - * Values to encode or decode are vectors of integers, with N = 2^logn - * elements. 
- * - * Three encoding formats are defined: - * - * - modq: sequence of values modulo 12289, each encoded over exactly - * 14 bits. The encoder and decoder verify that integers are within - * the valid range (0..12288). Values are arrays of uint16. - * - * - trim: sequence of signed integers, a specified number of bits - * each. The number of bits is provided as parameter and includes - * the sign bit. Each integer x must be such that |x| < 2^(bits-1) - * (which means that the -2^(bits-1) value is forbidden); encode and - * decode functions check that property. Values are arrays of - * int16_t or int8_t, corresponding to names 'trim_i16' and - * 'trim_i8', respectively. - * - * - comp: variable-length encoding for signed integers; each integer - * uses a minimum of 9 bits, possibly more. This is normally used - * only for signatures. - * - */ - -size_t Zf(modq_encode)(void *out, size_t max_out_len, - const uint16_t *x, unsigned logn); -size_t Zf(trim_i16_encode)(void *out, size_t max_out_len, - const int16_t *x, unsigned logn, unsigned bits); -size_t Zf(trim_i8_encode)(void *out, size_t max_out_len, - const int8_t *x, unsigned logn, unsigned bits); -size_t Zf(comp_encode)(void *out, size_t max_out_len, - const int16_t *x, unsigned logn); - -size_t Zf(modq_decode)(uint16_t *x, unsigned logn, - const void *in, size_t max_in_len); -size_t Zf(trim_i16_decode)(int16_t *x, unsigned logn, unsigned bits, - const void *in, size_t max_in_len); -size_t Zf(trim_i8_decode)(int8_t *x, unsigned logn, unsigned bits, - const void *in, size_t max_in_len); -size_t Zf(comp_decode)(int16_t *x, unsigned logn, - const void *in, size_t max_in_len); - -/* - * Number of bits for key elements, indexed by logn (1 to 10). This - * is at most 8 bits for all degrees, but some degrees may have shorter - * elements. - */ -extern const uint8_t Zf(max_fg_bits)[]; -extern const uint8_t Zf(max_FG_bits)[]; - -/* - * Maximum size, in bits, of elements in a signature, indexed by logn - * (1 to 10). 
The size includes the sign bit. - */ -extern const uint8_t Zf(max_sig_bits)[]; - -/* ==================================================================== */ -/* - * Support functions used for both signature generation and signature - * verification (common.c). - */ - -/* - * From a SHAKE256 context (must be already flipped), produce a new - * point. This is the non-constant-time version, which may leak enough - * information to serve as a stop condition on a brute force attack on - * the hashed message (provided that the nonce value is known). - */ -void Zf(hash_to_point_vartime)(inner_shake256_context *sc, - uint16_t *x, unsigned logn); - -/* - * From a SHAKE256 context (must be already flipped), produce a new - * point. The temporary buffer (tmp) must have room for 2*2^logn bytes. - * This function is constant-time but is typically more expensive than - * Zf(hash_to_point_vartime)(). - * - * tmp[] must have 16-bit alignment. - */ -void Zf(hash_to_point_ct)(inner_shake256_context *sc, - uint16_t *x, unsigned logn, uint8_t *tmp); - -/* - * Tell whether a given vector (2N coordinates, in two halves) is - * acceptable as a signature. This compares the appropriate norm of the - * vector with the acceptance bound. Returned value is 1 on success - * (vector is short enough to be acceptable), 0 otherwise. - */ -int Zf(is_short)(const int16_t *s1, const int16_t *s2, unsigned logn); - -/* - * Tell whether a given vector (2N coordinates, in two halves) is - * acceptable as a signature. Instead of the first half s1, this - * function receives the "saturated squared norm" of s1, i.e. the - * sum of the squares of the coordinates of s1 (saturated at 2^32-1 - * if the sum exceeds 2^31-1). - * - * Returned value is 1 on success (vector is short enough to be - * acceptable), 0 otherwise. 
- */ -int Zf(is_short_half)(uint32_t sqn, const int16_t *s2, unsigned logn); - -/* ==================================================================== */ -/* - * Signature verification functions (vrfy.c). - */ - -/* - * Convert a public key to NTT + Montgomery format. Conversion is done - * in place. - */ -void Zf(to_ntt_monty)(uint16_t *h, unsigned logn); - -/* - * Internal signature verification code: - * c0[] contains the hashed nonce+message - * s2[] is the decoded signature - * h[] contains the public key, in NTT + Montgomery format - * logn is the degree log - * tmp[] temporary, must have at least 2*2^logn bytes - * Returned value is 1 on success, 0 on error. - * - * tmp[] must have 16-bit alignment. - */ -int Zf(verify_raw)(const uint16_t *c0, const int16_t *s2, - const uint16_t *h, unsigned logn, uint8_t *tmp); - -/* - * Compute the public key h[], given the private key elements f[] and - * g[]. This computes h = g/f mod phi mod q, where phi is the polynomial - * modulus. This function returns 1 on success, 0 on error (an error is - * reported if f is not invertible mod phi mod q). - * - * The tmp[] array must have room for at least 2*2^logn elements. - * tmp[] must have 16-bit alignment. - */ -int Zf(compute_public)(uint16_t *h, - const int8_t *f, const int8_t *g, unsigned logn, uint8_t *tmp); - -/* - * Recompute the fourth private key element. Private key consists in - * four polynomials with small coefficients f, g, F and G, which are - * such that fG - gF = q mod phi; furthermore, f is invertible modulo - * phi and modulo q. This function recomputes G from f, g and F. - * - * The tmp[] array must have room for at least 4*2^logn bytes. - * - * Returned value is 1 in success, 0 on error (f not invertible). - * tmp[] must have 16-bit alignment. - */ -int Zf(complete_private)(int8_t *G, - const int8_t *f, const int8_t *g, const int8_t *F, - unsigned logn, uint8_t *tmp); - -/* - * Test whether a given polynomial is invertible modulo phi and q. 
- * Polynomial coefficients are small integers. - * - * tmp[] must have 16-bit alignment. - */ -int Zf(is_invertible)( - const int16_t *s2, unsigned logn, uint8_t *tmp); - -/* - * Count the number of elements of value zero in the NTT representation - * of the given polynomial: this is the number of primitive 2n-th roots - * of unity (modulo q = 12289) that are roots of the provided polynomial - * (taken modulo q). - * - * tmp[] must have 16-bit alignment. - */ -int Zf(count_nttzero)(const int16_t *sig, unsigned logn, uint8_t *tmp); - -/* - * Internal signature verification with public key recovery: - * h[] receives the public key (NOT in NTT/Montgomery format) - * c0[] contains the hashed nonce+message - * s1[] is the first signature half - * s2[] is the second signature half - * logn is the degree log - * tmp[] temporary, must have at least 2*2^logn bytes - * Returned value is 1 on success, 0 on error. Success is returned if - * the signature is a short enough vector; in that case, the public - * key has been written to h[]. However, the caller must still - * verify that h[] is the correct value (e.g. with regards to a known - * hash of the public key). - * - * h[] may not overlap with any of the other arrays. - * - * tmp[] must have 16-bit alignment. - */ -int Zf(verify_recover)(uint16_t *h, - const uint16_t *c0, const int16_t *s1, const int16_t *s2, - unsigned logn, uint8_t *tmp); - -/* ==================================================================== */ -/* - * Implementation of floating-point real numbers (fpr.h, fpr.c). - */ - -/* - * Real numbers are implemented by an extra header file, included below. - * This is meant to support pluggable implementations. The default - * implementation relies on the C type 'double'. 
- * - * The included file must define the following types, functions and - * constants: - * - * fpr - * type for a real number - * - * fpr fpr_of(int64_t i) - * cast an integer into a real number; source must be in the - * -(2^63-1)..+(2^63-1) range - * - * fpr fpr_scaled(int64_t i, int sc) - * compute i*2^sc as a real number; source 'i' must be in the - * -(2^63-1)..+(2^63-1) range - * - * fpr fpr_ldexp(fpr x, int e) - * compute x*2^e - * - * int64_t fpr_rint(fpr x) - * round x to the nearest integer; x must be in the -(2^63-1) - * to +(2^63-1) range - * - * int64_t fpr_trunc(fpr x) - * round to an integer; this rounds towards zero; value must - * be in the -(2^63-1) to +(2^63-1) range - * - * fpr fpr_add(fpr x, fpr y) - * compute x + y - * - * fpr fpr_sub(fpr x, fpr y) - * compute x - y - * - * fpr fpr_neg(fpr x) - * compute -x - * - * fpr fpr_half(fpr x) - * compute x/2 - * - * fpr fpr_double(fpr x) - * compute x*2 - * - * fpr fpr_mul(fpr x, fpr y) - * compute x * y - * - * fpr fpr_sqr(fpr x) - * compute x * x - * - * fpr fpr_inv(fpr x) - * compute 1/x - * - * fpr fpr_div(fpr x, fpr y) - * compute x/y - * - * fpr fpr_sqrt(fpr x) - * compute the square root of x - * - * int fpr_lt(fpr x, fpr y) - * return 1 if x < y, 0 otherwise - * - * uint64_t fpr_expm_p63(fpr x) - * return exp(x), assuming that 0 <= x < log(2). Returned value - * is scaled to 63 bits (i.e. it really returns 2^63*exp(-x), - * rounded to the nearest integer). Computation should have a - * precision of at least 45 bits. 
- * - * const fpr fpr_gm_tab[] - * array of constants for FFT / iFFT - * - * const fpr fpr_p2_tab[] - * precomputed powers of 2 (by index, 0 to 10) - * - * Constants of type 'fpr': - * - * fpr fpr_q 12289 - * fpr fpr_inverse_of_q 1/12289 - * fpr fpr_inv_2sqrsigma0 1/(2*(1.8205^2)) - * fpr fpr_inv_sigma 1/(1.55*sqrt(12289)) - * fpr fpr_sigma_min_9 1.291500756233514568549480827642 - * fpr fpr_sigma_min_10 1.311734375905083682667395805765 - * fpr fpr_log2 log(2) - * fpr fpr_inv_log2 1/log(2) - * fpr fpr_bnorm_max 16822.4121 - * fpr fpr_zero 0 - * fpr fpr_one 1 - * fpr fpr_two 2 - * fpr fpr_onehalf 0.5 - * fpr fpr_ptwo31 2^31 - * fpr fpr_ptwo31m1 2^31-1 - * fpr fpr_mtwo31m1 -(2^31-1) - * fpr fpr_ptwo63m1 2^63-1 - * fpr fpr_mtwo63m1 -(2^63-1) - * fpr fpr_ptwo63 2^63 - */ -#include "fpr.h" - -/* ==================================================================== */ -/* - * RNG (rng.c). - * - * A PRNG based on ChaCha20 is implemented; it is seeded from a SHAKE256 - * context (flipped) and is used for bulk pseudorandom generation. - * A system-dependent seed generator is also provided. - */ - -/* - * Obtain a random seed from the system RNG. - * - * Returned value is 1 on success, 0 on error. - */ -int Zf(get_seed)(void *seed, size_t seed_len); - -/* - * Structure for a PRNG. This includes a large buffer so that values - * get generated in advance. The 'state' is used to keep the current - * PRNG algorithm state (contents depend on the selected algorithm). - * - * The unions with 'dummy_u64' are there to ensure proper alignment for - * 64-bit direct access. - */ -typedef struct { - union { - uint8_t d[512]; /* MUST be 512, exactly */ - uint64_t dummy_u64; - } buf; - size_t ptr; - union { - uint8_t d[256]; - uint64_t dummy_u64; - } state; - int type; -} prng; - -/* - * Instantiate a PRNG. That PRNG will feed over the provided SHAKE256 - * context (in "flipped" state) to obtain its initial state. 
- */ -void Zf(prng_init)(prng *p, inner_shake256_context *src); - -/* - * Refill the PRNG buffer. This is normally invoked automatically, and - * is declared here only so that prng_get_u64() may be inlined. - */ -void Zf(prng_refill)(prng *p); - -/* - * Get some bytes from a PRNG. - */ -void Zf(prng_get_bytes)(prng *p, void *dst, size_t len); - -/* - * Get a 64-bit random value from a PRNG. - */ -static inline uint64_t -prng_get_u64(prng *p) -{ - size_t u; - - /* - * If there are less than 9 bytes in the buffer, we refill it. - * This means that we may drop the last few bytes, but this allows - * for faster extraction code. Also, it means that we never leave - * an empty buffer. - */ - u = p->ptr; - if (u >= (sizeof p->buf.d) - 9) { - Zf(prng_refill)(p); - u = 0; - } - p->ptr = u + 8; - - /* - * On systems that use little-endian encoding and allow - * unaligned accesses, we can simply read the data where it is. - */ -#if FALCON_LE && FALCON_UNALIGNED // yyyLEU+1 - return *(uint64_t *)(p->buf.d + u); -#else // yyyLEU+0 - return (uint64_t)p->buf.d[u + 0] - | ((uint64_t)p->buf.d[u + 1] << 8) - | ((uint64_t)p->buf.d[u + 2] << 16) - | ((uint64_t)p->buf.d[u + 3] << 24) - | ((uint64_t)p->buf.d[u + 4] << 32) - | ((uint64_t)p->buf.d[u + 5] << 40) - | ((uint64_t)p->buf.d[u + 6] << 48) - | ((uint64_t)p->buf.d[u + 7] << 56); -#endif // yyyLEU- -} - -/* - * Get an 8-bit random value from a PRNG. - */ -static inline unsigned -prng_get_u8(prng *p) -{ - unsigned v; - - v = p->buf.d[p->ptr ++]; - if (p->ptr == sizeof p->buf.d) { - Zf(prng_refill)(p); - } - return v; -} - -/* ==================================================================== */ -/* - * FFT (falcon-fft.c). - * - * A real polynomial is represented as an array of N 'fpr' elements. - * The FFT representation of a real polynomial contains N/2 complex - * elements; each is stored as two real numbers, for the real and - * imaginary parts, respectively. See falcon-fft.c for details on the - * internal representation. 
- */ - -/* - * Compute FFT in-place: the source array should contain a real - * polynomial (N coefficients); its storage area is reused to store - * the FFT representation of that polynomial (N/2 complex numbers). - * - * 'logn' MUST lie between 1 and 10 (inclusive). - */ -void Zf(FFT)(fpr *f, unsigned logn); - -/* - * Compute the inverse FFT in-place: the source array should contain the - * FFT representation of a real polynomial (N/2 elements); the resulting - * real polynomial (N coefficients of type 'fpr') is written over the - * array. - * - * 'logn' MUST lie between 1 and 10 (inclusive). - */ -void Zf(iFFT)(fpr *f, unsigned logn); - -/* - * Add polynomial b to polynomial a. a and b MUST NOT overlap. This - * function works in both normal and FFT representations. - */ -void Zf(poly_add)(fpr *restrict a, const fpr *restrict b, unsigned logn); - -/* - * Subtract polynomial b from polynomial a. a and b MUST NOT overlap. This - * function works in both normal and FFT representations. - */ -void Zf(poly_sub)(fpr *restrict a, const fpr *restrict b, unsigned logn); - -/* - * Negate polynomial a. This function works in both normal and FFT - * representations. - */ -void Zf(poly_neg)(fpr *a, unsigned logn); - -/* - * Compute adjoint of polynomial a. This function works only in FFT - * representation. - */ -void Zf(poly_adj_fft)(fpr *a, unsigned logn); - -/* - * Multiply polynomial a with polynomial b. a and b MUST NOT overlap. - * This function works only in FFT representation. - */ -void Zf(poly_mul_fft)(fpr *restrict a, const fpr *restrict b, unsigned logn); - -/* - * Multiply polynomial a with the adjoint of polynomial b. a and b MUST NOT - * overlap. This function works only in FFT representation. - */ -void Zf(poly_muladj_fft)(fpr *restrict a, const fpr *restrict b, unsigned logn); - -/* - * Multiply polynomial with its own adjoint. This function works only in FFT - * representation. 
- */ -void Zf(poly_mulselfadj_fft)(fpr *a, unsigned logn); - -/* - * Multiply polynomial with a real constant. This function works in both - * normal and FFT representations. - */ -void Zf(poly_mulconst)(fpr *a, fpr x, unsigned logn); - -/* - * Divide polynomial a by polynomial b, modulo X^N+1 (FFT representation). - * a and b MUST NOT overlap. - */ -void Zf(poly_div_fft)(fpr *restrict a, const fpr *restrict b, unsigned logn); - -/* - * Given f and g (in FFT representation), compute 1/(f*adj(f)+g*adj(g)) - * (also in FFT representation). Since the result is auto-adjoint, all its - * coordinates in FFT representation are real; as such, only the first N/2 - * values of d[] are filled (the imaginary parts are skipped). - * - * Array d MUST NOT overlap with either a or b. - */ -void Zf(poly_invnorm2_fft)(fpr *restrict d, - const fpr *restrict a, const fpr *restrict b, unsigned logn); - -/* - * Given F, G, f and g (in FFT representation), compute F*adj(f)+G*adj(g) - * (also in FFT representation). Destination d MUST NOT overlap with - * any of the source arrays. - */ -void Zf(poly_add_muladj_fft)(fpr *restrict d, - const fpr *restrict F, const fpr *restrict G, - const fpr *restrict f, const fpr *restrict g, unsigned logn); - -/* - * Multiply polynomial a by polynomial b, where b is autoadjoint. Both - * a and b are in FFT representation. Since b is autoadjoint, all its - * FFT coefficients are real, and the array b contains only N/2 elements. - * a and b MUST NOT overlap. - */ -void Zf(poly_mul_autoadj_fft)(fpr *restrict a, - const fpr *restrict b, unsigned logn); - -/* - * Divide polynomial a by polynomial b, where b is autoadjoint. Both - * a and b are in FFT representation. Since b is autoadjoint, all its - * FFT coefficients are real, and the array b contains only N/2 elements. - * a and b MUST NOT overlap. 
- */ -void Zf(poly_div_autoadj_fft)(fpr *restrict a, - const fpr *restrict b, unsigned logn); - -/* - * Perform an LDL decomposition of an auto-adjoint matrix G, in FFT - * representation. On input, g00, g01 and g11 are provided (where the - * matrix G = [[g00, g01], [adj(g01), g11]]). On output, the d00, l10 - * and d11 values are written in g00, g01 and g11, respectively - * (with D = [[d00, 0], [0, d11]] and L = [[1, 0], [l10, 1]]). - * (In fact, d00 = g00, so the g00 operand is left unmodified.) - */ -void Zf(poly_LDL_fft)(const fpr *restrict g00, - fpr *restrict g01, fpr *restrict g11, unsigned logn); - -/* - * Perform an LDL decomposition of an auto-adjoint matrix G, in FFT - * representation. This is identical to poly_LDL_fft() except that - * g00, g01 and g11 are unmodified; the outputs d11 and l10 are written - * in two other separate buffers provided as extra parameters. - */ -void Zf(poly_LDLmv_fft)(fpr *restrict d11, fpr *restrict l10, - const fpr *restrict g00, const fpr *restrict g01, - const fpr *restrict g11, unsigned logn); - -/* - * Apply "split" operation on a polynomial in FFT representation: - * f = f0(x^2) + x*f1(x^2), for half-size polynomials f0 and f1 - * (polynomials modulo X^(N/2)+1). f0, f1 and f MUST NOT overlap. - */ -void Zf(poly_split_fft)(fpr *restrict f0, fpr *restrict f1, - const fpr *restrict f, unsigned logn); - -/* - * Apply "merge" operation on two polynomials in FFT representation: - * given f0 and f1, polynomials moduo X^(N/2)+1, this function computes - * f = f0(x^2) + x*f1(x^2), in FFT representation modulo X^N+1. - * f MUST NOT overlap with either f0 or f1. - */ -void Zf(poly_merge_fft)(fpr *restrict f, - const fpr *restrict f0, const fpr *restrict f1, unsigned logn); - -/* ==================================================================== */ -/* - * Key pair generation. - */ - -/* - * Required sizes of the temporary buffer (in bytes). 
- * - * This size is 28*2^logn bytes, except for degrees 2 and 4 (logn = 1 - * or 2) where it is slightly greater. - */ -#define FALCON_KEYGEN_TEMP_1 136 -#define FALCON_KEYGEN_TEMP_2 272 -#define FALCON_KEYGEN_TEMP_3 224 -#define FALCON_KEYGEN_TEMP_4 448 -#define FALCON_KEYGEN_TEMP_5 896 -#define FALCON_KEYGEN_TEMP_6 1792 -#define FALCON_KEYGEN_TEMP_7 3584 -#define FALCON_KEYGEN_TEMP_8 7168 -#define FALCON_KEYGEN_TEMP_9 14336 -#define FALCON_KEYGEN_TEMP_10 28672 - -/* - * Generate a new key pair. Randomness is extracted from the provided - * SHAKE256 context, which must have already been seeded and flipped. - * The tmp[] array must have suitable size (see FALCON_KEYGEN_TEMP_* - * macros) and be aligned for the uint32_t, uint64_t and fpr types. - * - * The private key elements are written in f, g, F and G, and the - * public key is written in h. Either or both of G and h may be NULL, - * in which case the corresponding element is not returned (they can - * be recomputed from f, g and F). - * - * tmp[] must have 64-bit alignment. - * This function uses floating-point rounding (see set_fpu_cw()). - */ -void Zf(keygen)(inner_shake256_context *rng, - int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h, - unsigned logn, uint8_t *tmp); - -/* ==================================================================== */ -/* - * Signature generation. - */ - -/* - * Expand a private key into the B0 matrix in FFT representation and - * the LDL tree. All the values are written in 'expanded_key', for - * a total of (8*logn+40)*2^logn bytes. - * - * The tmp[] array must have room for at least 48*2^logn bytes. - * - * tmp[] must have 64-bit alignment. - * This function uses floating-point rounding (see set_fpu_cw()). 
- */ -void Zf(expand_privkey)(fpr *restrict expanded_key, - const int8_t *f, const int8_t *g, const int8_t *F, const int8_t *G, - unsigned logn, uint8_t *restrict tmp); - -/* - * Compute a signature over the provided hashed message (hm); the - * signature value is one short vector. This function uses an - * expanded key (as generated by Zf(expand_privkey)()). - * - * The sig[] and hm[] buffers may overlap. - * - * On successful output, the start of the tmp[] buffer contains the s1 - * vector (as int16_t elements). - * - * The minimal size (in bytes) of tmp[] is 48*2^logn bytes. - * - * tmp[] must have 64-bit alignment. - * This function uses floating-point rounding (see set_fpu_cw()). - */ -void Zf(sign_tree)(int16_t *sig, inner_shake256_context *rng, - const fpr *restrict expanded_key, - const uint16_t *hm, unsigned logn, uint8_t *tmp); - -/* - * Compute a signature over the provided hashed message (hm); the - * signature value is one short vector. This function uses a raw - * key and dynamically recompute the B0 matrix and LDL tree; this - * saves RAM since there is no needed for an expanded key, but - * increases the signature cost. - * - * The sig[] and hm[] buffers may overlap. - * - * On successful output, the start of the tmp[] buffer contains the s1 - * vector (as int16_t elements). - * - * The minimal size (in bytes) of tmp[] is 72*2^logn bytes. - * - * tmp[] must have 64-bit alignment. - * This function uses floating-point rounding (see set_fpu_cw()). - */ -void Zf(sign_dyn)(int16_t *sig, inner_shake256_context *rng, - const int8_t *restrict f, const int8_t *restrict g, - const int8_t *restrict F, const int8_t *restrict G, - const uint16_t *hm, unsigned logn, uint8_t *tmp); - -/* - * Internal sampler engine. Exported for tests. - * - * sampler_context wraps around a source of random numbers (PRNG) and - * the sigma_min value (nominally dependent on the degree). 
- * - * sampler() takes as parameters: - * ctx pointer to the sampler_context structure - * mu center for the distribution - * isigma inverse of the distribution standard deviation - * It returns an integer sampled along the Gaussian distribution centered - * on mu and of standard deviation sigma = 1/isigma. - * - * gaussian0_sampler() takes as parameter a pointer to a PRNG, and - * returns an integer sampled along a half-Gaussian with standard - * deviation sigma0 = 1.8205 (center is 0, returned value is - * nonnegative). - */ - -typedef struct { - prng p; - fpr sigma_min; -} sampler_context; - -TARGET_AVX2 -int Zf(sampler)(void *ctx, fpr mu, fpr isigma); - -TARGET_AVX2 -int Zf(gaussian0_sampler)(prng *p); - -/* ==================================================================== */ - -#endif diff --git a/crypto_sign/falcon-512-tree/m4-ct/keygen.c b/crypto_sign/falcon-512-tree/m4-ct/keygen.c deleted file mode 100644 index cf7de008..00000000 --- a/crypto_sign/falcon-512-tree/m4-ct/keygen.c +++ /dev/null @@ -1,4301 +0,0 @@ -/* - * Falcon key pair generation. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include "inner.h" - -#define MKN(logn) ((size_t)1 << (logn)) - -/* ==================================================================== */ -/* - * Modular arithmetics. - * - * We implement a few functions for computing modulo a small integer p. - * - * All functions require that 2^30 < p < 2^31. Moreover, operands must - * be in the 0..p-1 range. - * - * Modular addition and subtraction work for all such p. - * - * Montgomery multiplication requires that p is odd, and must be provided - * with an additional value p0i = -1/p mod 2^31. See below for some basics - * on Montgomery multiplication. - * - * Division computes an inverse modulo p by an exponentiation (with - * exponent p-2): this works only if p is prime. Multiplication - * requirements also apply, i.e. p must be odd and p0i must be provided. - * - * The NTT and inverse NTT need all of the above, and also that - * p = 1 mod 2048. - * - * ----------------------------------------------------------------------- - * - * We use Montgomery representation with 31-bit values: - * - * Let R = 2^31 mod p. When 2^30 < p < 2^31, R = 2^31 - p. - * Montgomery representation of an integer x modulo p is x*R mod p. - * - * Montgomery multiplication computes (x*y)/R mod p for - * operands x and y. 
Therefore: - * - * - if operands are x*R and y*R (Montgomery representations of x and - * y), then Montgomery multiplication computes (x*R*y*R)/R = (x*y)*R - * mod p, which is the Montgomery representation of the product x*y; - * - * - if operands are x*R and y (or x and y*R), then Montgomery - * multiplication returns x*y mod p: mixed-representation - * multiplications yield results in normal representation. - * - * To convert to Montgomery representation, we multiply by R, which is done - * by Montgomery-multiplying by R^2. Stand-alone conversion back from - * Montgomery representation is Montgomery-multiplication by 1. - */ - -/* - * Precomputed small primes. Each element contains the following: - * - * p The prime itself. - * - * g A primitive root of phi = X^N+1 (in field Z_p). - * - * s The inverse of the product of all previous primes in the array, - * computed modulo p and in Montgomery representation. - * - * All primes are such that p = 1 mod 2048, and are lower than 2^31. They - * are listed in decreasing order. 
- */ - -typedef struct { - uint32_t p; - uint32_t g; - uint32_t s; -} small_prime; - -static const small_prime PRIMES[] = { - { 2147473409, 383167813, 10239 }, - { 2147389441, 211808905, 471403745 }, - { 2147387393, 37672282, 1329335065 }, - { 2147377153, 1977035326, 968223422 }, - { 2147358721, 1067163706, 132460015 }, - { 2147352577, 1606082042, 598693809 }, - { 2147346433, 2033915641, 1056257184 }, - { 2147338241, 1653770625, 421286710 }, - { 2147309569, 631200819, 1111201074 }, - { 2147297281, 2038364663, 1042003613 }, - { 2147295233, 1962540515, 19440033 }, - { 2147239937, 2100082663, 353296760 }, - { 2147235841, 1991153006, 1703918027 }, - { 2147217409, 516405114, 1258919613 }, - { 2147205121, 409347988, 1089726929 }, - { 2147196929, 927788991, 1946238668 }, - { 2147178497, 1136922411, 1347028164 }, - { 2147100673, 868626236, 701164723 }, - { 2147082241, 1897279176, 617820870 }, - { 2147074049, 1888819123, 158382189 }, - { 2147051521, 25006327, 522758543 }, - { 2147043329, 327546255, 37227845 }, - { 2147039233, 766324424, 1133356428 }, - { 2146988033, 1862817362, 73861329 }, - { 2146963457, 404622040, 653019435 }, - { 2146959361, 1936581214, 995143093 }, - { 2146938881, 1559770096, 634921513 }, - { 2146908161, 422623708, 1985060172 }, - { 2146885633, 1751189170, 298238186 }, - { 2146871297, 578919515, 291810829 }, - { 2146846721, 1114060353, 915902322 }, - { 2146834433, 2069565474, 47859524 }, - { 2146818049, 1552824584, 646281055 }, - { 2146775041, 1906267847, 1597832891 }, - { 2146756609, 1847414714, 1228090888 }, - { 2146744321, 1818792070, 1176377637 }, - { 2146738177, 1118066398, 1054971214 }, - { 2146736129, 52057278, 933422153 }, - { 2146713601, 592259376, 1406621510 }, - { 2146695169, 263161877, 1514178701 }, - { 2146656257, 685363115, 384505091 }, - { 2146650113, 927727032, 537575289 }, - { 2146646017, 52575506, 1799464037 }, - { 2146643969, 1276803876, 1348954416 }, - { 2146603009, 814028633, 1521547704 }, - { 2146572289, 1846678872, 1310832121 }, - 
{ 2146547713, 919368090, 1019041349 }, - { 2146508801, 671847612, 38582496 }, - { 2146492417, 283911680, 532424562 }, - { 2146490369, 1780044827, 896447978 }, - { 2146459649, 327980850, 1327906900 }, - { 2146447361, 1310561493, 958645253 }, - { 2146441217, 412148926, 287271128 }, - { 2146437121, 293186449, 2009822534 }, - { 2146430977, 179034356, 1359155584 }, - { 2146418689, 1517345488, 1790248672 }, - { 2146406401, 1615820390, 1584833571 }, - { 2146404353, 826651445, 607120498 }, - { 2146379777, 3816988, 1897049071 }, - { 2146363393, 1221409784, 1986921567 }, - { 2146355201, 1388081168, 849968120 }, - { 2146336769, 1803473237, 1655544036 }, - { 2146312193, 1023484977, 273671831 }, - { 2146293761, 1074591448, 467406983 }, - { 2146283521, 831604668, 1523950494 }, - { 2146203649, 712865423, 1170834574 }, - { 2146154497, 1764991362, 1064856763 }, - { 2146142209, 627386213, 1406840151 }, - { 2146127873, 1638674429, 2088393537 }, - { 2146099201, 1516001018, 690673370 }, - { 2146093057, 1294931393, 315136610 }, - { 2146091009, 1942399533, 973539425 }, - { 2146078721, 1843461814, 2132275436 }, - { 2146060289, 1098740778, 360423481 }, - { 2146048001, 1617213232, 1951981294 }, - { 2146041857, 1805783169, 2075683489 }, - { 2146019329, 272027909, 1753219918 }, - { 2145986561, 1206530344, 2034028118 }, - { 2145976321, 1243769360, 1173377644 }, - { 2145964033, 887200839, 1281344586 }, - { 2145906689, 1651026455, 906178216 }, - { 2145875969, 1673238256, 1043521212 }, - { 2145871873, 1226591210, 1399796492 }, - { 2145841153, 1465353397, 1324527802 }, - { 2145832961, 1150638905, 554084759 }, - { 2145816577, 221601706, 427340863 }, - { 2145785857, 608896761, 316590738 }, - { 2145755137, 1712054942, 1684294304 }, - { 2145742849, 1302302867, 724873116 }, - { 2145728513, 516717693, 431671476 }, - { 2145699841, 524575579, 1619722537 }, - { 2145691649, 1925625239, 982974435 }, - { 2145687553, 463795662, 1293154300 }, - { 2145673217, 771716636, 881778029 }, - { 2145630209, 1509556977, 
837364988 }, - { 2145595393, 229091856, 851648427 }, - { 2145587201, 1796903241, 635342424 }, - { 2145525761, 715310882, 1677228081 }, - { 2145495041, 1040930522, 200685896 }, - { 2145466369, 949804237, 1809146322 }, - { 2145445889, 1673903706, 95316881 }, - { 2145390593, 806941852, 1428671135 }, - { 2145372161, 1402525292, 159350694 }, - { 2145361921, 2124760298, 1589134749 }, - { 2145359873, 1217503067, 1561543010 }, - { 2145355777, 338341402, 83865711 }, - { 2145343489, 1381532164, 641430002 }, - { 2145325057, 1883895478, 1528469895 }, - { 2145318913, 1335370424, 65809740 }, - { 2145312769, 2000008042, 1919775760 }, - { 2145300481, 961450962, 1229540578 }, - { 2145282049, 910466767, 1964062701 }, - { 2145232897, 816527501, 450152063 }, - { 2145218561, 1435128058, 1794509700 }, - { 2145187841, 33505311, 1272467582 }, - { 2145181697, 269767433, 1380363849 }, - { 2145175553, 56386299, 1316870546 }, - { 2145079297, 2106880293, 1391797340 }, - { 2145021953, 1347906152, 720510798 }, - { 2145015809, 206769262, 1651459955 }, - { 2145003521, 1885513236, 1393381284 }, - { 2144960513, 1810381315, 31937275 }, - { 2144944129, 1306487838, 2019419520 }, - { 2144935937, 37304730, 1841489054 }, - { 2144894977, 1601434616, 157985831 }, - { 2144888833, 98749330, 2128592228 }, - { 2144880641, 1772327002, 2076128344 }, - { 2144864257, 1404514762, 2029969964 }, - { 2144827393, 801236594, 406627220 }, - { 2144806913, 349217443, 1501080290 }, - { 2144796673, 1542656776, 2084736519 }, - { 2144778241, 1210734884, 1746416203 }, - { 2144759809, 1146598851, 716464489 }, - { 2144757761, 286328400, 1823728177 }, - { 2144729089, 1347555695, 1836644881 }, - { 2144727041, 1795703790, 520296412 }, - { 2144696321, 1302475157, 852964281 }, - { 2144667649, 1075877614, 504992927 }, - { 2144573441, 198765808, 1617144982 }, - { 2144555009, 321528767, 155821259 }, - { 2144550913, 814139516, 1819937644 }, - { 2144536577, 571143206, 962942255 }, - { 2144524289, 1746733766, 2471321 }, - { 2144512001, 
1821415077, 124190939 }, - { 2144468993, 917871546, 1260072806 }, - { 2144458753, 378417981, 1569240563 }, - { 2144421889, 175229668, 1825620763 }, - { 2144409601, 1699216963, 351648117 }, - { 2144370689, 1071885991, 958186029 }, - { 2144348161, 1763151227, 540353574 }, - { 2144335873, 1060214804, 919598847 }, - { 2144329729, 663515846, 1448552668 }, - { 2144327681, 1057776305, 590222840 }, - { 2144309249, 1705149168, 1459294624 }, - { 2144296961, 325823721, 1649016934 }, - { 2144290817, 738775789, 447427206 }, - { 2144243713, 962347618, 893050215 }, - { 2144237569, 1655257077, 900860862 }, - { 2144161793, 242206694, 1567868672 }, - { 2144155649, 769415308, 1247993134 }, - { 2144137217, 320492023, 515841070 }, - { 2144120833, 1639388522, 770877302 }, - { 2144071681, 1761785233, 964296120 }, - { 2144065537, 419817825, 204564472 }, - { 2144028673, 666050597, 2091019760 }, - { 2144010241, 1413657615, 1518702610 }, - { 2143952897, 1238327946, 475672271 }, - { 2143940609, 307063413, 1176750846 }, - { 2143918081, 2062905559, 786785803 }, - { 2143899649, 1338112849, 1562292083 }, - { 2143891457, 68149545, 87166451 }, - { 2143885313, 921750778, 394460854 }, - { 2143854593, 719766593, 133877196 }, - { 2143836161, 1149399850, 1861591875 }, - { 2143762433, 1848739366, 1335934145 }, - { 2143756289, 1326674710, 102999236 }, - { 2143713281, 808061791, 1156900308 }, - { 2143690753, 388399459, 1926468019 }, - { 2143670273, 1427891374, 1756689401 }, - { 2143666177, 1912173949, 986629565 }, - { 2143645697, 2041160111, 371842865 }, - { 2143641601, 1279906897, 2023974350 }, - { 2143635457, 720473174, 1389027526 }, - { 2143621121, 1298309455, 1732632006 }, - { 2143598593, 1548762216, 1825417506 }, - { 2143567873, 620475784, 1073787233 }, - { 2143561729, 1932954575, 949167309 }, - { 2143553537, 354315656, 1652037534 }, - { 2143541249, 577424288, 1097027618 }, - { 2143531009, 357862822, 478640055 }, - { 2143522817, 2017706025, 1550531668 }, - { 2143506433, 2078127419, 1824320165 }, - { 
2143488001, 613475285, 1604011510 }, - { 2143469569, 1466594987, 502095196 }, - { 2143426561, 1115430331, 1044637111 }, - { 2143383553, 9778045, 1902463734 }, - { 2143377409, 1557401276, 2056861771 }, - { 2143363073, 652036455, 1965915971 }, - { 2143260673, 1464581171, 1523257541 }, - { 2143246337, 1876119649, 764541916 }, - { 2143209473, 1614992673, 1920672844 }, - { 2143203329, 981052047, 2049774209 }, - { 2143160321, 1847355533, 728535665 }, - { 2143129601, 965558457, 603052992 }, - { 2143123457, 2140817191, 8348679 }, - { 2143100929, 1547263683, 694209023 }, - { 2143092737, 643459066, 1979934533 }, - { 2143082497, 188603778, 2026175670 }, - { 2143062017, 1657329695, 377451099 }, - { 2143051777, 114967950, 979255473 }, - { 2143025153, 1698431342, 1449196896 }, - { 2143006721, 1862741675, 1739650365 }, - { 2142996481, 756660457, 996160050 }, - { 2142976001, 927864010, 1166847574 }, - { 2142965761, 905070557, 661974566 }, - { 2142916609, 40932754, 1787161127 }, - { 2142892033, 1987985648, 675335382 }, - { 2142885889, 797497211, 1323096997 }, - { 2142871553, 2068025830, 1411877159 }, - { 2142861313, 1217177090, 1438410687 }, - { 2142830593, 409906375, 1767860634 }, - { 2142803969, 1197788993, 359782919 }, - { 2142785537, 643817365, 513932862 }, - { 2142779393, 1717046338, 218943121 }, - { 2142724097, 89336830, 416687049 }, - { 2142707713, 5944581, 1356813523 }, - { 2142658561, 887942135, 2074011722 }, - { 2142638081, 151851972, 1647339939 }, - { 2142564353, 1691505537, 1483107336 }, - { 2142533633, 1989920200, 1135938817 }, - { 2142529537, 959263126, 1531961857 }, - { 2142527489, 453251129, 1725566162 }, - { 2142502913, 1536028102, 182053257 }, - { 2142498817, 570138730, 701443447 }, - { 2142416897, 326965800, 411931819 }, - { 2142363649, 1675665410, 1517191733 }, - { 2142351361, 968529566, 1575712703 }, - { 2142330881, 1384953238, 1769087884 }, - { 2142314497, 1977173242, 1833745524 }, - { 2142289921, 95082313, 1714775493 }, - { 2142283777, 109377615, 1070584533 
}, - { 2142277633, 16960510, 702157145 }, - { 2142263297, 553850819, 431364395 }, - { 2142208001, 241466367, 2053967982 }, - { 2142164993, 1795661326, 1031836848 }, - { 2142097409, 1212530046, 712772031 }, - { 2142087169, 1763869720, 822276067 }, - { 2142078977, 644065713, 1765268066 }, - { 2142074881, 112671944, 643204925 }, - { 2142044161, 1387785471, 1297890174 }, - { 2142025729, 783885537, 1000425730 }, - { 2142011393, 905662232, 1679401033 }, - { 2141974529, 799788433, 468119557 }, - { 2141943809, 1932544124, 449305555 }, - { 2141933569, 1527403256, 841867925 }, - { 2141931521, 1247076451, 743823916 }, - { 2141902849, 1199660531, 401687910 }, - { 2141890561, 150132350, 1720336972 }, - { 2141857793, 1287438162, 663880489 }, - { 2141833217, 618017731, 1819208266 }, - { 2141820929, 999578638, 1403090096 }, - { 2141786113, 81834325, 1523542501 }, - { 2141771777, 120001928, 463556492 }, - { 2141759489, 122455485, 2124928282 }, - { 2141749249, 141986041, 940339153 }, - { 2141685761, 889088734, 477141499 }, - { 2141673473, 324212681, 1122558298 }, - { 2141669377, 1175806187, 1373818177 }, - { 2141655041, 1113654822, 296887082 }, - { 2141587457, 991103258, 1585913875 }, - { 2141583361, 1401451409, 1802457360 }, - { 2141575169, 1571977166, 712760980 }, - { 2141546497, 1107849376, 1250270109 }, - { 2141515777, 196544219, 356001130 }, - { 2141495297, 1733571506, 1060744866 }, - { 2141483009, 321552363, 1168297026 }, - { 2141458433, 505818251, 733225819 }, - { 2141360129, 1026840098, 948342276 }, - { 2141325313, 945133744, 2129965998 }, - { 2141317121, 1871100260, 1843844634 }, - { 2141286401, 1790639498, 1750465696 }, - { 2141267969, 1376858592, 186160720 }, - { 2141255681, 2129698296, 1876677959 }, - { 2141243393, 2138900688, 1340009628 }, - { 2141214721, 1933049835, 1087819477 }, - { 2141212673, 1898664939, 1786328049 }, - { 2141202433, 990234828, 940682169 }, - { 2141175809, 1406392421, 993089586 }, - { 2141165569, 1263518371, 289019479 }, - { 2141073409, 1485624211, 
507864514 }, - { 2141052929, 1885134788, 311252465 }, - { 2141040641, 1285021247, 280941862 }, - { 2141028353, 1527610374, 375035110 }, - { 2141011969, 1400626168, 164696620 }, - { 2140999681, 632959608, 966175067 }, - { 2140997633, 2045628978, 1290889438 }, - { 2140993537, 1412755491, 375366253 }, - { 2140942337, 719477232, 785367828 }, - { 2140925953, 45224252, 836552317 }, - { 2140917761, 1157376588, 1001839569 }, - { 2140887041, 278480752, 2098732796 }, - { 2140837889, 1663139953, 924094810 }, - { 2140788737, 802501511, 2045368990 }, - { 2140766209, 1820083885, 1800295504 }, - { 2140764161, 1169561905, 2106792035 }, - { 2140696577, 127781498, 1885987531 }, - { 2140684289, 16014477, 1098116827 }, - { 2140653569, 665960598, 1796728247 }, - { 2140594177, 1043085491, 377310938 }, - { 2140579841, 1732838211, 1504505945 }, - { 2140569601, 302071939, 358291016 }, - { 2140567553, 192393733, 1909137143 }, - { 2140557313, 406595731, 1175330270 }, - { 2140549121, 1748850918, 525007007 }, - { 2140477441, 499436566, 1031159814 }, - { 2140469249, 1886004401, 1029951320 }, - { 2140426241, 1483168100, 1676273461 }, - { 2140420097, 1779917297, 846024476 }, - { 2140413953, 522948893, 1816354149 }, - { 2140383233, 1931364473, 1296921241 }, - { 2140366849, 1917356555, 147196204 }, - { 2140354561, 16466177, 1349052107 }, - { 2140348417, 1875366972, 1860485634 }, - { 2140323841, 456498717, 1790256483 }, - { 2140321793, 1629493973, 150031888 }, - { 2140315649, 1904063898, 395510935 }, - { 2140280833, 1784104328, 831417909 }, - { 2140250113, 256087139, 697349101 }, - { 2140229633, 388553070, 243875754 }, - { 2140223489, 747459608, 1396270850 }, - { 2140200961, 507423743, 1895572209 }, - { 2140162049, 580106016, 2045297469 }, - { 2140149761, 712426444, 785217995 }, - { 2140137473, 1441607584, 536866543 }, - { 2140119041, 346538902, 1740434653 }, - { 2140090369, 282642885, 21051094 }, - { 2140076033, 1407456228, 319910029 }, - { 2140047361, 1619330500, 1488632070 }, - { 2140041217, 
2089408064, 2012026134 }, - { 2140008449, 1705524800, 1613440760 }, - { 2139924481, 1846208233, 1280649481 }, - { 2139906049, 989438755, 1185646076 }, - { 2139867137, 1522314850, 372783595 }, - { 2139842561, 1681587377, 216848235 }, - { 2139826177, 2066284988, 1784999464 }, - { 2139824129, 480888214, 1513323027 }, - { 2139789313, 847937200, 858192859 }, - { 2139783169, 1642000434, 1583261448 }, - { 2139770881, 940699589, 179702100 }, - { 2139768833, 315623242, 964612676 }, - { 2139666433, 331649203, 764666914 }, - { 2139641857, 2118730799, 1313764644 }, - { 2139635713, 519149027, 519212449 }, - { 2139598849, 1526413634, 1769667104 }, - { 2139574273, 551148610, 820739925 }, - { 2139568129, 1386800242, 472447405 }, - { 2139549697, 813760130, 1412328531 }, - { 2139537409, 1615286260, 1609362979 }, - { 2139475969, 1352559299, 1696720421 }, - { 2139455489, 1048691649, 1584935400 }, - { 2139432961, 836025845, 950121150 }, - { 2139424769, 1558281165, 1635486858 }, - { 2139406337, 1728402143, 1674423301 }, - { 2139396097, 1727715782, 1483470544 }, - { 2139383809, 1092853491, 1741699084 }, - { 2139369473, 690776899, 1242798709 }, - { 2139351041, 1768782380, 2120712049 }, - { 2139334657, 1739968247, 1427249225 }, - { 2139332609, 1547189119, 623011170 }, - { 2139310081, 1346827917, 1605466350 }, - { 2139303937, 369317948, 828392831 }, - { 2139301889, 1560417239, 1788073219 }, - { 2139283457, 1303121623, 595079358 }, - { 2139248641, 1354555286, 573424177 }, - { 2139240449, 60974056, 885781403 }, - { 2139222017, 355573421, 1221054839 }, - { 2139215873, 566477826, 1724006500 }, - { 2139150337, 871437673, 1609133294 }, - { 2139144193, 1478130914, 1137491905 }, - { 2139117569, 1854880922, 964728507 }, - { 2139076609, 202405335, 756508944 }, - { 2139062273, 1399715741, 884826059 }, - { 2139045889, 1051045798, 1202295476 }, - { 2139033601, 1707715206, 632234634 }, - { 2139006977, 2035853139, 231626690 }, - { 2138951681, 183867876, 838350879 }, - { 2138945537, 1403254661, 404460202 
}, - { 2138920961, 310865011, 1282911681 }, - { 2138910721, 1328496553, 103472415 }, - { 2138904577, 78831681, 993513549 }, - { 2138902529, 1319697451, 1055904361 }, - { 2138816513, 384338872, 1706202469 }, - { 2138810369, 1084868275, 405677177 }, - { 2138787841, 401181788, 1964773901 }, - { 2138775553, 1850532988, 1247087473 }, - { 2138767361, 874261901, 1576073565 }, - { 2138757121, 1187474742, 993541415 }, - { 2138748929, 1782458888, 1043206483 }, - { 2138744833, 1221500487, 800141243 }, - { 2138738689, 413465368, 1450660558 }, - { 2138695681, 739045140, 342611472 }, - { 2138658817, 1355845756, 672674190 }, - { 2138644481, 608379162, 1538874380 }, - { 2138632193, 1444914034, 686911254 }, - { 2138607617, 484707818, 1435142134 }, - { 2138591233, 539460669, 1290458549 }, - { 2138572801, 2093538990, 2011138646 }, - { 2138552321, 1149786988, 1076414907 }, - { 2138546177, 840688206, 2108985273 }, - { 2138533889, 209669619, 198172413 }, - { 2138523649, 1975879426, 1277003968 }, - { 2138490881, 1351891144, 1976858109 }, - { 2138460161, 1817321013, 1979278293 }, - { 2138429441, 1950077177, 203441928 }, - { 2138400769, 908970113, 628395069 }, - { 2138398721, 219890864, 758486760 }, - { 2138376193, 1306654379, 977554090 }, - { 2138351617, 298822498, 2004708503 }, - { 2138337281, 441457816, 1049002108 }, - { 2138320897, 1517731724, 1442269609 }, - { 2138290177, 1355911197, 1647139103 }, - { 2138234881, 531313247, 1746591962 }, - { 2138214401, 1899410930, 781416444 }, - { 2138202113, 1813477173, 1622508515 }, - { 2138191873, 1086458299, 1025408615 }, - { 2138183681, 1998800427, 827063290 }, - { 2138173441, 1921308898, 749670117 }, - { 2138103809, 1620902804, 2126787647 }, - { 2138099713, 828647069, 1892961817 }, - { 2138085377, 179405355, 1525506535 }, - { 2138060801, 615683235, 1259580138 }, - { 2138044417, 2030277840, 1731266562 }, - { 2138042369, 2087222316, 1627902259 }, - { 2138032129, 126388712, 1108640984 }, - { 2138011649, 715026550, 1017980050 }, - { 2137993217, 
1693714349, 1351778704 }, - { 2137888769, 1289762259, 1053090405 }, - { 2137853953, 199991890, 1254192789 }, - { 2137833473, 941421685, 896995556 }, - { 2137817089, 750416446, 1251031181 }, - { 2137792513, 798075119, 368077456 }, - { 2137786369, 878543495, 1035375025 }, - { 2137767937, 9351178, 1156563902 }, - { 2137755649, 1382297614, 1686559583 }, - { 2137724929, 1345472850, 1681096331 }, - { 2137704449, 834666929, 630551727 }, - { 2137673729, 1646165729, 1892091571 }, - { 2137620481, 778943821, 48456461 }, - { 2137618433, 1730837875, 1713336725 }, - { 2137581569, 805610339, 1378891359 }, - { 2137538561, 204342388, 1950165220 }, - { 2137526273, 1947629754, 1500789441 }, - { 2137516033, 719902645, 1499525372 }, - { 2137491457, 230451261, 556382829 }, - { 2137440257, 979573541, 412760291 }, - { 2137374721, 927841248, 1954137185 }, - { 2137362433, 1243778559, 861024672 }, - { 2137313281, 1341338501, 980638386 }, - { 2137311233, 937415182, 1793212117 }, - { 2137255937, 795331324, 1410253405 }, - { 2137243649, 150756339, 1966999887 }, - { 2137182209, 163346914, 1939301431 }, - { 2137171969, 1952552395, 758913141 }, - { 2137159681, 570788721, 218668666 }, - { 2137147393, 1896656810, 2045670345 }, - { 2137141249, 358493842, 518199643 }, - { 2137139201, 1505023029, 674695848 }, - { 2137133057, 27911103, 830956306 }, - { 2137122817, 439771337, 1555268614 }, - { 2137116673, 790988579, 1871449599 }, - { 2137110529, 432109234, 811805080 }, - { 2137102337, 1357900653, 1184997641 }, - { 2137098241, 515119035, 1715693095 }, - { 2137090049, 408575203, 2085660657 }, - { 2137085953, 2097793407, 1349626963 }, - { 2137055233, 1556739954, 1449960883 }, - { 2137030657, 1545758650, 1369303716 }, - { 2136987649, 332602570, 103875114 }, - { 2136969217, 1499989506, 1662964115 }, - { 2136924161, 857040753, 4738842 }, - { 2136895489, 1948872712, 570436091 }, - { 2136893441, 58969960, 1568349634 }, - { 2136887297, 2127193379, 273612548 }, - { 2136850433, 111208983, 1181257116 }, - { 
2136809473, 1627275942, 1680317971 }, - { 2136764417, 1574888217, 14011331 }, - { 2136741889, 14011055, 1129154251 }, - { 2136727553, 35862563, 1838555253 }, - { 2136721409, 310235666, 1363928244 }, - { 2136698881, 1612429202, 1560383828 }, - { 2136649729, 1138540131, 800014364 }, - { 2136606721, 602323503, 1433096652 }, - { 2136563713, 182209265, 1919611038 }, - { 2136555521, 324156477, 165591039 }, - { 2136549377, 195513113, 217165345 }, - { 2136526849, 1050768046, 939647887 }, - { 2136508417, 1886286237, 1619926572 }, - { 2136477697, 609647664, 35065157 }, - { 2136471553, 679352216, 1452259468 }, - { 2136457217, 128630031, 824816521 }, - { 2136422401, 19787464, 1526049830 }, - { 2136420353, 698316836, 1530623527 }, - { 2136371201, 1651862373, 1804812805 }, - { 2136334337, 326596005, 336977082 }, - { 2136322049, 63253370, 1904972151 }, - { 2136297473, 312176076, 172182411 }, - { 2136248321, 381261841, 369032670 }, - { 2136242177, 358688773, 1640007994 }, - { 2136229889, 512677188, 75585225 }, - { 2136219649, 2095003250, 1970086149 }, - { 2136207361, 1909650722, 537760675 }, - { 2136176641, 1334616195, 1533487619 }, - { 2136158209, 2096285632, 1793285210 }, - { 2136143873, 1897347517, 293843959 }, - { 2136133633, 923586222, 1022655978 }, - { 2136096769, 1464868191, 1515074410 }, - { 2136094721, 2020679520, 2061636104 }, - { 2136076289, 290798503, 1814726809 }, - { 2136041473, 156415894, 1250757633 }, - { 2135996417, 297459940, 1132158924 }, - { 2135955457, 538755304, 1688831340 }, - { 0, 0, 0 } -}; - -/* - * Reduce a small signed integer modulo a small prime. The source - * value x MUST be such that -p < x < p. - */ -static inline uint32_t -modp_set(int32_t x, uint32_t p) -{ - uint32_t w; - - w = (uint32_t)x; - w += p & -(w >> 31); - return w; -} - -/* - * Normalize a modular integer around 0. - */ -static inline int32_t -modp_norm(uint32_t x, uint32_t p) -{ - return (int32_t)(x - (p & (((x - ((p + 1) >> 1)) >> 31) - 1))); -} - -/* - * Compute -1/p mod 2^31. 
This works for all odd integers p that fit - * on 31 bits. - */ -static uint32_t -modp_ninv31(uint32_t p) -{ - uint32_t y; - - y = 2 - p; - y *= 2 - p * y; - y *= 2 - p * y; - y *= 2 - p * y; - y *= 2 - p * y; - return (uint32_t)0x7FFFFFFF & -y; -} - -/* - * Compute R = 2^31 mod p. - */ -static inline uint32_t -modp_R(uint32_t p) -{ - /* - * Since 2^30 < p < 2^31, we know that 2^31 mod p is simply - * 2^31 - p. - */ - return ((uint32_t)1 << 31) - p; -} - -/* - * Addition modulo p. - */ -static inline uint32_t -modp_add(uint32_t a, uint32_t b, uint32_t p) -{ - uint32_t d; - - d = a + b - p; - d += p & -(d >> 31); - return d; -} - -/* - * Subtraction modulo p. - */ -static inline uint32_t -modp_sub(uint32_t a, uint32_t b, uint32_t p) -{ - uint32_t d; - - d = a - b; - d += p & -(d >> 31); - return d; -} - -/* - * Halving modulo p. - */ -/* unused -static inline uint32_t -modp_half(uint32_t a, uint32_t p) -{ - a += p & -(a & 1); - return a >> 1; -} -*/ - -/* - * Montgomery multiplication modulo p. The 'p0i' value is -1/p mod 2^31. - * It is required that p is an odd integer. - */ -static inline uint32_t -modp_montymul(uint32_t a, uint32_t b, uint32_t p, uint32_t p0i) -{ - uint64_t z, w; - uint32_t d; - - z = (uint64_t)a * (uint64_t)b; - w = ((z * p0i) & (uint64_t)0x7FFFFFFF) * p; - d = (uint32_t)((z + w) >> 31) - p; - d += p & -(d >> 31); - return d; -} - -/* - * Compute R2 = 2^62 mod p. - */ -static uint32_t -modp_R2(uint32_t p, uint32_t p0i) -{ - uint32_t z; - - /* - * Compute z = 2^31 mod p (this is the value 1 in Montgomery - * representation), then double it with an addition. - */ - z = modp_R(p); - z = modp_add(z, z, p); - - /* - * Square it five times to obtain 2^32 in Montgomery representation - * (i.e. 2^63 mod p). - */ - z = modp_montymul(z, z, p, p0i); - z = modp_montymul(z, z, p, p0i); - z = modp_montymul(z, z, p, p0i); - z = modp_montymul(z, z, p, p0i); - z = modp_montymul(z, z, p, p0i); - - /* - * Halve the value mod p to get 2^62. 
- */ - z = (z + (p & -(z & 1))) >> 1; - return z; -} - -/* - * Compute 2^(31*x) modulo p. This works for integers x up to 2^11. - * p must be prime such that 2^30 < p < 2^31; p0i must be equal to - * -1/p mod 2^31; R2 must be equal to 2^62 mod p. - */ -static inline uint32_t -modp_Rx(unsigned x, uint32_t p, uint32_t p0i, uint32_t R2) -{ - int i; - uint32_t r, z; - - /* - * 2^(31*x) = (2^31)*(2^(31*(x-1))); i.e. we want the Montgomery - * representation of (2^31)^e mod p, where e = x-1. - * R2 is 2^31 in Montgomery representation. - */ - x --; - r = R2; - z = modp_R(p); - for (i = 0; (1U << i) <= x; i ++) { - if ((x & (1U << i)) != 0) { - z = modp_montymul(z, r, p, p0i); - } - r = modp_montymul(r, r, p, p0i); - } - return z; -} - -/* - * Division modulo p. If the divisor (b) is 0, then 0 is returned. - * This function computes proper results only when p is prime. - * Parameters: - * a dividend - * b divisor - * p odd prime modulus - * p0i -1/p mod 2^31 - * R 2^31 mod R - */ -static uint32_t -modp_div(uint32_t a, uint32_t b, uint32_t p, uint32_t p0i, uint32_t R) -{ - uint32_t z, e; - int i; - - e = p - 2; - z = R; - for (i = 30; i >= 0; i --) { - uint32_t z2; - - z = modp_montymul(z, z, p, p0i); - z2 = modp_montymul(z, b, p, p0i); - z ^= (z ^ z2) & -(uint32_t)((e >> i) & 1); - } - - /* - * The loop above just assumed that b was in Montgomery - * representation, i.e. really contained b*R; under that - * assumption, it returns 1/b in Montgomery representation, - * which is R/b. But we gave it b in normal representation, - * so the loop really returned R/(b/R) = R^2/b. - * - * We want a/b, so we need one Montgomery multiplication with a, - * which also remove one of the R factors, and another such - * multiplication to remove the second R factor. - */ - z = modp_montymul(z, 1, p, p0i); - return modp_montymul(a, z, p, p0i); -} - -/* - * Bit-reversal index table. 
- */ -static const uint16_t REV10[] = { - 0, 512, 256, 768, 128, 640, 384, 896, 64, 576, 320, 832, - 192, 704, 448, 960, 32, 544, 288, 800, 160, 672, 416, 928, - 96, 608, 352, 864, 224, 736, 480, 992, 16, 528, 272, 784, - 144, 656, 400, 912, 80, 592, 336, 848, 208, 720, 464, 976, - 48, 560, 304, 816, 176, 688, 432, 944, 112, 624, 368, 880, - 240, 752, 496, 1008, 8, 520, 264, 776, 136, 648, 392, 904, - 72, 584, 328, 840, 200, 712, 456, 968, 40, 552, 296, 808, - 168, 680, 424, 936, 104, 616, 360, 872, 232, 744, 488, 1000, - 24, 536, 280, 792, 152, 664, 408, 920, 88, 600, 344, 856, - 216, 728, 472, 984, 56, 568, 312, 824, 184, 696, 440, 952, - 120, 632, 376, 888, 248, 760, 504, 1016, 4, 516, 260, 772, - 132, 644, 388, 900, 68, 580, 324, 836, 196, 708, 452, 964, - 36, 548, 292, 804, 164, 676, 420, 932, 100, 612, 356, 868, - 228, 740, 484, 996, 20, 532, 276, 788, 148, 660, 404, 916, - 84, 596, 340, 852, 212, 724, 468, 980, 52, 564, 308, 820, - 180, 692, 436, 948, 116, 628, 372, 884, 244, 756, 500, 1012, - 12, 524, 268, 780, 140, 652, 396, 908, 76, 588, 332, 844, - 204, 716, 460, 972, 44, 556, 300, 812, 172, 684, 428, 940, - 108, 620, 364, 876, 236, 748, 492, 1004, 28, 540, 284, 796, - 156, 668, 412, 924, 92, 604, 348, 860, 220, 732, 476, 988, - 60, 572, 316, 828, 188, 700, 444, 956, 124, 636, 380, 892, - 252, 764, 508, 1020, 2, 514, 258, 770, 130, 642, 386, 898, - 66, 578, 322, 834, 194, 706, 450, 962, 34, 546, 290, 802, - 162, 674, 418, 930, 98, 610, 354, 866, 226, 738, 482, 994, - 18, 530, 274, 786, 146, 658, 402, 914, 82, 594, 338, 850, - 210, 722, 466, 978, 50, 562, 306, 818, 178, 690, 434, 946, - 114, 626, 370, 882, 242, 754, 498, 1010, 10, 522, 266, 778, - 138, 650, 394, 906, 74, 586, 330, 842, 202, 714, 458, 970, - 42, 554, 298, 810, 170, 682, 426, 938, 106, 618, 362, 874, - 234, 746, 490, 1002, 26, 538, 282, 794, 154, 666, 410, 922, - 90, 602, 346, 858, 218, 730, 474, 986, 58, 570, 314, 826, - 186, 698, 442, 954, 122, 634, 378, 890, 250, 762, 506, 1018, - 6, 
518, 262, 774, 134, 646, 390, 902, 70, 582, 326, 838, - 198, 710, 454, 966, 38, 550, 294, 806, 166, 678, 422, 934, - 102, 614, 358, 870, 230, 742, 486, 998, 22, 534, 278, 790, - 150, 662, 406, 918, 86, 598, 342, 854, 214, 726, 470, 982, - 54, 566, 310, 822, 182, 694, 438, 950, 118, 630, 374, 886, - 246, 758, 502, 1014, 14, 526, 270, 782, 142, 654, 398, 910, - 78, 590, 334, 846, 206, 718, 462, 974, 46, 558, 302, 814, - 174, 686, 430, 942, 110, 622, 366, 878, 238, 750, 494, 1006, - 30, 542, 286, 798, 158, 670, 414, 926, 94, 606, 350, 862, - 222, 734, 478, 990, 62, 574, 318, 830, 190, 702, 446, 958, - 126, 638, 382, 894, 254, 766, 510, 1022, 1, 513, 257, 769, - 129, 641, 385, 897, 65, 577, 321, 833, 193, 705, 449, 961, - 33, 545, 289, 801, 161, 673, 417, 929, 97, 609, 353, 865, - 225, 737, 481, 993, 17, 529, 273, 785, 145, 657, 401, 913, - 81, 593, 337, 849, 209, 721, 465, 977, 49, 561, 305, 817, - 177, 689, 433, 945, 113, 625, 369, 881, 241, 753, 497, 1009, - 9, 521, 265, 777, 137, 649, 393, 905, 73, 585, 329, 841, - 201, 713, 457, 969, 41, 553, 297, 809, 169, 681, 425, 937, - 105, 617, 361, 873, 233, 745, 489, 1001, 25, 537, 281, 793, - 153, 665, 409, 921, 89, 601, 345, 857, 217, 729, 473, 985, - 57, 569, 313, 825, 185, 697, 441, 953, 121, 633, 377, 889, - 249, 761, 505, 1017, 5, 517, 261, 773, 133, 645, 389, 901, - 69, 581, 325, 837, 197, 709, 453, 965, 37, 549, 293, 805, - 165, 677, 421, 933, 101, 613, 357, 869, 229, 741, 485, 997, - 21, 533, 277, 789, 149, 661, 405, 917, 85, 597, 341, 853, - 213, 725, 469, 981, 53, 565, 309, 821, 181, 693, 437, 949, - 117, 629, 373, 885, 245, 757, 501, 1013, 13, 525, 269, 781, - 141, 653, 397, 909, 77, 589, 333, 845, 205, 717, 461, 973, - 45, 557, 301, 813, 173, 685, 429, 941, 109, 621, 365, 877, - 237, 749, 493, 1005, 29, 541, 285, 797, 157, 669, 413, 925, - 93, 605, 349, 861, 221, 733, 477, 989, 61, 573, 317, 829, - 189, 701, 445, 957, 125, 637, 381, 893, 253, 765, 509, 1021, - 3, 515, 259, 771, 131, 643, 387, 899, 67, 579, 
323, 835, - 195, 707, 451, 963, 35, 547, 291, 803, 163, 675, 419, 931, - 99, 611, 355, 867, 227, 739, 483, 995, 19, 531, 275, 787, - 147, 659, 403, 915, 83, 595, 339, 851, 211, 723, 467, 979, - 51, 563, 307, 819, 179, 691, 435, 947, 115, 627, 371, 883, - 243, 755, 499, 1011, 11, 523, 267, 779, 139, 651, 395, 907, - 75, 587, 331, 843, 203, 715, 459, 971, 43, 555, 299, 811, - 171, 683, 427, 939, 107, 619, 363, 875, 235, 747, 491, 1003, - 27, 539, 283, 795, 155, 667, 411, 923, 91, 603, 347, 859, - 219, 731, 475, 987, 59, 571, 315, 827, 187, 699, 443, 955, - 123, 635, 379, 891, 251, 763, 507, 1019, 7, 519, 263, 775, - 135, 647, 391, 903, 71, 583, 327, 839, 199, 711, 455, 967, - 39, 551, 295, 807, 167, 679, 423, 935, 103, 615, 359, 871, - 231, 743, 487, 999, 23, 535, 279, 791, 151, 663, 407, 919, - 87, 599, 343, 855, 215, 727, 471, 983, 55, 567, 311, 823, - 183, 695, 439, 951, 119, 631, 375, 887, 247, 759, 503, 1015, - 15, 527, 271, 783, 143, 655, 399, 911, 79, 591, 335, 847, - 207, 719, 463, 975, 47, 559, 303, 815, 175, 687, 431, 943, - 111, 623, 367, 879, 239, 751, 495, 1007, 31, 543, 287, 799, - 159, 671, 415, 927, 95, 607, 351, 863, 223, 735, 479, 991, - 63, 575, 319, 831, 191, 703, 447, 959, 127, 639, 383, 895, - 255, 767, 511, 1023 -}; - -/* - * Compute the roots for NTT and inverse NTT (binary case). Input - * parameter g is a primitive 2048-th root of 1 modulo p (i.e. g^1024 = - * -1 mod p). This fills gm[] and igm[] with powers of g and 1/g: - * gm[rev(i)] = g^i mod p - * igm[rev(i)] = (1/g)^i mod p - * where rev() is the "bit reversal" function over 10 bits. It fills - * the arrays only up to N = 2^logn values. - * - * The values stored in gm[] and igm[] are in Montgomery representation. - * - * p must be a prime such that p = 1 mod 2048. 
- */ -static void -modp_mkgm2(uint32_t *restrict gm, uint32_t *restrict igm, unsigned logn, - uint32_t g, uint32_t p, uint32_t p0i) -{ - size_t u, n; - unsigned k; - uint32_t ig, x1, x2, R2; - - n = (size_t)1 << logn; - - /* - * We want g such that g^(2N) = 1 mod p, but the provided - * generator has order 2048. We must square it a few times. - */ - R2 = modp_R2(p, p0i); - g = modp_montymul(g, R2, p, p0i); - for (k = logn; k < 10; k ++) { - g = modp_montymul(g, g, p, p0i); - } - - ig = modp_div(R2, g, p, p0i, modp_R(p)); - k = 10 - logn; - x1 = x2 = modp_R(p); - for (u = 0; u < n; u ++) { - size_t v; - - v = REV10[u << k]; - gm[v] = x1; - igm[v] = x2; - x1 = modp_montymul(x1, g, p, p0i); - x2 = modp_montymul(x2, ig, p, p0i); - } -} - -/* - * Compute the NTT over a polynomial (binary case). Polynomial elements - * are a[0], a[stride], a[2 * stride]... - */ -static void -modp_NTT2_ext(uint32_t *a, size_t stride, const uint32_t *gm, unsigned logn, - uint32_t p, uint32_t p0i) -{ - size_t t, m, n; - - if (logn == 0) { - return; - } - n = (size_t)1 << logn; - t = n; - for (m = 1; m < n; m <<= 1) { - size_t ht, u, v1; - - ht = t >> 1; - for (u = 0, v1 = 0; u < m; u ++, v1 += t) { - uint32_t s; - size_t v; - uint32_t *r1, *r2; - - s = gm[m + u]; - r1 = a + v1 * stride; - r2 = r1 + ht * stride; - for (v = 0; v < ht; v ++, r1 += stride, r2 += stride) { - uint32_t x, y; - - x = *r1; - y = modp_montymul(*r2, s, p, p0i); - *r1 = modp_add(x, y, p); - *r2 = modp_sub(x, y, p); - } - } - t = ht; - } -} - -/* - * Compute the inverse NTT over a polynomial (binary case). 
- */ -static void -modp_iNTT2_ext(uint32_t *a, size_t stride, const uint32_t *igm, unsigned logn, - uint32_t p, uint32_t p0i) -{ - size_t t, m, n, k; - uint32_t ni; - uint32_t *r; - - if (logn == 0) { - return; - } - n = (size_t)1 << logn; - t = 1; - for (m = n; m > 1; m >>= 1) { - size_t hm, dt, u, v1; - - hm = m >> 1; - dt = t << 1; - for (u = 0, v1 = 0; u < hm; u ++, v1 += dt) { - uint32_t s; - size_t v; - uint32_t *r1, *r2; - - s = igm[hm + u]; - r1 = a + v1 * stride; - r2 = r1 + t * stride; - for (v = 0; v < t; v ++, r1 += stride, r2 += stride) { - uint32_t x, y; - - x = *r1; - y = *r2; - *r1 = modp_add(x, y, p); - *r2 = modp_montymul( - modp_sub(x, y, p), s, p, p0i);; - } - } - t = dt; - } - - /* - * We need 1/n in Montgomery representation, i.e. R/n. Since - * 1 <= logn <= 10, R/n is an integer; morever, R/n <= 2^30 < p, - * thus a simple shift will do. - */ - ni = (uint32_t)1 << (31 - logn); - for (k = 0, r = a; k < n; k ++, r += stride) { - *r = modp_montymul(*r, ni, p, p0i); - } -} - -/* - * Simplified macros for NTT and iNTT (binary case) when the elements - * are consecutive in RAM. - */ -#define modp_NTT2(a, gm, logn, p, p0i) modp_NTT2_ext(a, 1, gm, logn, p, p0i) -#define modp_iNTT2(a, igm, logn, p, p0i) modp_iNTT2_ext(a, 1, igm, logn, p, p0i) - -/* - * Given polynomial f in NTT representation modulo p, compute f' of degree - * less than N/2 such that f' = f0^2 - X*f1^2, where f0 and f1 are - * polynomials of degree less than N/2 such that f = f0(X^2) + X*f1(X^2). - * - * The new polynomial is written "in place" over the first N/2 elements - * of f. - * - * If applied logn times successively on a given polynomial, the resulting - * degree-0 polynomial is the resultant of f and X^N+1 modulo p. - * - * This function applies only to the binary case; it is invoked from - * solve_NTRU_binary_depth1(). 
- */ -static void -modp_poly_rec_res(uint32_t *f, unsigned logn, - uint32_t p, uint32_t p0i, uint32_t R2) -{ - size_t hn, u; - - hn = (size_t)1 << (logn - 1); - for (u = 0; u < hn; u ++) { - uint32_t w0, w1; - - w0 = f[(u << 1) + 0]; - w1 = f[(u << 1) + 1]; - f[u] = modp_montymul(modp_montymul(w0, w1, p, p0i), R2, p, p0i); - } -} - -/* ==================================================================== */ -/* - * Custom bignum implementation. - * - * This is a very reduced set of functionalities. We need to do the - * following operations: - * - * - Rebuild the resultant and the polynomial coefficients from their - * values modulo small primes (of length 31 bits each). - * - * - Compute an extended GCD between the two computed resultants. - * - * - Extract top bits and add scaled values during the successive steps - * of Babai rounding. - * - * When rebuilding values using CRT, we must also recompute the product - * of the small prime factors. We always do it one small factor at a - * time, so the "complicated" operations can be done modulo the small - * prime with the modp_* functions. CRT coefficients (inverses) are - * precomputed. - * - * All values are positive until the last step: when the polynomial - * coefficients have been rebuilt, we normalize them around 0. But then, - * only additions and subtractions on the upper few bits are needed - * afterwards. - * - * We keep big integers as arrays of 31-bit words (in uint32_t values); - * the top bit of each uint32_t is kept equal to 0. Using 31-bit words - * makes it easier to keep track of carries. When negative values are - * used, two's complement is used. - */ - -/* - * Subtract integer b from integer a. Both integers are supposed to have - * the same size. The carry (0 or 1) is returned. Source arrays a and b - * MUST be distinct. - * - * The operation is performed as described above if ctr = 1. 
If - * ctl = 0, the value a[] is unmodified, but all memory accesses are - * still performed, and the carry is computed and returned. - */ -static uint32_t -zint_sub(uint32_t *restrict a, const uint32_t *restrict b, size_t len, - uint32_t ctl) -{ - size_t u; - uint32_t cc, m; - - cc = 0; - m = -ctl; - for (u = 0; u < len; u ++) { - uint32_t aw, w; - - aw = a[u]; - w = aw - b[u] - cc; - cc = w >> 31; - aw ^= ((w & 0x7FFFFFFF) ^ aw) & m; - a[u] = aw; - } - return cc; -} - -/* - * Mutiply the provided big integer m with a small value x. - * This function assumes that x < 2^31. The carry word is returned. - */ -static uint32_t -zint_mul_small(uint32_t *m, size_t mlen, uint32_t x) -{ - size_t u; - uint32_t cc; - - cc = 0; - for (u = 0; u < mlen; u ++) { - uint64_t z; - - z = (uint64_t)m[u] * (uint64_t)x + cc; - m[u] = (uint32_t)z & 0x7FFFFFFF; - cc = (uint32_t)(z >> 31); - } - return cc; -} - -/* - * Reduce a big integer d modulo a small integer p. - * Rules: - * d is unsigned - * p is prime - * 2^30 < p < 2^31 - * p0i = -(1/p) mod 2^31 - * R2 = 2^62 mod p - */ -static uint32_t -zint_mod_small_unsigned(const uint32_t *d, size_t dlen, - uint32_t p, uint32_t p0i, uint32_t R2) -{ - uint32_t x; - size_t u; - - /* - * Algorithm: we inject words one by one, starting with the high - * word. Each step is: - * - multiply x by 2^31 - * - add new word - */ - x = 0; - u = dlen; - while (u -- > 0) { - uint32_t w; - - x = modp_montymul(x, R2, p, p0i); - w = d[u] - p; - w += p & -(w >> 31); - x = modp_add(x, w, p); - } - return x; -} - -/* - * Similar to zint_mod_small_unsigned(), except that d may be signed. - * Extra parameter is Rx = 2^(31*dlen) mod p. - */ -static uint32_t -zint_mod_small_signed(const uint32_t *d, size_t dlen, - uint32_t p, uint32_t p0i, uint32_t R2, uint32_t Rx) -{ - uint32_t z; - - if (dlen == 0) { - return 0; - } - z = zint_mod_small_unsigned(d, dlen, p, p0i, R2); - z = modp_sub(z, Rx & -(d[dlen - 1] >> 30), p); - return z; -} - -/* - * Add y*s to x. 
x and y initially have length 'len' words; the new x - * has length 'len+1' words. 's' must fit on 31 bits. x[] and y[] must - * not overlap. - */ -static void -zint_add_mul_small(uint32_t *restrict x, - const uint32_t *restrict y, size_t len, uint32_t s) -{ - size_t u; - uint32_t cc; - - cc = 0; - for (u = 0; u < len; u ++) { - uint32_t xw, yw; - uint64_t z; - - xw = x[u]; - yw = y[u]; - z = (uint64_t)yw * (uint64_t)s + (uint64_t)xw + (uint64_t)cc; - x[u] = (uint32_t)z & 0x7FFFFFFF; - cc = (uint32_t)(z >> 31); - } - x[len] = cc; -} - -/* - * Normalize a modular integer around 0: if x > p/2, then x is replaced - * with x - p (signed encoding with two's complement); otherwise, x is - * untouched. The two integers x and p are encoded over the same length. - */ -static void -zint_norm_zero(uint32_t *restrict x, const uint32_t *restrict p, size_t len) -{ - size_t u; - uint32_t r, bb; - - /* - * Compare x with p/2. We use the shifted version of p, and p - * is odd, so we really compare with (p-1)/2; we want to perform - * the subtraction if and only if x > (p-1)/2. - */ - r = 0; - bb = 0; - u = len; - while (u -- > 0) { - uint32_t wx, wp, cc; - - /* - * Get the two words to compare in wx and wp (both over - * 31 bits exactly). - */ - wx = x[u]; - wp = (p[u] >> 1) | (bb << 30); - bb = p[u] & 1; - - /* - * We set cc to -1, 0 or 1, depending on whether wp is - * lower than, equal to, or greater than wx. - */ - cc = wp - wx; - cc = ((-cc) >> 31) | -(cc >> 31); - - /* - * If r != 0 then it is either 1 or -1, and we keep its - * value. Otherwise, if r = 0, then we replace it with cc. - */ - r |= cc & ((r & 1) - 1); - } - - /* - * At this point, r = -1, 0 or 1, depending on whether (p-1)/2 - * is lower than, equal to, or greater than x. We thus want to - * do the subtraction only if r = -1. - */ - zint_sub(x, p, len, r >> 31); -} - -/* - * Rebuild integers from their RNS representation. There are 'num' - * integers, and each consists in 'xlen' words. 
'xx' points at that - * first word of the first integer; subsequent integers are accessed - * by adding 'xstride' repeatedly. - * - * The words of an integer are the RNS representation of that integer, - * using the provided 'primes' are moduli. This function replaces - * each integer with its multi-word value (little-endian order). - * - * If "normalize_signed" is non-zero, then the returned value is - * normalized to the -m/2..m/2 interval (where m is the product of all - * small prime moduli); two's complement is used for negative values. - */ -static void -zint_rebuild_CRT(uint32_t *restrict xx, size_t xlen, size_t xstride, - size_t num, const small_prime *primes, int normalize_signed, - uint32_t *restrict tmp) -{ - size_t u; - uint32_t *x; - - tmp[0] = primes[0].p; - for (u = 1; u < xlen; u ++) { - /* - * At the entry of each loop iteration: - * - the first u words of each array have been - * reassembled; - * - the first u words of tmp[] contains the - * product of the prime moduli processed so far. - * - * We call 'q' the product of all previous primes. - */ - uint32_t p, p0i, s, R2; - size_t v; - - p = primes[u].p; - s = primes[u].s; - p0i = modp_ninv31(p); - R2 = modp_R2(p, p0i); - - for (v = 0, x = xx; v < num; v ++, x += xstride) { - uint32_t xp, xq, xr; - /* - * xp = the integer x modulo the prime p for this - * iteration - * xq = (x mod q) mod p - */ - xp = x[u]; - xq = zint_mod_small_unsigned(x, u, p, p0i, R2); - - /* - * New value is (x mod q) + q * (s * (xp - xq) mod p) - */ - xr = modp_montymul(s, modp_sub(xp, xq, p), p, p0i); - zint_add_mul_small(x, tmp, u, xr); - } - - /* - * Update product of primes in tmp[]. - */ - tmp[u] = zint_mul_small(tmp, u, p); - } - - /* - * Normalize the reconstructed values around 0. - */ - if (normalize_signed) { - for (u = 0, x = xx; u < num; u ++, x += xstride) { - zint_norm_zero(x, tmp, xlen); - } - } -} - -/* - * Negate a big integer conditionally: value a is replaced with -a if - * and only if ctl = 1. 
Control value ctl must be 0 or 1. - */ -static void -zint_negate(uint32_t *a, size_t len, uint32_t ctl) -{ - size_t u; - uint32_t cc, m; - - /* - * If ctl = 1 then we flip the bits of a by XORing with - * 0x7FFFFFFF, and we add 1 to the value. If ctl = 0 then we XOR - * with 0 and add 0, which leaves the value unchanged. - */ - cc = ctl; - m = -ctl >> 1; - for (u = 0; u < len; u ++) { - uint32_t aw; - - aw = a[u]; - aw = (aw ^ m) + cc; - a[u] = aw & 0x7FFFFFFF; - cc = aw >> 31; - } -} - -/* - * Replace a with (a*xa+b*xb)/(2^31) and b with (a*ya+b*yb)/(2^31). - * The low bits are dropped (the caller should compute the coefficients - * such that these dropped bits are all zeros). If either or both - * yields a negative value, then the value is negated. - * - * Returned value is: - * 0 both values were positive - * 1 new a had to be negated - * 2 new b had to be negated - * 3 both new a and new b had to be negated - * - * Coefficients xa, xb, ya and yb may use the full signed 32-bit range. - */ -static uint32_t -zint_co_reduce(uint32_t *a, uint32_t *b, size_t len, - int64_t xa, int64_t xb, int64_t ya, int64_t yb) -{ - size_t u; - int64_t cca, ccb; - uint32_t nega, negb; - - cca = 0; - ccb = 0; - for (u = 0; u < len; u ++) { - uint32_t wa, wb; - uint64_t za, zb; - - wa = a[u]; - wb = b[u]; - za = wa * (uint64_t)xa + wb * (uint64_t)xb + (uint64_t)cca; - zb = wa * (uint64_t)ya + wb * (uint64_t)yb + (uint64_t)ccb; - if (u > 0) { - a[u - 1] = (uint32_t)za & 0x7FFFFFFF; - b[u - 1] = (uint32_t)zb & 0x7FFFFFFF; - } - cca = *(int64_t *)&za >> 31; - ccb = *(int64_t *)&zb >> 31; - } - a[len - 1] = (uint32_t)cca; - b[len - 1] = (uint32_t)ccb; - - nega = (uint32_t)((uint64_t)cca >> 63); - negb = (uint32_t)((uint64_t)ccb >> 63); - zint_negate(a, len, nega); - zint_negate(b, len, negb); - return nega | (negb << 1); -} - -/* - * Finish modular reduction. 
Rules on input parameters: - * - * if neg = 1, then -m <= a < 0 - * if neg = 0, then 0 <= a < 2*m - * - * If neg = 0, then the top word of a[] is allowed to use 32 bits. - * - * Modulus m must be odd. - */ -static void -zint_finish_mod(uint32_t *a, size_t len, const uint32_t *m, uint32_t neg) -{ - size_t u; - uint32_t cc, xm, ym; - - /* - * First pass: compare a (assumed nonnegative) with m. Note that - * if the top word uses 32 bits, subtracting m must yield a - * value less than 2^31 since a < 2*m. - */ - cc = 0; - for (u = 0; u < len; u ++) { - cc = (a[u] - m[u] - cc) >> 31; - } - - /* - * If neg = 1 then we must add m (regardless of cc) - * If neg = 0 and cc = 0 then we must subtract m - * If neg = 0 and cc = 1 then we must do nothing - * - * In the loop below, we conditionally subtract either m or -m - * from a. Word xm is a word of m (if neg = 0) or -m (if neg = 1); - * but if neg = 0 and cc = 1, then ym = 0 and it forces mw to 0. - */ - xm = -neg >> 1; - ym = -(neg | (1 - cc)); - cc = neg; - for (u = 0; u < len; u ++) { - uint32_t aw, mw; - - aw = a[u]; - mw = (m[u] ^ xm) & ym; - aw = aw - mw - cc; - a[u] = aw & 0x7FFFFFFF; - cc = aw >> 31; - } -} - -/* - * Replace a with (a*xa+b*xb)/(2^31) mod m, and b with - * (a*ya+b*yb)/(2^31) mod m. Modulus m must be odd; m0i = -1/m[0] mod 2^31. - */ -static void -zint_co_reduce_mod(uint32_t *a, uint32_t *b, const uint32_t *m, size_t len, - uint32_t m0i, int64_t xa, int64_t xb, int64_t ya, int64_t yb) -{ - size_t u; - int64_t cca, ccb; - uint32_t fa, fb; - - /* - * These are actually four combined Montgomery multiplications. 
- */ - cca = 0; - ccb = 0; - fa = ((a[0] * (uint32_t)xa + b[0] * (uint32_t)xb) * m0i) & 0x7FFFFFFF; - fb = ((a[0] * (uint32_t)ya + b[0] * (uint32_t)yb) * m0i) & 0x7FFFFFFF; - for (u = 0; u < len; u ++) { - uint32_t wa, wb; - uint64_t za, zb; - - wa = a[u]; - wb = b[u]; - za = wa * (uint64_t)xa + wb * (uint64_t)xb - + m[u] * (uint64_t)fa + (uint64_t)cca; - zb = wa * (uint64_t)ya + wb * (uint64_t)yb - + m[u] * (uint64_t)fb + (uint64_t)ccb; - if (u > 0) { - a[u - 1] = (uint32_t)za & 0x7FFFFFFF; - b[u - 1] = (uint32_t)zb & 0x7FFFFFFF; - } - cca = *(int64_t *)&za >> 31; - ccb = *(int64_t *)&zb >> 31; - } - a[len - 1] = (uint32_t)cca; - b[len - 1] = (uint32_t)ccb; - - /* - * At this point: - * -m <= a < 2*m - * -m <= b < 2*m - * (this is a case of Montgomery reduction) - * The top words of 'a' and 'b' may have a 32-th bit set. - * We want to add or subtract the modulus, as required. - */ - zint_finish_mod(a, len, m, (uint32_t)((uint64_t)cca >> 63)); - zint_finish_mod(b, len, m, (uint32_t)((uint64_t)ccb >> 63)); -} - -/* - * Compute a GCD between two positive big integers x and y. The two - * integers must be odd. Returned value is 1 if the GCD is 1, 0 - * otherwise. When 1 is returned, arrays u and v are filled with values - * such that: - * 0 <= u <= y - * 0 <= v <= x - * x*u - y*v = 1 - * x[] and y[] are unmodified. Both input values must have the same - * encoded length. Temporary array must be large enough to accommodate 4 - * extra values of that length. Arrays u, v and tmp may not overlap with - * each other, or with either x or y. - */ -static int -zint_bezout(uint32_t *restrict u, uint32_t *restrict v, - const uint32_t *restrict x, const uint32_t *restrict y, - size_t len, uint32_t *restrict tmp) -{ - /* - * Algorithm is an extended binary GCD. 
We maintain 6 values - * a, b, u0, u1, v0 and v1 with the following invariants: - * - * a = x*u0 - y*v0 - * b = x*u1 - y*v1 - * 0 <= a <= x - * 0 <= b <= y - * 0 <= u0 < y - * 0 <= v0 < x - * 0 <= u1 <= y - * 0 <= v1 < x - * - * Initial values are: - * - * a = x u0 = 1 v0 = 0 - * b = y u1 = y v1 = x-1 - * - * Each iteration reduces either a or b, and maintains the - * invariants. Algorithm stops when a = b, at which point their - * common value is GCD(a,b) and (u0,v0) (or (u1,v1)) contains - * the values (u,v) we want to return. - * - * The formal definition of the algorithm is a sequence of steps: - * - * - If a is even, then: - * a <- a/2 - * u0 <- u0/2 mod y - * v0 <- v0/2 mod x - * - * - Otherwise, if b is even, then: - * b <- b/2 - * u1 <- u1/2 mod y - * v1 <- v1/2 mod x - * - * - Otherwise, if a > b, then: - * a <- (a-b)/2 - * u0 <- (u0-u1)/2 mod y - * v0 <- (v0-v1)/2 mod x - * - * - Otherwise: - * b <- (b-a)/2 - * u1 <- (u1-u0)/2 mod y - * v1 <- (v1-v0)/2 mod y - * - * We can show that the operations above preserve the invariants: - * - * - If a is even, then u0 and v0 are either both even or both - * odd (since a = x*u0 - y*v0, and x and y are both odd). - * If u0 and v0 are both even, then (u0,v0) <- (u0/2,v0/2). - * Otherwise, (u0,v0) <- ((u0+y)/2,(v0+x)/2). Either way, - * the a = x*u0 - y*v0 invariant is preserved. - * - * - The same holds for the case where b is even. - * - * - If a and b are odd, and a > b, then: - * - * a-b = x*(u0-u1) - y*(v0-v1) - * - * In that situation, if u0 < u1, then x*(u0-u1) < 0, but - * a-b > 0; therefore, it must be that v0 < v1, and the - * first part of the update is: (u0,v0) <- (u0-u1+y,v0-v1+x), - * which preserves the invariants. Otherwise, if u0 > u1, - * then u0-u1 >= 1, thus x*(u0-u1) >= x. But a <= x and - * b >= 0, hence a-b <= x. It follows that, in that case, - * v0-v1 >= 0. The first part of the update is then: - * (u0,v0) <- (u0-u1,v0-v1), which again preserves the - * invariants. 
- * - * Either way, once the subtraction is done, the new value of - * a, which is the difference of two odd values, is even, - * and the remaining of this step is a subcase of the - * first algorithm case (i.e. when a is even). - * - * - If a and b are odd, and b > a, then the a similar - * argument holds. - * - * The values a and b start at x and y, respectively. Since x - * and y are odd, their GCD is odd, and it is easily seen that - * all steps conserve the GCD (GCD(a-b,b) = GCD(a, b); - * GCD(a/2,b) = GCD(a,b) if GCD(a,b) is odd). Moreover, either a - * or b is reduced by at least one bit at each iteration, so - * the algorithm necessarily converges on the case a = b, at - * which point the common value is the GCD. - * - * In the algorithm expressed above, when a = b, the fourth case - * applies, and sets b = 0. Since a contains the GCD of x and y, - * which are both odd, a must be odd, and subsequent iterations - * (if any) will simply divide b by 2 repeatedly, which has no - * consequence. Thus, the algorithm can run for more iterations - * than necessary; the final GCD will be in a, and the (u,v) - * coefficients will be (u0,v0). - * - * - * The presentation above is bit-by-bit. It can be sped up by - * noticing that all decisions are taken based on the low bits - * and high bits of a and b. We can extract the two top words - * and low word of each of a and b, and compute reduction - * parameters pa, pb, qa and qb such that the new values for - * a and b are: - * a' = (a*pa + b*pb) / (2^31) - * b' = (a*qa + b*qb) / (2^31) - * the two divisions being exact. The coefficients are obtained - * just from the extracted words, and may be slightly off, requiring - * an optional correction: if a' < 0, then we replace pa with -pa - * and pb with -pb. Each such step will reduce the total length - * (sum of lengths of a and b) by at least 30 bits at each - * iteration. 
- */ - uint32_t *u0, *u1, *v0, *v1, *a, *b; - uint32_t x0i, y0i; - uint32_t num, rc; - size_t j; - - if (len == 0) { - return 0; - } - - /* - * u0 and v0 are the u and v result buffers; the four other - * values (u1, v1, a and b) are taken from tmp[]. - */ - u0 = u; - v0 = v; - u1 = tmp; - v1 = u1 + len; - a = v1 + len; - b = a + len; - - /* - * We'll need the Montgomery reduction coefficients. - */ - x0i = modp_ninv31(x[0]); - y0i = modp_ninv31(y[0]); - - /* - * Initialize a, b, u0, u1, v0 and v1. - * a = x u0 = 1 v0 = 0 - * b = y u1 = y v1 = x-1 - * Note that x is odd, so computing x-1 is easy. - */ - memcpy(a, x, len * sizeof *x); - memcpy(b, y, len * sizeof *y); - u0[0] = 1; - memset(u0 + 1, 0, (len - 1) * sizeof *u0); - memset(v0, 0, len * sizeof *v0); - memcpy(u1, y, len * sizeof *u1); - memcpy(v1, x, len * sizeof *v1); - v1[0] --; - - /* - * Each input operand may be as large as 31*len bits, and we - * reduce the total length by at least 30 bits at each iteration. - */ - for (num = 62 * (uint32_t)len + 30; num >= 30; num -= 30) { - uint32_t c0, c1; - uint32_t a0, a1, b0, b1; - uint64_t a_hi, b_hi; - uint32_t a_lo, b_lo; - int64_t pa, pb, qa, qb; - int i; - uint32_t r; - - /* - * Extract the top words of a and b. If j is the highest - * index >= 1 such that a[j] != 0 or b[j] != 0, then we - * want (a[j] << 31) + a[j-1] and (b[j] << 31) + b[j-1]. - * If a and b are down to one word each, then we use - * a[0] and b[0]. - */ - c0 = (uint32_t)-1; - c1 = (uint32_t)-1; - a0 = 0; - a1 = 0; - b0 = 0; - b1 = 0; - j = len; - while (j -- > 0) { - uint32_t aw, bw; - - aw = a[j]; - bw = b[j]; - a0 ^= (a0 ^ aw) & c0; - a1 ^= (a1 ^ aw) & c1; - b0 ^= (b0 ^ bw) & c0; - b1 ^= (b1 ^ bw) & c1; - c1 = c0; - c0 &= (((aw | bw) + 0x7FFFFFFF) >> 31) - (uint32_t)1; - } - - /* - * If c1 = 0, then we grabbed two words for a and b. - * If c1 != 0 but c0 = 0, then we grabbed one word. It - * is not possible that c1 != 0 and c0 != 0, because that - * would mean that both integers are zero. 
- */ - a1 |= a0 & c1; - a0 &= ~c1; - b1 |= b0 & c1; - b0 &= ~c1; - a_hi = ((uint64_t)a0 << 31) + a1; - b_hi = ((uint64_t)b0 << 31) + b1; - a_lo = a[0]; - b_lo = b[0]; - - /* - * Compute reduction factors: - * - * a' = a*pa + b*pb - * b' = a*qa + b*qb - * - * such that a' and b' are both multiple of 2^31, but are - * only marginally larger than a and b. - */ - pa = 1; - pb = 0; - qa = 0; - qb = 1; - for (i = 0; i < 31; i ++) { - /* - * At each iteration: - * - * a <- (a-b)/2 if: a is odd, b is odd, a_hi > b_hi - * b <- (b-a)/2 if: a is odd, b is odd, a_hi <= b_hi - * a <- a/2 if: a is even - * b <- b/2 if: a is odd, b is even - * - * We multiply a_lo and b_lo by 2 at each - * iteration, thus a division by 2 really is a - * non-multiplication by 2. - */ - uint32_t rt, oa, ob, cAB, cBA, cA; - uint64_t rz; - - /* - * rt = 1 if a_hi > b_hi, 0 otherwise. - */ - rz = b_hi - a_hi; - rt = (uint32_t)((rz ^ ((a_hi ^ b_hi) - & (a_hi ^ rz))) >> 63); - - /* - * cAB = 1 if b must be subtracted from a - * cBA = 1 if a must be subtracted from b - * cA = 1 if a must be divided by 2 - * - * Rules: - * - * cAB and cBA cannot both be 1. - * If a is not divided by 2, b is. - */ - oa = (a_lo >> i) & 1; - ob = (b_lo >> i) & 1; - cAB = oa & ob & rt; - cBA = oa & ob & ~rt; - cA = cAB | (oa ^ 1); - - /* - * Conditional subtractions. - */ - a_lo -= b_lo & -cAB; - a_hi -= b_hi & -(uint64_t)cAB; - pa -= qa & -(int64_t)cAB; - pb -= qb & -(int64_t)cAB; - b_lo -= a_lo & -cBA; - b_hi -= a_hi & -(uint64_t)cBA; - qa -= pa & -(int64_t)cBA; - qb -= pb & -(int64_t)cBA; - - /* - * Shifting. - */ - a_lo += a_lo & (cA - 1); - pa += pa & ((int64_t)cA - 1); - pb += pb & ((int64_t)cA - 1); - a_hi ^= (a_hi ^ (a_hi >> 1)) & -(uint64_t)cA; - b_lo += b_lo & -cA; - qa += qa & -(int64_t)cA; - qb += qb & -(int64_t)cA; - b_hi ^= (b_hi ^ (b_hi >> 1)) & ((uint64_t)cA - 1); - } - - /* - * Apply the computed parameters to our values. 
We - * may have to correct pa and pb depending on the - * returned value of zint_co_reduce() (when a and/or b - * had to be negated). - */ - r = zint_co_reduce(a, b, len, pa, pb, qa, qb); - pa -= (pa + pa) & -(int64_t)(r & 1); - pb -= (pb + pb) & -(int64_t)(r & 1); - qa -= (qa + qa) & -(int64_t)(r >> 1); - qb -= (qb + qb) & -(int64_t)(r >> 1); - zint_co_reduce_mod(u0, u1, y, len, y0i, pa, pb, qa, qb); - zint_co_reduce_mod(v0, v1, x, len, x0i, pa, pb, qa, qb); - } - - /* - * At that point, array a[] should contain the GCD, and the - * results (u,v) should already be set. We check that the GCD - * is indeed 1. We also check that the two operands x and y - * are odd. - */ - rc = a[0] ^ 1; - for (j = 1; j < len; j ++) { - rc |= a[j]; - } - return (int)((1 - ((rc | -rc) >> 31)) & x[0] & y[0]); -} - -/* - * Add k*y*2^sc to x. The result is assumed to fit in the array of - * size xlen (truncation is applied if necessary). - * Scale factor 'sc' is provided as sch and scl, such that: - * sch = sc / 31 - * scl = sc % 31 - * xlen MUST NOT be lower than ylen. - * - * x[] and y[] are both signed integers, using two's complement for - * negative values. - */ -static void -zint_add_scaled_mul_small(uint32_t *restrict x, size_t xlen, - const uint32_t *restrict y, size_t ylen, int32_t k, - uint32_t sch, uint32_t scl) -{ - size_t u; - uint32_t ysign, tw; - int32_t cc; - - if (ylen == 0) { - return; - } - - ysign = -(y[ylen - 1] >> 30) >> 1; - tw = 0; - cc = 0; - for (u = sch; u < xlen; u ++) { - size_t v; - uint32_t wy, wys, ccu; - uint64_t z; - - /* - * Get the next word of y (scaled). - */ - v = u - sch; - wy = v < ylen ? y[v] : ysign; - wys = ((wy << scl) & 0x7FFFFFFF) | tw; - tw = wy >> (31 - scl); - - /* - * The expression below does not overflow. 
- */ - z = (uint64_t)((int64_t)wys * (int64_t)k + (int64_t)x[u] + cc); - x[u] = (uint32_t)z & 0x7FFFFFFF; - - /* - * Right-shifting the signed value z would yield - * implementation-defined results (arithmetic shift is - * not guaranteed). However, we can cast to unsigned, - * and get the next carry as an unsigned word. We can - * then convert it back to signed by using the guaranteed - * fact that 'int32_t' uses two's complement with no - * trap representation or padding bit, and with a layout - * compatible with that of 'uint32_t'. - */ - ccu = (uint32_t)(z >> 31); - cc = *(int32_t *)&ccu; - } -} - -/* - * Subtract y*2^sc from x. The result is assumed to fit in the array of - * size xlen (truncation is applied if necessary). - * Scale factor 'sc' is provided as sch and scl, such that: - * sch = sc / 31 - * scl = sc % 31 - * xlen MUST NOT be lower than ylen. - * - * x[] and y[] are both signed integers, using two's complement for - * negative values. - */ -static void -zint_sub_scaled(uint32_t *restrict x, size_t xlen, - const uint32_t *restrict y, size_t ylen, uint32_t sch, uint32_t scl) -{ - size_t u; - uint32_t ysign, tw; - uint32_t cc; - - if (ylen == 0) { - return; - } - - ysign = -(y[ylen - 1] >> 30) >> 1; - tw = 0; - cc = 0; - for (u = sch; u < xlen; u ++) { - size_t v; - uint32_t w, wy, wys; - - /* - * Get the next word of y (scaled). - */ - v = u - sch; - wy = v < ylen ? y[v] : ysign; - wys = ((wy << scl) & 0x7FFFFFFF) | tw; - tw = wy >> (31 - scl); - - w = x[u] - wys - cc; - x[u] = w & 0x7FFFFFFF; - cc = w >> 31; - } -} - -/* - * Convert a one-word signed big integer into a signed value. - */ -static inline int32_t -zint_one_to_plain(const uint32_t *x) -{ - uint32_t w; - - w = x[0]; - w |= (w & 0x40000000) << 1; - return *(int32_t *)&w; -} - -/* ==================================================================== */ - -/* - * Convert a polynomial to floating-point values. 
- * - * Each coefficient has length flen words, and starts fstride words after - * the previous. - * - * IEEE-754 binary64 values can represent values in a finite range, - * roughly 2^(-1023) to 2^(+1023); thus, if coefficients are too large, - * they should be "trimmed" by pointing not to the lowest word of each, - * but upper. - */ -static void -poly_big_to_fp(fpr *d, const uint32_t *f, size_t flen, size_t fstride, - unsigned logn) -{ - size_t n, u; - - n = MKN(logn); - if (flen == 0) { - for (u = 0; u < n; u ++) { - d[u] = fpr_zero; - } - return; - } - for (u = 0; u < n; u ++, f += fstride) { - size_t v; - uint32_t neg, cc, xm; - fpr x, fsc; - - /* - * Get sign of the integer; if it is negative, then we - * will load its absolute value instead, and negate the - * result. - */ - neg = -(f[flen - 1] >> 30); - xm = neg >> 1; - cc = neg & 1; - x = fpr_zero; - fsc = fpr_one; - for (v = 0; v < flen; v ++, fsc = fpr_mul(fsc, fpr_ptwo31)) { - uint32_t w; - - w = (f[v] ^ xm) + cc; - cc = w >> 31; - w &= 0x7FFFFFFF; - w -= (w << 1) & neg; - x = fpr_add(x, fpr_mul(fpr_of(*(int32_t *)&w), fsc)); - } - d[u] = x; - } -} - -/* - * Convert a polynomial to small integers. Source values are supposed - * to be one-word integers, signed over 31 bits. Returned value is 0 - * if any of the coefficients exceeds the provided limit (in absolute - * value), or 1 on success. - * - * This is not constant-time; this is not a problem here, because on - * any failure, the NTRU-solving process will be deemed to have failed - * and the (f,g) polynomials will be discarded. - */ -static int -poly_big_to_small(int8_t *d, const uint32_t *s, int lim, unsigned logn) -{ - size_t n, u; - - n = MKN(logn); - for (u = 0; u < n; u ++) { - int32_t z; - - z = zint_one_to_plain(s + u); - if (z < -lim || z > lim) { - return 0; - } - d[u] = (int8_t)z; - } - return 1; -} - -/* - * Subtract k*f from F, where F, f and k are polynomials modulo X^N+1. 
- * Coefficients of polynomial k are small integers (signed values in the - * -2^31..2^31 range) scaled by 2^sc. Value sc is provided as sch = sc / 31 - * and scl = sc % 31. - * - * This function implements the basic quadratic multiplication algorithm, - * which is efficient in space (no extra buffer needed) but slow at - * high degree. - */ -static void -poly_sub_scaled(uint32_t *restrict F, size_t Flen, size_t Fstride, - const uint32_t *restrict f, size_t flen, size_t fstride, - const int32_t *restrict k, uint32_t sch, uint32_t scl, unsigned logn) -{ - size_t n, u; - - n = MKN(logn); - for (u = 0; u < n; u ++) { - int32_t kf; - size_t v; - uint32_t *x; - const uint32_t *y; - - kf = -k[u]; - x = F + u * Fstride; - y = f; - for (v = 0; v < n; v ++) { - zint_add_scaled_mul_small( - x, Flen, y, flen, kf, sch, scl); - if (u + v == n - 1) { - x = F; - kf = -kf; - } else { - x += Fstride; - } - y += fstride; - } - } -} - -/* - * Subtract k*f from F. Coefficients of polynomial k are small integers - * (signed values in the -2^31..2^31 range) scaled by 2^sc. This function - * assumes that the degree is large, and integers relatively small. - * The value sc is provided as sch = sc / 31 and scl = sc % 31. - */ -static void -poly_sub_scaled_ntt(uint32_t *restrict F, size_t Flen, size_t Fstride, - const uint32_t *restrict f, size_t flen, size_t fstride, - const int32_t *restrict k, uint32_t sch, uint32_t scl, unsigned logn, - uint32_t *restrict tmp) -{ - uint32_t *gm, *igm, *fk, *t1, *x; - const uint32_t *y; - size_t n, u, tlen; - const small_prime *primes; - - n = MKN(logn); - tlen = flen + 1; - gm = tmp; - igm = gm + MKN(logn); - fk = igm + MKN(logn); - t1 = fk + n * tlen; - - primes = PRIMES; - - /* - * Compute k*f in fk[], in RNS notation. 
- */ - for (u = 0; u < tlen; u ++) { - uint32_t p, p0i, R2, Rx; - size_t v; - - p = primes[u].p; - p0i = modp_ninv31(p); - R2 = modp_R2(p, p0i); - Rx = modp_Rx((unsigned)flen, p, p0i, R2); - modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i); - - for (v = 0; v < n; v ++) { - t1[v] = modp_set(k[v], p); - } - modp_NTT2(t1, gm, logn, p, p0i); - for (v = 0, y = f, x = fk + u; - v < n; v ++, y += fstride, x += tlen) - { - *x = zint_mod_small_signed(y, flen, p, p0i, R2, Rx); - } - modp_NTT2_ext(fk + u, tlen, gm, logn, p, p0i); - for (v = 0, x = fk + u; v < n; v ++, x += tlen) { - *x = modp_montymul( - modp_montymul(t1[v], *x, p, p0i), R2, p, p0i); - } - modp_iNTT2_ext(fk + u, tlen, igm, logn, p, p0i); - } - - /* - * Rebuild k*f. - */ - zint_rebuild_CRT(fk, tlen, tlen, n, primes, 1, t1); - - /* - * Subtract k*f, scaled, from F. - */ - for (u = 0, x = F, y = fk; u < n; u ++, x += Fstride, y += tlen) { - zint_sub_scaled(x, Flen, y, tlen, sch, scl); - } -} - -/* ==================================================================== */ - -#if FALCON_KG_CHACHA20 // yyyKG_CHACHA20+1 - -#define RNG_CONTEXT prng -#define get_rng_u64 prng_get_u64 - -#else // yyyKG_CHACHA20+0 - -#define RNG_CONTEXT inner_shake256_context - -/* - * Get a random 8-byte integer from a SHAKE-based RNG. This function - * ensures consistent interpretation of the SHAKE output so that - * the same values will be obtained over different platforms, in case - * a known seed is used. - */ -static inline uint64_t -get_rng_u64(inner_shake256_context *rng) -{ - /* - * We enforce little-endian representation. - */ - -#if FALCON_LE // yyyLE+1 - /* - * On little-endian systems we just interpret the bytes "as is" - * (this is correct because the exact-width types such as - * 'uint64_t' are guaranteed to have no padding and no trap - * representation). 
- */ - uint64_t r; - - inner_shake256_extract(rng, (uint8_t *)&r, sizeof r); - return r; -#else // yyyLE+0 - uint8_t tmp[8]; - - inner_shake256_extract(rng, tmp, sizeof tmp); - return (uint64_t)tmp[0] - | ((uint64_t)tmp[1] << 8) - | ((uint64_t)tmp[2] << 16) - | ((uint64_t)tmp[3] << 24) - | ((uint64_t)tmp[4] << 32) - | ((uint64_t)tmp[5] << 40) - | ((uint64_t)tmp[6] << 48) - | ((uint64_t)tmp[7] << 56); -#endif // yyyLE- -} - -#endif // yyyKG_CHACHA20- - -/* - * Table below incarnates a discrete Gaussian distribution: - * D(x) = exp(-(x^2)/(2*sigma^2)) - * where sigma = 1.17*sqrt(q/(2*N)), q = 12289, and N = 1024. - * Element 0 of the table is P(x = 0). - * For k > 0, element k is P(x >= k+1 | x > 0). - * Probabilities are scaled up by 2^63. - */ -static const uint64_t gauss_1024_12289[] = { - 1283868770400643928u, 6416574995475331444u, 4078260278032692663u, - 2353523259288686585u, 1227179971273316331u, 575931623374121527u, - 242543240509105209u, 91437049221049666u, 30799446349977173u, - 9255276791179340u, 2478152334826140u, 590642893610164u, - 125206034929641u, 23590435911403u, 3948334035941u, - 586753615614u, 77391054539u, 9056793210u, - 940121950u, 86539696u, 7062824u, - 510971u, 32764u, 1862u, - 94u, 4u, 0u -}; - -/* - * Generate a random value with a Gaussian distribution centered on 0. - * The RNG must be ready for extraction (already flipped). - * - * Distribution has standard deviation 1.17*sqrt(q/(2*N)). The - * precomputed table is for N = 1024. Since the sum of two independent - * values of standard deviation sigma has standard deviation - * sigma*sqrt(2), then we can just generate more values and add them - * together for lower dimensions. - */ -static int -mkgauss(RNG_CONTEXT *rng, unsigned logn) -{ - unsigned u, g; - int val; - - g = 1U << (10 - logn); - val = 0; - for (u = 0; u < g; u ++) { - /* - * Each iteration generates one value with the - * Gaussian distribution for N = 1024. - * - * We use two random 64-bit values. 
First value - * decides on whether the generated value is 0, and, - * if not, the sign of the value. Second random 64-bit - * word is used to generate the non-zero value. - * - * For constant-time code we have to read the complete - * table. This has negligible cost, compared with the - * remainder of the keygen process (solving the NTRU - * equation). - */ - uint64_t r; - uint32_t f, v, k, neg; - - /* - * First value: - * - flag 'neg' is randomly selected to be 0 or 1. - * - flag 'f' is set to 1 if the generated value is zero, - * or set to 0 otherwise. - */ - r = get_rng_u64(rng); - neg = (uint32_t)(r >> 63); - r &= ~((uint64_t)1 << 63); - f = (uint32_t)((r - gauss_1024_12289[0]) >> 63); - - /* - * We produce a new random 63-bit integer r, and go over - * the array, starting at index 1. We store in v the - * index of the first array element which is not greater - * than r, unless the flag f was already 1. - */ - v = 0; - r = get_rng_u64(rng); - r &= ~((uint64_t)1 << 63); - for (k = 1; k < (sizeof gauss_1024_12289) - / (sizeof gauss_1024_12289[0]); k ++) - { - uint32_t t; - - t = (uint32_t)((r - gauss_1024_12289[k]) >> 63) ^ 1; - v |= k & -(t & (f ^ 1)); - f |= t; - } - - /* - * We apply the sign ('neg' flag). If the value is zero, - * the sign has no effect. - */ - v = (v ^ -neg) + neg; - - /* - * Generated value is added to val. - */ - val += *(int32_t *)&v; - } - return val; -} - -/* - * The MAX_BL_SMALL[] and MAX_BL_LARGE[] contain the lengths, in 31-bit - * words, of intermediate values in the computation: - * - * MAX_BL_SMALL[depth]: length for the input f and g at that depth - * MAX_BL_LARGE[depth]: length for the unreduced F and G at that depth - * - * Rules: - * - * - Within an array, values grow. - * - * - The 'SMALL' array must have an entry for maximum depth, corresponding - * to the size of values used in the binary GCD. There is no such value - * for the 'LARGE' array (the binary GCD yields already reduced - * coefficients). 
- * - * - MAX_BL_LARGE[depth] >= MAX_BL_SMALL[depth + 1]. - * - * - Values must be large enough to handle the common cases, with some - * margins. - * - * - Values must not be "too large" either because we will convert some - * integers into floating-point values by considering the top 10 words, - * i.e. 310 bits; hence, for values of length more than 10 words, we - * should take care to have the length centered on the expected size. - * - * The following average lengths, in bits, have been measured on thousands - * of random keys (fg = max length of the absolute value of coefficients - * of f and g at that depth; FG = idem for the unreduced F and G; for the - * maximum depth, F and G are the output of binary GCD, multiplied by q; - * for each value, the average and standard deviation are provided). - * - * Binary case: - * depth: 10 fg: 6307.52 (24.48) FG: 6319.66 (24.51) - * depth: 9 fg: 3138.35 (12.25) FG: 9403.29 (27.55) - * depth: 8 fg: 1576.87 ( 7.49) FG: 4703.30 (14.77) - * depth: 7 fg: 794.17 ( 4.98) FG: 2361.84 ( 9.31) - * depth: 6 fg: 400.67 ( 3.10) FG: 1188.68 ( 6.04) - * depth: 5 fg: 202.22 ( 1.87) FG: 599.81 ( 3.87) - * depth: 4 fg: 101.62 ( 1.02) FG: 303.49 ( 2.38) - * depth: 3 fg: 50.37 ( 0.53) FG: 153.65 ( 1.39) - * depth: 2 fg: 24.07 ( 0.25) FG: 78.20 ( 0.73) - * depth: 1 fg: 10.99 ( 0.08) FG: 39.82 ( 0.41) - * depth: 0 fg: 4.00 ( 0.00) FG: 19.61 ( 0.49) - * - * Integers are actually represented either in binary notation over - * 31-bit words (signed, using two's complement), or in RNS, modulo - * many small primes. These small primes are close to, but slightly - * lower than, 2^31. Use of RNS loses less than two bits, even for - * the largest values. - * - * IMPORTANT: if these values are modified, then the temporary buffer - * sizes (FALCON_KEYGEN_TEMP_*, in inner.h) must be recomputed - * accordingly. 
- */ - -static const size_t MAX_BL_SMALL[] = { - 1, 1, 2, 2, 4, 7, 14, 27, 53, 106, 209 -}; - -static const size_t MAX_BL_LARGE[] = { - 2, 2, 5, 7, 12, 21, 40, 78, 157, 308 -}; - -/* - * Average and standard deviation for the maximum size (in bits) of - * coefficients of (f,g), depending on depth. These values are used - * to compute bounds for Babai's reduction. - */ -static const struct { - int avg; - int std; -} BITLENGTH[] = { - { 4, 0 }, - { 11, 1 }, - { 24, 1 }, - { 50, 1 }, - { 102, 1 }, - { 202, 2 }, - { 401, 4 }, - { 794, 5 }, - { 1577, 8 }, - { 3138, 13 }, - { 6308, 25 } -}; - -/* - * Minimal recursion depth at which we rebuild intermediate values - * when reconstructing f and g. - */ -#define DEPTH_INT_FG 4 - -/* - * Compute squared norm of a short vector. Returned value is saturated to - * 2^32-1 if it is not lower than 2^31. - */ -static uint32_t -poly_small_sqnorm(const int8_t *f, unsigned logn) -{ - size_t n, u; - uint32_t s, ng; - - n = MKN(logn); - s = 0; - ng = 0; - for (u = 0; u < n; u ++) { - int32_t z; - - z = f[u]; - s += (uint32_t)(z * z); - ng |= s; - } - return s | -(ng >> 31); -} - -/* - * Align (upwards) the provided 'data' pointer with regards to 'base' - * so that the offset is a multiple of the size of 'fpr'. - */ -static fpr * -align_fpr(void *base, void *data) -{ - uint8_t *cb, *cd; - size_t k, km; - - cb = base; - cd = data; - k = (size_t)(cd - cb); - km = k % sizeof(fpr); - if (km) { - k += (sizeof(fpr)) - km; - } - return (fpr *)(cb + k); -} - -/* - * Align (upwards) the provided 'data' pointer with regards to 'base' - * so that the offset is a multiple of the size of 'uint32_t'. - */ -static uint32_t * -align_u32(void *base, void *data) -{ - uint8_t *cb, *cd; - size_t k, km; - - cb = base; - cd = data; - k = (size_t)(cd - cb); - km = k % sizeof(uint32_t); - if (km) { - k += (sizeof(uint32_t)) - km; - } - return (uint32_t *)(cb + k); -} - -/* - * Convert a small vector to floating point. 
- */ -static void -poly_small_to_fp(fpr *x, const int8_t *f, unsigned logn) -{ - size_t n, u; - - n = MKN(logn); - for (u = 0; u < n; u ++) { - x[u] = fpr_of(f[u]); - } -} - -/* - * Input: f,g of degree N = 2^logn; 'depth' is used only to get their - * individual length. - * - * Output: f',g' of degree N/2, with the length for 'depth+1'. - * - * Values are in RNS; input and/or output may also be in NTT. - */ -static void -make_fg_step(uint32_t *data, unsigned logn, unsigned depth, - int in_ntt, int out_ntt) -{ - size_t n, hn, u; - size_t slen, tlen; - uint32_t *fd, *gd, *fs, *gs, *gm, *igm, *t1; - const small_prime *primes; - - n = (size_t)1 << logn; - hn = n >> 1; - slen = MAX_BL_SMALL[depth]; - tlen = MAX_BL_SMALL[depth + 1]; - primes = PRIMES; - - /* - * Prepare room for the result. - */ - fd = data; - gd = fd + hn * tlen; - fs = gd + hn * tlen; - gs = fs + n * slen; - gm = gs + n * slen; - igm = gm + n; - t1 = igm + n; - memmove(fs, data, 2 * n * slen * sizeof *data); - - /* - * First slen words: we use the input values directly, and apply - * inverse NTT as we go. 
- */ - for (u = 0; u < slen; u ++) { - uint32_t p, p0i, R2; - size_t v; - uint32_t *x; - - p = primes[u].p; - p0i = modp_ninv31(p); - R2 = modp_R2(p, p0i); - modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i); - - for (v = 0, x = fs + u; v < n; v ++, x += slen) { - t1[v] = *x; - } - if (!in_ntt) { - modp_NTT2(t1, gm, logn, p, p0i); - } - for (v = 0, x = fd + u; v < hn; v ++, x += tlen) { - uint32_t w0, w1; - - w0 = t1[(v << 1) + 0]; - w1 = t1[(v << 1) + 1]; - *x = modp_montymul( - modp_montymul(w0, w1, p, p0i), R2, p, p0i); - } - if (in_ntt) { - modp_iNTT2_ext(fs + u, slen, igm, logn, p, p0i); - } - - for (v = 0, x = gs + u; v < n; v ++, x += slen) { - t1[v] = *x; - } - if (!in_ntt) { - modp_NTT2(t1, gm, logn, p, p0i); - } - for (v = 0, x = gd + u; v < hn; v ++, x += tlen) { - uint32_t w0, w1; - - w0 = t1[(v << 1) + 0]; - w1 = t1[(v << 1) + 1]; - *x = modp_montymul( - modp_montymul(w0, w1, p, p0i), R2, p, p0i); - } - if (in_ntt) { - modp_iNTT2_ext(gs + u, slen, igm, logn, p, p0i); - } - - if (!out_ntt) { - modp_iNTT2_ext(fd + u, tlen, igm, logn - 1, p, p0i); - modp_iNTT2_ext(gd + u, tlen, igm, logn - 1, p, p0i); - } - } - - /* - * Since the fs and gs words have been de-NTTized, we can use the - * CRT to rebuild the values. - */ - zint_rebuild_CRT(fs, slen, slen, n, primes, 1, gm); - zint_rebuild_CRT(gs, slen, slen, n, primes, 1, gm); - - /* - * Remaining words: use modular reductions to extract the values. 
- */ - for (u = slen; u < tlen; u ++) { - uint32_t p, p0i, R2, Rx; - size_t v; - uint32_t *x; - - p = primes[u].p; - p0i = modp_ninv31(p); - R2 = modp_R2(p, p0i); - Rx = modp_Rx((unsigned)slen, p, p0i, R2); - modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i); - for (v = 0, x = fs; v < n; v ++, x += slen) { - t1[v] = zint_mod_small_signed(x, slen, p, p0i, R2, Rx); - } - modp_NTT2(t1, gm, logn, p, p0i); - for (v = 0, x = fd + u; v < hn; v ++, x += tlen) { - uint32_t w0, w1; - - w0 = t1[(v << 1) + 0]; - w1 = t1[(v << 1) + 1]; - *x = modp_montymul( - modp_montymul(w0, w1, p, p0i), R2, p, p0i); - } - for (v = 0, x = gs; v < n; v ++, x += slen) { - t1[v] = zint_mod_small_signed(x, slen, p, p0i, R2, Rx); - } - modp_NTT2(t1, gm, logn, p, p0i); - for (v = 0, x = gd + u; v < hn; v ++, x += tlen) { - uint32_t w0, w1; - - w0 = t1[(v << 1) + 0]; - w1 = t1[(v << 1) + 1]; - *x = modp_montymul( - modp_montymul(w0, w1, p, p0i), R2, p, p0i); - } - - if (!out_ntt) { - modp_iNTT2_ext(fd + u, tlen, igm, logn - 1, p, p0i); - modp_iNTT2_ext(gd + u, tlen, igm, logn - 1, p, p0i); - } - } -} - -/* - * Compute f and g at a specific depth, in RNS notation. - * - * Returned values are stored in the data[] array, at slen words per integer. - * - * Conditions: - * 0 <= depth <= logn - * - * Space use in data[]: enough room for any two successive values (f', g', - * f and g). 
- */ -static void -make_fg(uint32_t *data, const int8_t *f, const int8_t *g, - unsigned logn, unsigned depth, int out_ntt) -{ - size_t n, u; - uint32_t *ft, *gt, p0; - unsigned d; - const small_prime *primes; - - n = MKN(logn); - ft = data; - gt = ft + n; - primes = PRIMES; - p0 = primes[0].p; - for (u = 0; u < n; u ++) { - ft[u] = modp_set(f[u], p0); - gt[u] = modp_set(g[u], p0); - } - - if (depth == 0 && out_ntt) { - uint32_t *gm, *igm; - uint32_t p, p0i; - - p = primes[0].p; - p0i = modp_ninv31(p); - gm = gt + n; - igm = gm + MKN(logn); - modp_mkgm2(gm, igm, logn, primes[0].g, p, p0i); - modp_NTT2(ft, gm, logn, p, p0i); - modp_NTT2(gt, gm, logn, p, p0i); - return; - } - - for (d = 0; d < depth; d ++) { - make_fg_step(data, logn - d, d, - d != 0, (d + 1) < depth || out_ntt); - } -} - -/* - * Solving the NTRU equation, deepest level: compute the resultants of - * f and g with X^N+1, and use binary GCD. The F and G values are - * returned in tmp[]. - * - * Returned value: 1 on success, 0 on error. - */ -static int -solve_NTRU_deepest(unsigned logn_top, - const int8_t *f, const int8_t *g, uint32_t *tmp) -{ - size_t len; - uint32_t *Fp, *Gp, *fp, *gp, *t1, q; - const small_prime *primes; - - len = MAX_BL_SMALL[logn_top]; - primes = PRIMES; - - Fp = tmp; - Gp = Fp + len; - fp = Gp + len; - gp = fp + len; - t1 = gp + len; - - make_fg(fp, f, g, logn_top, logn_top, 0); - - /* - * We use the CRT to rebuild the resultants as big integers. - * There are two such big integers. The resultants are always - * nonnegative. - */ - zint_rebuild_CRT(fp, len, len, 2, primes, 0, t1); - - /* - * Apply the binary GCD. The zint_bezout() function works only - * if both inputs are odd. - * - * We can test on the result and return 0 because that would - * imply failure of the NTRU solving equation, and the (f,g) - * values will be abandoned in that case. - */ - if (!zint_bezout(Gp, Fp, fp, gp, len, t1)) { - return 0; - } - - /* - * Multiply the two values by the target value q. 
Values must - * fit in the destination arrays. - * We can again test on the returned words: a non-zero output - * of zint_mul_small() means that we exceeded our array - * capacity, and that implies failure and rejection of (f,g). - */ - q = 12289; - if (zint_mul_small(Fp, len, q) != 0 - || zint_mul_small(Gp, len, q) != 0) - { - return 0; - } - - return 1; -} - -/* - * Solving the NTRU equation, intermediate level. Upon entry, the F and G - * from the previous level should be in the tmp[] array. - * This function MAY be invoked for the top-level (in which case depth = 0). - * - * Returned value: 1 on success, 0 on error. - */ -static int -solve_NTRU_intermediate(unsigned logn_top, - const int8_t *f, const int8_t *g, unsigned depth, uint32_t *tmp) -{ - /* - * In this function, 'logn' is the log2 of the degree for - * this step. If N = 2^logn, then: - * - the F and G values already in fk->tmp (from the deeper - * levels) have degree N/2; - * - this function should return F and G of degree N. - */ - unsigned logn; - size_t n, hn, slen, dlen, llen, rlen, FGlen, u; - uint32_t *Fd, *Gd, *Ft, *Gt, *ft, *gt, *t1; - fpr *rt1, *rt2, *rt3, *rt4, *rt5; - int scale_fg, minbl_fg, maxbl_fg, maxbl_FG, scale_k; - uint32_t *x, *y; - int32_t *k; - const small_prime *primes; - - logn = logn_top - depth; - n = (size_t)1 << logn; - hn = n >> 1; - - /* - * slen = size for our input f and g; also size of the reduced - * F and G we return (degree N) - * - * dlen = size of the F and G obtained from the deeper level - * (degree N/2 or N/3) - * - * llen = size for intermediary F and G before reduction (degree N) - * - * We build our non-reduced F and G as two independent halves each, - * of degree N/2 (F = F0 + X*F1, G = G0 + X*G1). - */ - slen = MAX_BL_SMALL[depth]; - dlen = MAX_BL_SMALL[depth + 1]; - llen = MAX_BL_LARGE[depth]; - primes = PRIMES; - - /* - * Fd and Gd are the F and G from the deeper level. 
- */ - Fd = tmp; - Gd = Fd + dlen * hn; - - /* - * Compute the input f and g for this level. Note that we get f - * and g in RNS + NTT representation. - */ - ft = Gd + dlen * hn; - make_fg(ft, f, g, logn_top, depth, 1); - - /* - * Move the newly computed f and g to make room for our candidate - * F and G (unreduced). - */ - Ft = tmp; - Gt = Ft + n * llen; - t1 = Gt + n * llen; - memmove(t1, ft, 2 * n * slen * sizeof *ft); - ft = t1; - gt = ft + slen * n; - t1 = gt + slen * n; - - /* - * Move Fd and Gd _after_ f and g. - */ - memmove(t1, Fd, 2 * hn * dlen * sizeof *Fd); - Fd = t1; - Gd = Fd + hn * dlen; - - /* - * We reduce Fd and Gd modulo all the small primes we will need, - * and store the values in Ft and Gt (only n/2 values in each). - */ - for (u = 0; u < llen; u ++) { - uint32_t p, p0i, R2, Rx; - size_t v; - uint32_t *xs, *ys, *xd, *yd; - - p = primes[u].p; - p0i = modp_ninv31(p); - R2 = modp_R2(p, p0i); - Rx = modp_Rx((unsigned)dlen, p, p0i, R2); - for (v = 0, xs = Fd, ys = Gd, xd = Ft + u, yd = Gt + u; - v < hn; - v ++, xs += dlen, ys += dlen, xd += llen, yd += llen) - { - *xd = zint_mod_small_signed(xs, dlen, p, p0i, R2, Rx); - *yd = zint_mod_small_signed(ys, dlen, p, p0i, R2, Rx); - } - } - - /* - * We do not need Fd and Gd after that point. - */ - - /* - * Compute our F and G modulo sufficiently many small primes. - */ - for (u = 0; u < llen; u ++) { - uint32_t p, p0i, R2; - uint32_t *gm, *igm, *fx, *gx, *Fp, *Gp; - size_t v; - - /* - * All computations are done modulo p. - */ - p = primes[u].p; - p0i = modp_ninv31(p); - R2 = modp_R2(p, p0i); - - /* - * If we processed slen words, then f and g have been - * de-NTTized, and are in RNS; we can rebuild them. 
- */ - if (u == slen) { - zint_rebuild_CRT(ft, slen, slen, n, primes, 1, t1); - zint_rebuild_CRT(gt, slen, slen, n, primes, 1, t1); - } - - gm = t1; - igm = gm + n; - fx = igm + n; - gx = fx + n; - - modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i); - - if (u < slen) { - for (v = 0, x = ft + u, y = gt + u; - v < n; v ++, x += slen, y += slen) - { - fx[v] = *x; - gx[v] = *y; - } - modp_iNTT2_ext(ft + u, slen, igm, logn, p, p0i); - modp_iNTT2_ext(gt + u, slen, igm, logn, p, p0i); - } else { - uint32_t Rx; - - Rx = modp_Rx((unsigned)slen, p, p0i, R2); - for (v = 0, x = ft, y = gt; - v < n; v ++, x += slen, y += slen) - { - fx[v] = zint_mod_small_signed(x, slen, - p, p0i, R2, Rx); - gx[v] = zint_mod_small_signed(y, slen, - p, p0i, R2, Rx); - } - modp_NTT2(fx, gm, logn, p, p0i); - modp_NTT2(gx, gm, logn, p, p0i); - } - - /* - * Get F' and G' modulo p and in NTT representation - * (they have degree n/2). These values were computed in - * a previous step, and stored in Ft and Gt. - */ - Fp = gx + n; - Gp = Fp + hn; - for (v = 0, x = Ft + u, y = Gt + u; - v < hn; v ++, x += llen, y += llen) - { - Fp[v] = *x; - Gp[v] = *y; - } - modp_NTT2(Fp, gm, logn - 1, p, p0i); - modp_NTT2(Gp, gm, logn - 1, p, p0i); - - /* - * Compute our F and G modulo p. - * - * General case: - * - * we divide degree by d = 2 or 3 - * f'(x^d) = N(f)(x^d) = f * adj(f) - * g'(x^d) = N(g)(x^d) = g * adj(g) - * f'*G' - g'*F' = q - * F = F'(x^d) * adj(g) - * G = G'(x^d) * adj(f) - * - * We compute things in the NTT. We group roots of phi - * such that all roots x in a group share the same x^d. - * If the roots in a group are x_1, x_2... x_d, then: - * - * N(f)(x_1^d) = f(x_1)*f(x_2)*...*f(x_d) - * - * Thus, we have: - * - * G(x_1) = f(x_2)*f(x_3)*...*f(x_d)*G'(x_1^d) - * G(x_2) = f(x_1)*f(x_3)*...*f(x_d)*G'(x_1^d) - * ... - * G(x_d) = f(x_1)*f(x_2)*...*f(x_{d-1})*G'(x_1^d) - * - * In all cases, we can thus compute F and G in NTT - * representation by a few simple multiplications. 
- * Moreover, in our chosen NTT representation, roots - * from the same group are consecutive in RAM. - */ - for (v = 0, x = Ft + u, y = Gt + u; v < hn; - v ++, x += (llen << 1), y += (llen << 1)) - { - uint32_t ftA, ftB, gtA, gtB; - uint32_t mFp, mGp; - - ftA = fx[(v << 1) + 0]; - ftB = fx[(v << 1) + 1]; - gtA = gx[(v << 1) + 0]; - gtB = gx[(v << 1) + 1]; - mFp = modp_montymul(Fp[v], R2, p, p0i); - mGp = modp_montymul(Gp[v], R2, p, p0i); - x[0] = modp_montymul(gtB, mFp, p, p0i); - x[llen] = modp_montymul(gtA, mFp, p, p0i); - y[0] = modp_montymul(ftB, mGp, p, p0i); - y[llen] = modp_montymul(ftA, mGp, p, p0i); - } - modp_iNTT2_ext(Ft + u, llen, igm, logn, p, p0i); - modp_iNTT2_ext(Gt + u, llen, igm, logn, p, p0i); - } - - /* - * Rebuild F and G with the CRT. - */ - zint_rebuild_CRT(Ft, llen, llen, n, primes, 1, t1); - zint_rebuild_CRT(Gt, llen, llen, n, primes, 1, t1); - - /* - * At that point, Ft, Gt, ft and gt are consecutive in RAM (in that - * order). - */ - - /* - * Apply Babai reduction to bring back F and G to size slen. - * - * We use the FFT to compute successive approximations of the - * reduction coefficient. We first isolate the top bits of - * the coefficients of f and g, and convert them to floating - * point; with the FFT, we compute adj(f), adj(g), and - * 1/(f*adj(f)+g*adj(g)). - * - * Then, we repeatedly apply the following: - * - * - Get the top bits of the coefficients of F and G into - * floating point, and use the FFT to compute: - * (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g)) - * - * - Convert back that value into normal representation, and - * round it to the nearest integers, yielding a polynomial k. - * Proper scaling is applied to f, g, F and G so that the - * coefficients fit on 32 bits (signed). - * - * - Subtract k*f from F and k*g from G. - * - * Under normal conditions, this process reduces the size of F - * and G by some bits at each iteration. 
For constant-time - * operation, we do not want to measure the actual length of - * F and G; instead, we do the following: - * - * - f and g are converted to floating-point, with some scaling - * if necessary to keep values in the representable range. - * - * - For each iteration, we _assume_ a maximum size for F and G, - * and use the values at that size. If we overreach, then - * we get zeros, which is harmless: the resulting coefficients - * of k will be 0 and the value won't be reduced. - * - * - We conservatively assume that F and G will be reduced by - * at least 25 bits at each iteration. - * - * Even when reaching the bottom of the reduction, reduction - * coefficient will remain low. If it goes out-of-range, then - * something wrong occurred and the whole NTRU solving fails. - */ - - /* - * Memory layout: - * - We need to compute and keep adj(f), adj(g), and - * 1/(f*adj(f)+g*adj(g)) (sizes N, N and N/2 fp numbers, - * respectively). - * - At each iteration we need two extra fp buffer (N fp values), - * and produce a k (N 32-bit words). k will be shared with one - * of the fp buffers. - * - To compute k*f and k*g efficiently (with the NTT), we need - * some extra room; we reuse the space of the temporary buffers. - * - * Arrays of 'fpr' are obtained from the temporary array itself. - * We ensure that the base is at a properly aligned offset (the - * source array tmp[] is supposed to be already aligned). - */ - - rt3 = align_fpr(tmp, t1); - rt4 = rt3 + n; - rt5 = rt4 + n; - rt1 = rt5 + (n >> 1); - k = (int32_t *)align_u32(tmp, rt1); - rt2 = align_fpr(tmp, k + n); - if (rt2 < (rt1 + n)) { - rt2 = rt1 + n; - } - t1 = (uint32_t *)k + n; - - /* - * Get f and g into rt3 and rt4 as floating-point approximations. - * - * We need to "scale down" the floating-point representation of - * coefficients when they are too big. We want to keep the value - * below 2^310 or so. Thus, when values are larger than 10 words, - * we consider only the top 10 words. 
Array lengths have been - * computed so that average maximum length will fall in the - * middle or the upper half of these top 10 words. - */ - rlen = (slen > 10) ? 10 : slen; - poly_big_to_fp(rt3, ft + slen - rlen, rlen, slen, logn); - poly_big_to_fp(rt4, gt + slen - rlen, rlen, slen, logn); - - /* - * Values in rt3 and rt4 are downscaled by 2^(scale_fg). - */ - scale_fg = 31 * (int)(slen - rlen); - - /* - * Estimated boundaries for the maximum size (in bits) of the - * coefficients of (f,g). We use the measured average, and - * allow for a deviation of at most six times the standard - * deviation. - */ - minbl_fg = BITLENGTH[depth].avg - 6 * BITLENGTH[depth].std; - maxbl_fg = BITLENGTH[depth].avg + 6 * BITLENGTH[depth].std; - - /* - * Compute 1/(f*adj(f)+g*adj(g)) in rt5. We also keep adj(f) - * and adj(g) in rt3 and rt4, respectively. - */ - Zf(FFT)(rt3, logn); - Zf(FFT)(rt4, logn); - Zf(poly_invnorm2_fft)(rt5, rt3, rt4, logn); - Zf(poly_adj_fft)(rt3, logn); - Zf(poly_adj_fft)(rt4, logn); - - /* - * Reduce F and G repeatedly. - * - * The expected maximum bit length of coefficients of F and G - * is kept in maxbl_FG, with the corresponding word length in - * FGlen. - */ - FGlen = llen; - maxbl_FG = 31 * (int)llen; - - /* - * Each reduction operation computes the reduction polynomial - * "k". We need that polynomial to have coefficients that fit - * on 32-bit signed integers, with some scaling; thus, we use - * a descending sequence of scaling values, down to zero. - * - * The size of the coefficients of k is (roughly) the difference - * between the size of the coefficients of (F,G) and the size - * of the coefficients of (f,g). Thus, the maximum size of the - * coefficients of k is, at the start, maxbl_FG - minbl_fg; - * this is our starting scale value for k. - * - * We need to estimate the size of (F,G) during the execution of - * the algorithm; we are allowed some overestimation but not too - * much (poly_big_to_fp() uses a 310-bit window). 
Generally - * speaking, after applying a reduction with k scaled to - * scale_k, the size of (F,G) will be size(f,g) + scale_k + dd, - * where 'dd' is a few bits to account for the fact that the - * reduction is never perfect (intuitively, dd is on the order - * of sqrt(N), so at most 5 bits; we here allow for 10 extra - * bits). - * - * The size of (f,g) is not known exactly, but maxbl_fg is an - * upper bound. - */ - scale_k = maxbl_FG - minbl_fg; - - for (;;) { - int scale_FG, dc, new_maxbl_FG; - uint32_t scl, sch; - fpr pdc, pt; - - /* - * Convert current F and G into floating-point. We apply - * scaling if the current length is more than 10 words. - */ - rlen = (FGlen > 10) ? 10 : FGlen; - scale_FG = 31 * (int)(FGlen - rlen); - poly_big_to_fp(rt1, Ft + FGlen - rlen, rlen, llen, logn); - poly_big_to_fp(rt2, Gt + FGlen - rlen, rlen, llen, logn); - - /* - * Compute (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g)) in rt2. - */ - Zf(FFT)(rt1, logn); - Zf(FFT)(rt2, logn); - Zf(poly_mul_fft)(rt1, rt3, logn); - Zf(poly_mul_fft)(rt2, rt4, logn); - Zf(poly_add)(rt2, rt1, logn); - Zf(poly_mul_autoadj_fft)(rt2, rt5, logn); - Zf(iFFT)(rt2, logn); - - /* - * (f,g) are scaled by 'scale_fg', meaning that the - * numbers in rt3/rt4 should be multiplied by 2^(scale_fg) - * to have their true mathematical value. - * - * (F,G) are similarly scaled by 'scale_FG'. Therefore, - * the value we computed in rt2 is scaled by - * 'scale_FG-scale_fg'. - * - * We want that value to be scaled by 'scale_k', hence we - * apply a corrective scaling. After scaling, the values - * should fit in -2^31-1..+2^31-1. - */ - dc = scale_k - scale_FG + scale_fg; - - /* - * We will need to multiply values by 2^(-dc). The value - * 'dc' is not secret, so we can compute 2^(-dc) with a - * non-constant-time process. - * (We could use ldexp(), but we prefer to avoid any - * dependency on libm. When using FP emulation, we could - * use our fpr_ldexp(), which is constant-time.) 
- */ - if (dc < 0) { - dc = -dc; - pt = fpr_two; - } else { - pt = fpr_onehalf; - } - pdc = fpr_one; - while (dc != 0) { - if ((dc & 1) != 0) { - pdc = fpr_mul(pdc, pt); - } - dc >>= 1; - pt = fpr_sqr(pt); - } - - for (u = 0; u < n; u ++) { - fpr xv; - - xv = fpr_mul(rt2[u], pdc); - - /* - * Sometimes the values can be out-of-bounds if - * the algorithm fails; we must not call - * fpr_rint() (and cast to int32_t) if the value - * is not in-bounds. Note that the test does not - * break constant-time discipline, since any - * failure here implies that we discard the current - * secret key (f,g). - */ - if (!fpr_lt(fpr_mtwo31m1, xv) - || !fpr_lt(xv, fpr_ptwo31m1)) - { - return 0; - } - k[u] = (int32_t)fpr_rint(xv); - } - - /* - * Values in k[] are integers. They really are scaled - * down by maxbl_FG - minbl_fg bits. - * - * If we are at low depth, then we use the NTT to - * compute k*f and k*g. - */ - sch = (uint32_t)(scale_k / 31); - scl = (uint32_t)(scale_k % 31); - if (depth <= DEPTH_INT_FG) { - poly_sub_scaled_ntt(Ft, FGlen, llen, ft, slen, slen, - k, sch, scl, logn, t1); - poly_sub_scaled_ntt(Gt, FGlen, llen, gt, slen, slen, - k, sch, scl, logn, t1); - } else { - poly_sub_scaled(Ft, FGlen, llen, ft, slen, slen, - k, sch, scl, logn); - poly_sub_scaled(Gt, FGlen, llen, gt, slen, slen, - k, sch, scl, logn); - } - - /* - * We compute the new maximum size of (F,G), assuming that - * (f,g) has _maximal_ length (i.e. that reduction is - * "late" instead of "early". We also adjust FGlen - * accordingly. - */ - new_maxbl_FG = scale_k + maxbl_fg + 10; - if (new_maxbl_FG < maxbl_FG) { - maxbl_FG = new_maxbl_FG; - if ((int)FGlen * 31 >= maxbl_FG + 31) { - FGlen --; - } - } - - /* - * We suppose that scaling down achieves a reduction by - * at least 25 bits per iteration. We stop when we have - * done the loop with an unscaled k. 
- */ - if (scale_k <= 0) { - break; - } - scale_k -= 25; - if (scale_k < 0) { - scale_k = 0; - } - } - - /* - * If (F,G) length was lowered below 'slen', then we must take - * care to re-extend the sign. - */ - if (FGlen < slen) { - for (u = 0; u < n; u ++, Ft += llen, Gt += llen) { - size_t v; - uint32_t sw; - - sw = -(Ft[FGlen - 1] >> 30) >> 1; - for (v = FGlen; v < slen; v ++) { - Ft[v] = sw; - } - sw = -(Gt[FGlen - 1] >> 30) >> 1; - for (v = FGlen; v < slen; v ++) { - Gt[v] = sw; - } - } - } - - /* - * Compress encoding of all values to 'slen' words (this is the - * expected output format). - */ - for (u = 0, x = tmp, y = tmp; - u < (n << 1); u ++, x += slen, y += llen) - { - memmove(x, y, slen * sizeof *y); - } - return 1; -} - -/* - * Solving the NTRU equation, binary case, depth = 1. Upon entry, the - * F and G from the previous level should be in the tmp[] array. - * - * Returned value: 1 on success, 0 on error. - */ -static int -solve_NTRU_binary_depth1(unsigned logn_top, - const int8_t *f, const int8_t *g, uint32_t *tmp) -{ - /* - * The first half of this function is a copy of the corresponding - * part in solve_NTRU_intermediate(), for the reconstruction of - * the unreduced F and G. The second half (Babai reduction) is - * done differently, because the unreduced F and G fit in 53 bits - * of precision, allowing a much simpler process with lower RAM - * usage. 
- */ - unsigned depth, logn; - size_t n_top, n, hn, slen, dlen, llen, u; - uint32_t *Fd, *Gd, *Ft, *Gt, *ft, *gt, *t1; - fpr *rt1, *rt2, *rt3, *rt4, *rt5, *rt6; - uint32_t *x, *y; - - depth = 1; - n_top = (size_t)1 << logn_top; - logn = logn_top - depth; - n = (size_t)1 << logn; - hn = n >> 1; - - /* - * Equations are: - * - * f' = f0^2 - X^2*f1^2 - * g' = g0^2 - X^2*g1^2 - * F' and G' are a solution to f'G' - g'F' = q (from deeper levels) - * F = F'*(g0 - X*g1) - * G = G'*(f0 - X*f1) - * - * f0, f1, g0, g1, f', g', F' and G' are all "compressed" to - * degree N/2 (their odd-indexed coefficients are all zero). - */ - - /* - * slen = size for our input f and g; also size of the reduced - * F and G we return (degree N) - * - * dlen = size of the F and G obtained from the deeper level - * (degree N/2) - * - * llen = size for intermediary F and G before reduction (degree N) - * - * We build our non-reduced F and G as two independent halves each, - * of degree N/2 (F = F0 + X*F1, G = G0 + X*G1). - */ - slen = MAX_BL_SMALL[depth]; - dlen = MAX_BL_SMALL[depth + 1]; - llen = MAX_BL_LARGE[depth]; - - /* - * Fd and Gd are the F and G from the deeper level. Ft and Gt - * are the destination arrays for the unreduced F and G. - */ - Fd = tmp; - Gd = Fd + dlen * hn; - Ft = Gd + dlen * hn; - Gt = Ft + llen * n; - - /* - * We reduce Fd and Gd modulo all the small primes we will need, - * and store the values in Ft and Gt. - */ - for (u = 0; u < llen; u ++) { - uint32_t p, p0i, R2, Rx; - size_t v; - uint32_t *xs, *ys, *xd, *yd; - - p = PRIMES[u].p; - p0i = modp_ninv31(p); - R2 = modp_R2(p, p0i); - Rx = modp_Rx((unsigned)dlen, p, p0i, R2); - for (v = 0, xs = Fd, ys = Gd, xd = Ft + u, yd = Gt + u; - v < hn; - v ++, xs += dlen, ys += dlen, xd += llen, yd += llen) - { - *xd = zint_mod_small_signed(xs, dlen, p, p0i, R2, Rx); - *yd = zint_mod_small_signed(ys, dlen, p, p0i, R2, Rx); - } - } - - /* - * Now Fd and Gd are not needed anymore; we can squeeze them out. 
- */ - memmove(tmp, Ft, llen * n * sizeof(uint32_t)); - Ft = tmp; - memmove(Ft + llen * n, Gt, llen * n * sizeof(uint32_t)); - Gt = Ft + llen * n; - ft = Gt + llen * n; - gt = ft + slen * n; - - t1 = gt + slen * n; - - /* - * Compute our F and G modulo sufficiently many small primes. - */ - for (u = 0; u < llen; u ++) { - uint32_t p, p0i, R2; - uint32_t *gm, *igm, *fx, *gx, *Fp, *Gp; - unsigned e; - size_t v; - - /* - * All computations are done modulo p. - */ - p = PRIMES[u].p; - p0i = modp_ninv31(p); - R2 = modp_R2(p, p0i); - - /* - * We recompute things from the source f and g, of full - * degree. However, we will need only the n first elements - * of the inverse NTT table (igm); the call to modp_mkgm() - * below will fill n_top elements in igm[] (thus overflowing - * into fx[]) but later code will overwrite these extra - * elements. - */ - gm = t1; - igm = gm + n_top; - fx = igm + n; - gx = fx + n_top; - modp_mkgm2(gm, igm, logn_top, PRIMES[u].g, p, p0i); - - /* - * Set ft and gt to f and g modulo p, respectively. - */ - for (v = 0; v < n_top; v ++) { - fx[v] = modp_set(f[v], p); - gx[v] = modp_set(g[v], p); - } - - /* - * Convert to NTT and compute our f and g. - */ - modp_NTT2(fx, gm, logn_top, p, p0i); - modp_NTT2(gx, gm, logn_top, p, p0i); - for (e = logn_top; e > logn; e --) { - modp_poly_rec_res(fx, e, p, p0i, R2); - modp_poly_rec_res(gx, e, p, p0i, R2); - } - - /* - * From that point onward, we only need tables for - * degree n, so we can save some space. - */ - if (depth > 0) { /* always true */ - memmove(gm + n, igm, n * sizeof *igm); - igm = gm + n; - memmove(igm + n, fx, n * sizeof *ft); - fx = igm + n; - memmove(fx + n, gx, n * sizeof *gt); - gx = fx + n; - } - - /* - * Get F' and G' modulo p and in NTT representation - * (they have degree n/2). These values were computed - * in a previous step, and stored in Ft and Gt. 
- */ - Fp = gx + n; - Gp = Fp + hn; - for (v = 0, x = Ft + u, y = Gt + u; - v < hn; v ++, x += llen, y += llen) - { - Fp[v] = *x; - Gp[v] = *y; - } - modp_NTT2(Fp, gm, logn - 1, p, p0i); - modp_NTT2(Gp, gm, logn - 1, p, p0i); - - /* - * Compute our F and G modulo p. - * - * Equations are: - * - * f'(x^2) = N(f)(x^2) = f * adj(f) - * g'(x^2) = N(g)(x^2) = g * adj(g) - * - * f'*G' - g'*F' = q - * - * F = F'(x^2) * adj(g) - * G = G'(x^2) * adj(f) - * - * The NTT representation of f is f(w) for all w which - * are roots of phi. In the binary case, as well as in - * the ternary case for all depth except the deepest, - * these roots can be grouped in pairs (w,-w), and we - * then have: - * - * f(w) = adj(f)(-w) - * f(-w) = adj(f)(w) - * - * and w^2 is then a root for phi at the half-degree. - * - * At the deepest level in the ternary case, this still - * holds, in the following sense: the roots of x^2-x+1 - * are (w,-w^2) (for w^3 = -1, and w != -1), and we - * have: - * - * f(w) = adj(f)(-w^2) - * f(-w^2) = adj(f)(w) - * - * In all case, we can thus compute F and G in NTT - * representation by a few simple multiplications. - * Moreover, the two roots for each pair are consecutive - * in our bit-reversal encoding. - */ - for (v = 0, x = Ft + u, y = Gt + u; - v < hn; v ++, x += (llen << 1), y += (llen << 1)) - { - uint32_t ftA, ftB, gtA, gtB; - uint32_t mFp, mGp; - - ftA = fx[(v << 1) + 0]; - ftB = fx[(v << 1) + 1]; - gtA = gx[(v << 1) + 0]; - gtB = gx[(v << 1) + 1]; - mFp = modp_montymul(Fp[v], R2, p, p0i); - mGp = modp_montymul(Gp[v], R2, p, p0i); - x[0] = modp_montymul(gtB, mFp, p, p0i); - x[llen] = modp_montymul(gtA, mFp, p, p0i); - y[0] = modp_montymul(ftB, mGp, p, p0i); - y[llen] = modp_montymul(ftA, mGp, p, p0i); - } - modp_iNTT2_ext(Ft + u, llen, igm, logn, p, p0i); - modp_iNTT2_ext(Gt + u, llen, igm, logn, p, p0i); - - /* - * Also save ft and gt (only up to size slen). 
- */ - if (u < slen) { - modp_iNTT2(fx, igm, logn, p, p0i); - modp_iNTT2(gx, igm, logn, p, p0i); - for (v = 0, x = ft + u, y = gt + u; - v < n; v ++, x += slen, y += slen) - { - *x = fx[v]; - *y = gx[v]; - } - } - } - - /* - * Rebuild f, g, F and G with the CRT. Note that the elements of F - * and G are consecutive, and thus can be rebuilt in a single - * loop; similarly, the elements of f and g are consecutive. - */ - zint_rebuild_CRT(Ft, llen, llen, n << 1, PRIMES, 1, t1); - zint_rebuild_CRT(ft, slen, slen, n << 1, PRIMES, 1, t1); - - /* - * Here starts the Babai reduction, specialized for depth = 1. - * - * Candidates F and G (from Ft and Gt), and base f and g (ft and gt), - * are converted to floating point. There is no scaling, and a - * single pass is sufficient. - */ - - /* - * Convert F and G into floating point (rt1 and rt2). - */ - rt1 = align_fpr(tmp, gt + slen * n); - rt2 = rt1 + n; - poly_big_to_fp(rt1, Ft, llen, llen, logn); - poly_big_to_fp(rt2, Gt, llen, llen, logn); - - /* - * Integer representation of F and G is no longer needed, we - * can remove it. - */ - memmove(tmp, ft, 2 * slen * n * sizeof *ft); - ft = tmp; - gt = ft + slen * n; - rt3 = align_fpr(tmp, gt + slen * n); - memmove(rt3, rt1, 2 * n * sizeof *rt1); - rt1 = rt3; - rt2 = rt1 + n; - rt3 = rt2 + n; - rt4 = rt3 + n; - - /* - * Convert f and g into floating point (rt3 and rt4). - */ - poly_big_to_fp(rt3, ft, slen, slen, logn); - poly_big_to_fp(rt4, gt, slen, slen, logn); - - /* - * Remove unneeded ft and gt. - */ - memmove(tmp, rt1, 4 * n * sizeof *rt1); - rt1 = (fpr *)tmp; - rt2 = rt1 + n; - rt3 = rt2 + n; - rt4 = rt3 + n; - - /* - * We now have: - * rt1 = F - * rt2 = G - * rt3 = f - * rt4 = g - * in that order in RAM. We convert all of them to FFT. - */ - Zf(FFT)(rt1, logn); - Zf(FFT)(rt2, logn); - Zf(FFT)(rt3, logn); - Zf(FFT)(rt4, logn); - - /* - * Compute: - * rt5 = F*adj(f) + G*adj(g) - * rt6 = 1 / (f*adj(f) + g*adj(g)) - * (Note that rt6 is half-length.) 
- */ - rt5 = rt4 + n; - rt6 = rt5 + n; - Zf(poly_add_muladj_fft)(rt5, rt1, rt2, rt3, rt4, logn); - Zf(poly_invnorm2_fft)(rt6, rt3, rt4, logn); - - /* - * Compute: - * rt5 = (F*adj(f)+G*adj(g)) / (f*adj(f)+g*adj(g)) - */ - Zf(poly_mul_autoadj_fft)(rt5, rt6, logn); - - /* - * Compute k as the rounded version of rt5. Check that none of - * the values is larger than 2^63-1 (in absolute value) - * because that would make the fpr_rint() do something undefined; - * note that any out-of-bounds value here implies a failure and - * (f,g) will be discarded, so we can make a simple test. - */ - Zf(iFFT)(rt5, logn); - for (u = 0; u < n; u ++) { - fpr z; - - z = rt5[u]; - if (!fpr_lt(z, fpr_ptwo63m1) || !fpr_lt(fpr_mtwo63m1, z)) { - return 0; - } - rt5[u] = fpr_of(fpr_rint(z)); - } - Zf(FFT)(rt5, logn); - - /* - * Subtract k*f from F, and k*g from G. - */ - Zf(poly_mul_fft)(rt3, rt5, logn); - Zf(poly_mul_fft)(rt4, rt5, logn); - Zf(poly_sub)(rt1, rt3, logn); - Zf(poly_sub)(rt2, rt4, logn); - Zf(iFFT)(rt1, logn); - Zf(iFFT)(rt2, logn); - - /* - * Convert back F and G to integers, and return. - */ - Ft = tmp; - Gt = Ft + n; - rt3 = align_fpr(tmp, Gt + n); - memmove(rt3, rt1, 2 * n * sizeof *rt1); - rt1 = rt3; - rt2 = rt1 + n; - for (u = 0; u < n; u ++) { - Ft[u] = (uint32_t)fpr_rint(rt1[u]); - Gt[u] = (uint32_t)fpr_rint(rt2[u]); - } - - return 1; -} - -/* - * Solving the NTRU equation, top level. Upon entry, the F and G - * from the previous level should be in the tmp[] array. - * - * Returned value: 1 on success, 0 on error. 
- */ -static int -solve_NTRU_binary_depth0(unsigned logn, - const int8_t *f, const int8_t *g, uint32_t *tmp) -{ - size_t n, hn, u; - uint32_t p, p0i, R2; - uint32_t *Fp, *Gp, *t1, *t2, *t3, *t4, *t5; - uint32_t *gm, *igm, *ft, *gt; - fpr *rt2, *rt3; - - n = (size_t)1 << logn; - hn = n >> 1; - - /* - * Equations are: - * - * f' = f0^2 - X^2*f1^2 - * g' = g0^2 - X^2*g1^2 - * F' and G' are a solution to f'G' - g'F' = q (from deeper levels) - * F = F'*(g0 - X*g1) - * G = G'*(f0 - X*f1) - * - * f0, f1, g0, g1, f', g', F' and G' are all "compressed" to - * degree N/2 (their odd-indexed coefficients are all zero). - * - * Everything should fit in 31-bit integers, hence we can just use - * the first small prime p = 2147473409. - */ - p = PRIMES[0].p; - p0i = modp_ninv31(p); - R2 = modp_R2(p, p0i); - - Fp = tmp; - Gp = Fp + hn; - ft = Gp + hn; - gt = ft + n; - gm = gt + n; - igm = gm + n; - - modp_mkgm2(gm, igm, logn, PRIMES[0].g, p, p0i); - - /* - * Convert F' anf G' in NTT representation. - */ - for (u = 0; u < hn; u ++) { - Fp[u] = modp_set(zint_one_to_plain(Fp + u), p); - Gp[u] = modp_set(zint_one_to_plain(Gp + u), p); - } - modp_NTT2(Fp, gm, logn - 1, p, p0i); - modp_NTT2(Gp, gm, logn - 1, p, p0i); - - /* - * Load f and g and convert them to NTT representation. - */ - for (u = 0; u < n; u ++) { - ft[u] = modp_set(f[u], p); - gt[u] = modp_set(g[u], p); - } - modp_NTT2(ft, gm, logn, p, p0i); - modp_NTT2(gt, gm, logn, p, p0i); - - /* - * Build the unreduced F,G in ft and gt. 
- */ - for (u = 0; u < n; u += 2) { - uint32_t ftA, ftB, gtA, gtB; - uint32_t mFp, mGp; - - ftA = ft[u + 0]; - ftB = ft[u + 1]; - gtA = gt[u + 0]; - gtB = gt[u + 1]; - mFp = modp_montymul(Fp[u >> 1], R2, p, p0i); - mGp = modp_montymul(Gp[u >> 1], R2, p, p0i); - ft[u + 0] = modp_montymul(gtB, mFp, p, p0i); - ft[u + 1] = modp_montymul(gtA, mFp, p, p0i); - gt[u + 0] = modp_montymul(ftB, mGp, p, p0i); - gt[u + 1] = modp_montymul(ftA, mGp, p, p0i); - } - modp_iNTT2(ft, igm, logn, p, p0i); - modp_iNTT2(gt, igm, logn, p, p0i); - - Gp = Fp + n; - t1 = Gp + n; - memmove(Fp, ft, 2 * n * sizeof *ft); - - /* - * We now need to apply the Babai reduction. At that point, - * we have F and G in two n-word arrays. - * - * We can compute F*adj(f)+G*adj(g) and f*adj(f)+g*adj(g) - * modulo p, using the NTT. We still move memory around in - * order to save RAM. - */ - t2 = t1 + n; - t3 = t2 + n; - t4 = t3 + n; - t5 = t4 + n; - - /* - * Compute the NTT tables in t1 and t2. We do not keep t2 - * (we'll recompute it later on). - */ - modp_mkgm2(t1, t2, logn, PRIMES[0].g, p, p0i); - - /* - * Convert F and G to NTT. - */ - modp_NTT2(Fp, t1, logn, p, p0i); - modp_NTT2(Gp, t1, logn, p, p0i); - - /* - * Load f and adj(f) in t4 and t5, and convert them to NTT - * representation. - */ - t4[0] = t5[0] = modp_set(f[0], p); - for (u = 1; u < n; u ++) { - t4[u] = modp_set(f[u], p); - t5[n - u] = modp_set(-f[u], p); - } - modp_NTT2(t4, t1, logn, p, p0i); - modp_NTT2(t5, t1, logn, p, p0i); - - /* - * Compute F*adj(f) in t2, and f*adj(f) in t3. - */ - for (u = 0; u < n; u ++) { - uint32_t w; - - w = modp_montymul(t5[u], R2, p, p0i); - t2[u] = modp_montymul(w, Fp[u], p, p0i); - t3[u] = modp_montymul(w, t4[u], p, p0i); - } - - /* - * Load g and adj(g) in t4 and t5, and convert them to NTT - * representation. 
- */ - t4[0] = t5[0] = modp_set(g[0], p); - for (u = 1; u < n; u ++) { - t4[u] = modp_set(g[u], p); - t5[n - u] = modp_set(-g[u], p); - } - modp_NTT2(t4, t1, logn, p, p0i); - modp_NTT2(t5, t1, logn, p, p0i); - - /* - * Add G*adj(g) to t2, and g*adj(g) to t3. - */ - for (u = 0; u < n; u ++) { - uint32_t w; - - w = modp_montymul(t5[u], R2, p, p0i); - t2[u] = modp_add(t2[u], - modp_montymul(w, Gp[u], p, p0i), p); - t3[u] = modp_add(t3[u], - modp_montymul(w, t4[u], p, p0i), p); - } - - /* - * Convert back t2 and t3 to normal representation (normalized - * around 0), and then - * move them to t1 and t2. We first need to recompute the - * inverse table for NTT. - */ - modp_mkgm2(t1, t4, logn, PRIMES[0].g, p, p0i); - modp_iNTT2(t2, t4, logn, p, p0i); - modp_iNTT2(t3, t4, logn, p, p0i); - for (u = 0; u < n; u ++) { - t1[u] = (uint32_t)modp_norm(t2[u], p); - t2[u] = (uint32_t)modp_norm(t3[u], p); - } - - /* - * At that point, array contents are: - * - * F (NTT representation) (Fp) - * G (NTT representation) (Gp) - * F*adj(f)+G*adj(g) (t1) - * f*adj(f)+g*adj(g) (t2) - * - * We want to divide t1 by t2. The result is not integral; it - * must be rounded. We thus need to use the FFT. - */ - - /* - * Get f*adj(f)+g*adj(g) in FFT representation. Since this - * polynomial is auto-adjoint, all its coordinates in FFT - * representation are actually real, so we can truncate off - * the imaginary parts. - */ - rt3 = align_fpr(tmp, t3); - for (u = 0; u < n; u ++) { - rt3[u] = fpr_of(((int32_t *)t2)[u]); - } - Zf(FFT)(rt3, logn); - rt2 = align_fpr(tmp, t2); - memmove(rt2, rt3, hn * sizeof *rt3); - - /* - * Convert F*adj(f)+G*adj(g) in FFT representation. - */ - rt3 = rt2 + hn; - for (u = 0; u < n; u ++) { - rt3[u] = fpr_of(((int32_t *)t1)[u]); - } - Zf(FFT)(rt3, logn); - - /* - * Compute (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g)) and get - * its rounded normal representation in t1. 
- */ - Zf(poly_div_autoadj_fft)(rt3, rt2, logn); - Zf(iFFT)(rt3, logn); - for (u = 0; u < n; u ++) { - t1[u] = modp_set((int32_t)fpr_rint(rt3[u]), p); - } - - /* - * RAM contents are now: - * - * F (NTT representation) (Fp) - * G (NTT representation) (Gp) - * k (t1) - * - * We want to compute F-k*f, and G-k*g. - */ - t2 = t1 + n; - t3 = t2 + n; - t4 = t3 + n; - t5 = t4 + n; - modp_mkgm2(t2, t3, logn, PRIMES[0].g, p, p0i); - for (u = 0; u < n; u ++) { - t4[u] = modp_set(f[u], p); - t5[u] = modp_set(g[u], p); - } - modp_NTT2(t1, t2, logn, p, p0i); - modp_NTT2(t4, t2, logn, p, p0i); - modp_NTT2(t5, t2, logn, p, p0i); - for (u = 0; u < n; u ++) { - uint32_t kw; - - kw = modp_montymul(t1[u], R2, p, p0i); - Fp[u] = modp_sub(Fp[u], - modp_montymul(kw, t4[u], p, p0i), p); - Gp[u] = modp_sub(Gp[u], - modp_montymul(kw, t5[u], p, p0i), p); - } - modp_iNTT2(Fp, t3, logn, p, p0i); - modp_iNTT2(Gp, t3, logn, p, p0i); - for (u = 0; u < n; u ++) { - Fp[u] = (uint32_t)modp_norm(Fp[u], p); - Gp[u] = (uint32_t)modp_norm(Gp[u], p); - } - - return 1; -} - -/* - * Solve the NTRU equation. Returned value is 1 on success, 0 on error. - * G can be NULL, in which case that value is computed but not returned. - * If any of the coefficients of F and G exceeds lim (in absolute value), - * then 0 is returned. - */ -static int -solve_NTRU(unsigned logn, int8_t *F, int8_t *G, - const int8_t *f, const int8_t *g, int lim, uint32_t *tmp) -{ - size_t n, u; - uint32_t *ft, *gt, *Ft, *Gt, *gm; - uint32_t p, p0i, r; - const small_prime *primes; - - n = MKN(logn); - - if (!solve_NTRU_deepest(logn, f, g, tmp)) { - return 0; - } - - /* - * For logn <= 2, we need to use solve_NTRU_intermediate() - * directly, because coefficients are a bit too large and - * do not fit the hypotheses in solve_NTRU_binary_depth0(). 
- */ - if (logn <= 2) { - unsigned depth; - - depth = logn; - while (depth -- > 0) { - if (!solve_NTRU_intermediate(logn, f, g, depth, tmp)) { - return 0; - } - } - } else { - unsigned depth; - - depth = logn; - while (depth -- > 2) { - if (!solve_NTRU_intermediate(logn, f, g, depth, tmp)) { - return 0; - } - } - if (!solve_NTRU_binary_depth1(logn, f, g, tmp)) { - return 0; - } - if (!solve_NTRU_binary_depth0(logn, f, g, tmp)) { - return 0; - } - } - - /* - * If no buffer has been provided for G, use a temporary one. - */ - if (G == NULL) { - G = (int8_t *)(tmp + 2 * n); - } - - /* - * Final F and G are in fk->tmp, one word per coefficient - * (signed value over 31 bits). - */ - if (!poly_big_to_small(F, tmp, lim, logn) - || !poly_big_to_small(G, tmp + n, lim, logn)) - { - return 0; - } - - /* - * Verify that the NTRU equation is fulfilled. Since all elements - * have short lengths, verifying modulo a small prime p works, and - * allows using the NTT. - * - * We put Gt[] first in tmp[], and process it first, so that it does - * not overlap with G[] in case we allocated it ourselves. - */ - Gt = tmp; - ft = Gt + n; - gt = ft + n; - Ft = gt + n; - gm = Ft + n; - - primes = PRIMES; - p = primes[0].p; - p0i = modp_ninv31(p); - modp_mkgm2(gm, tmp, logn, primes[0].g, p, p0i); - for (u = 0; u < n; u ++) { - Gt[u] = modp_set(G[u], p); - } - for (u = 0; u < n; u ++) { - ft[u] = modp_set(f[u], p); - gt[u] = modp_set(g[u], p); - Ft[u] = modp_set(F[u], p); - } - modp_NTT2(ft, gm, logn, p, p0i); - modp_NTT2(gt, gm, logn, p, p0i); - modp_NTT2(Ft, gm, logn, p, p0i); - modp_NTT2(Gt, gm, logn, p, p0i); - r = modp_montymul(12289, 1, p, p0i); - for (u = 0; u < n; u ++) { - uint32_t z; - - z = modp_sub(modp_montymul(ft[u], Gt[u], p, p0i), - modp_montymul(gt[u], Ft[u], p, p0i), p); - if (z != r) { - return 0; - } - } - - return 1; -} - -/* - * Generate a random polynomial with a Gaussian distribution. 
This function - * also makes sure that the resultant of the polynomial with phi is odd. - */ -static void -poly_small_mkgauss(RNG_CONTEXT *rng, int8_t *f, unsigned logn) -{ - size_t n, u; - unsigned mod2; - - n = MKN(logn); - mod2 = 0; - for (u = 0; u < n; u ++) { - int s; - - restart: - s = mkgauss(rng, logn); - - /* - * We need the coefficient to fit within -127..+127; - * realistically, this is always the case except for - * the very low degrees (N = 2 or 4), for which there - * is no real security anyway. - */ - if (s < -127 || s > 127) { - goto restart; - } - - /* - * We need the sum of all coefficients to be 1; otherwise, - * the resultant of the polynomial with X^N+1 will be even, - * and the binary GCD will fail. - */ - if (u == n - 1) { - if ((mod2 ^ (unsigned)(s & 1)) == 0) { - goto restart; - } - } else { - mod2 ^= (unsigned)(s & 1); - } - f[u] = (int8_t)s; - } -} - -/* see falcon.h */ -void -Zf(keygen)(inner_shake256_context *rng, - int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h, - unsigned logn, uint8_t *tmp) -{ - /* - * Algorithm is the following: - * - * - Generate f and g with the Gaussian distribution. - * - * - If either Res(f,phi) or Res(g,phi) is even, try again. - * - * - If ||(f,g)|| is too large, try again. - * - * - If ||B~_{f,g}|| is too large, try again. - * - * - If f is not invertible mod phi mod q, try again. - * - * - Compute h = g/f mod phi mod q. - * - * - Solve the NTRU equation fG - gF = q; if the solving fails, - * try again. Usual failure condition is when Res(f,phi) - * and Res(g,phi) are not prime to each other. 
- */ - size_t n, u; - uint16_t *h2, *tmp2; - RNG_CONTEXT *rc; -#if FALCON_KG_CHACHA20 // yyyKG_CHACHA20+1 - prng p; -#endif // yyyKG_CHACHA20- - - n = MKN(logn); -#if FALCON_KG_CHACHA20 // yyyKG_CHACHA20+1 - Zf(prng_init)(&p, rng); - rc = &p; -#else // yyyKG_CHACHA20+0 - rc = rng; -#endif // yyyKG_CHACHA20- - - /* - * We need to generate f and g randomly, until we find values - * such that the norm of (g,-f), and of the orthogonalized - * vector, are satisfying. The orthogonalized vector is: - * (q*adj(f)/(f*adj(f)+g*adj(g)), q*adj(g)/(f*adj(f)+g*adj(g))) - * (it is actually the (N+1)-th row of the Gram-Schmidt basis). - * - * In the binary case, coefficients of f and g are generated - * independently of each other, with a discrete Gaussian - * distribution of standard deviation 1.17*sqrt(q/(2*N)). Then, - * the two vectors have expected norm 1.17*sqrt(q), which is - * also our acceptance bound: we require both vectors to be no - * larger than that (this will be satisfied about 1/4th of the - * time, thus we expect sampling new (f,g) about 4 times for that - * step). - * - * We require that Res(f,phi) and Res(g,phi) are both odd (the - * NTRU equation solver requires it). - */ - for (;;) { - fpr *rt1, *rt2, *rt3; - fpr bnorm; - uint32_t normf, normg, norm; - int lim; - - /* - * The poly_small_mkgauss() function makes sure - * that the sum of coefficients is 1 modulo 2 - * (i.e. the resultant of the polynomial with phi - * will be odd). - */ - poly_small_mkgauss(rc, f, logn); - poly_small_mkgauss(rc, g, logn); - - /* - * Verify that all coefficients are within the bounds - * defined in max_fg_bits. This is the case with - * overwhelming probability; this guarantees that the - * key will be encodable with FALCON_COMP_TRIM. - */ - lim = 1 << (Zf(max_fg_bits)[logn] - 1); - for (u = 0; u < n; u ++) { - /* - * We can use non-CT tests since on any failure - * we will discard f and g. 
- */ - if (f[u] >= lim || f[u] <= -lim - || g[u] >= lim || g[u] <= -lim) - { - lim = -1; - break; - } - } - if (lim < 0) { - continue; - } - - /* - * Bound is 1.17*sqrt(q). We compute the squared - * norms. With q = 12289, the squared bound is: - * (1.17^2)* 12289 = 16822.4121 - * Since f and g are integral, the squared norm - * of (g,-f) is an integer. - */ - normf = poly_small_sqnorm(f, logn); - normg = poly_small_sqnorm(g, logn); - norm = (normf + normg) | -((normf | normg) >> 31); - if (norm >= 16823) { - continue; - } - - /* - * We compute the orthogonalized vector norm. - */ - rt1 = (fpr *)tmp; - rt2 = rt1 + n; - rt3 = rt2 + n; - poly_small_to_fp(rt1, f, logn); - poly_small_to_fp(rt2, g, logn); - Zf(FFT)(rt1, logn); - Zf(FFT)(rt2, logn); - Zf(poly_invnorm2_fft)(rt3, rt1, rt2, logn); - Zf(poly_adj_fft)(rt1, logn); - Zf(poly_adj_fft)(rt2, logn); - Zf(poly_mulconst)(rt1, fpr_q, logn); - Zf(poly_mulconst)(rt2, fpr_q, logn); - Zf(poly_mul_autoadj_fft)(rt1, rt3, logn); - Zf(poly_mul_autoadj_fft)(rt2, rt3, logn); - Zf(iFFT)(rt1, logn); - Zf(iFFT)(rt2, logn); - bnorm = fpr_zero; - for (u = 0; u < n; u ++) { - bnorm = fpr_add(bnorm, fpr_sqr(rt1[u])); - bnorm = fpr_add(bnorm, fpr_sqr(rt2[u])); - } - if (!fpr_lt(bnorm, fpr_bnorm_max)) { - continue; - } - - /* - * Compute public key h = g/f mod X^N+1 mod q. If this - * fails, we must restart. - */ - if (h == NULL) { - h2 = (uint16_t *)tmp; - tmp2 = h2 + n; - } else { - h2 = h; - tmp2 = (uint16_t *)tmp; - } - if (!Zf(compute_public)(h2, f, g, logn, (uint8_t *)tmp2)) { - continue; - } - - /* - * Solve the NTRU equation to get F and G. - */ - lim = (1 << (Zf(max_FG_bits)[logn] - 1)) - 1; - if (!solve_NTRU(logn, F, G, f, g, lim, (uint32_t *)tmp)) { - continue; - } - - /* - * Key pair is generated. 
- */ - break; - } -} diff --git a/crypto_sign/falcon-512-tree/m4-ct/pqm4.c b/crypto_sign/falcon-512-tree/m4-ct/pqm4.c deleted file mode 100644 index 74b83a8b..00000000 --- a/crypto_sign/falcon-512-tree/m4-ct/pqm4.c +++ /dev/null @@ -1,347 +0,0 @@ -#include -#include - -#include "api.h" -#include "inner.h" -#include "randombytes.h" - -/* ==================================================================== */ - -/* - * Falcon degree is N = 2^LOGN, where LOGN=9 (for Falcon-512) or 10 - * (for Falcon-1024). We use the advertised public key size to know - * which degree is used. - */ -#if CRYPTO_PUBLICKEYBYTES == 897 -#define LOGN 9 -#elif CRYPTO_PUBLICKEYBYTES == 1793 -#define LOGN 10 -#else -#error Unknown Falcon degree (unexpected public key size) -#endif - -#define N ((size_t)1 << LOGN) -#define NONCELEN 40 -#define SEEDLEN 48 - -/* - * If the private key length is larger than 10000, then this is the - * variant with precomputed expanded keys. - */ -#if CRYPTO_SECRETKEYBYTES > 10000 -#define KG_EXPAND 1 -#else -#define KG_EXPAND 0 -#endif - -/* - * Common buffer, to avoid bulky stack allocation. The buffer sizes are - * all expressed in bytes, but the buffer must be suitably aligned for - * 64-bit integers and floating-point values. 
- * - * Required size (in bytes): - * - * With expanded key: - * keygen: 48*N + 6*N = 54*N - * sign: 48*N + 2*N = 50*N - * vrfy: 8*N - * - * Without expanded key: - * keygen: 28*N + 5*N = 33*N - * sign: 72*N + 6*N = 78*N - * vrfy: 8*N - */ -static union { -#if KG_EXPAND - uint8_t b[54 * N]; -#else - uint8_t b[78 * N]; -#endif - uint64_t dummy_u64; - fpr dummy_fp; -} tmp; - -int -crypto_sign_keypair(unsigned char *pk, unsigned char *sk) -{ - int8_t *f, *g, *F, *G; - uint16_t *h; - inner_shake256_context rng; - unsigned char seed[SEEDLEN]; -#if KG_EXPAND - size_t v; -#else - size_t u, v; -#endif - unsigned sav_cw; - -#if KG_EXPAND - f = (int8_t *)&tmp.b[48 * N]; - g = f + N; - F = g + N; - G = F + N; - h = (uint16_t *)(G + N); -#else - f = (int8_t *)&tmp.b[28 * N]; - g = f + N; - F = g + N; - G = NULL; - h = (uint16_t *)(F + N); -#endif - - randombytes(seed, SEEDLEN); - inner_shake256_init(&rng); - inner_shake256_inject(&rng, seed, SEEDLEN); - inner_shake256_flip(&rng); - sav_cw = set_fpu_cw(2); - Zf(keygen)(&rng, f, g, F, G, h, LOGN, tmp.b); - -#if KG_EXPAND - /* - * Expand private key. - */ - Zf(expand_privkey)((fpr *)sk, f, g, F, G, LOGN, tmp.b); - set_fpu_cw(sav_cw); -#else - set_fpu_cw(sav_cw); - - /* - * Encode private key. - */ - sk[0] = 0x50 + LOGN; - u = 1; - v = Zf(trim_i8_encode)(sk + u, CRYPTO_SECRETKEYBYTES - u, - f, LOGN, Zf(max_fg_bits)[LOGN]); - if (v == 0) { - return -1; - } - u += v; - v = Zf(trim_i8_encode)(sk + u, CRYPTO_SECRETKEYBYTES - u, - g, LOGN, Zf(max_fg_bits)[LOGN]); - if (v == 0) { - return -1; - } - u += v; - v = Zf(trim_i8_encode)(sk + u, CRYPTO_SECRETKEYBYTES - u, - F, LOGN, Zf(max_FG_bits)[LOGN]); - if (v == 0) { - return -1; - } - u += v; - if (u != CRYPTO_SECRETKEYBYTES) { - return -1; - } -#endif - - /* - * Encode public key. 
- */ - pk[0] = 0x00 + LOGN; - v = Zf(modq_encode)(pk + 1, CRYPTO_PUBLICKEYBYTES - 1, h, LOGN); - if (v != CRYPTO_PUBLICKEYBYTES - 1) { - return -1; - } - - return 0; -} - -int -crypto_sign(unsigned char *sm, size_t *smlen, - const unsigned char *m, size_t mlen, - const unsigned char *sk) -{ -#if KG_EXPAND - const fpr *expanded_key; -#else - int8_t *f, *g, *F, *G; - size_t u, v; -#endif - int16_t *sig; - uint16_t *hm; - unsigned char seed[SEEDLEN], nonce[NONCELEN]; - unsigned char *esig; - inner_shake256_context sc; - size_t sig_len; - unsigned sav_cw; - -#if KG_EXPAND - sig = (int16_t *)&tmp.b[48 * N]; -#else - f = (int8_t *)&tmp.b[72 * N]; - g = f + N; - F = g + N; - G = F + N; - sig = (int16_t *)(G + N); -#endif - hm = (uint16_t *)sig; /* hm[] is shared with sig[] */ - esig = (unsigned char *)tmp.b; - -#if KG_EXPAND - /* - * Expanded key is provided "as is". - */ - expanded_key = (const fpr *)sk; -#else - /* - * Decode the private key. - */ - if (sk[0] != 0x50 + LOGN) { - return -1; - } - u = 1; - v = Zf(trim_i8_decode)(f, LOGN, Zf(max_fg_bits)[LOGN], - sk + u, CRYPTO_SECRETKEYBYTES - u); - if (v == 0) { - return -1; - } - u += v; - v = Zf(trim_i8_decode)(g, LOGN, Zf(max_fg_bits)[LOGN], - sk + u, CRYPTO_SECRETKEYBYTES - u); - if (v == 0) { - return -1; - } - u += v; - v = Zf(trim_i8_decode)(F, LOGN, Zf(max_FG_bits)[LOGN], - sk + u, CRYPTO_SECRETKEYBYTES - u); - if (v == 0) { - return -1; - } - u += v; - if (u != CRYPTO_SECRETKEYBYTES) { - return -1; - } - if (!Zf(complete_private)(G, f, g, F, LOGN, tmp.b)) { - return -1; - } -#endif - - /* - * Create a random nonce (40 bytes). - */ - randombytes(nonce, NONCELEN); - - /* - * Hash message nonce + message into a vector. - */ - inner_shake256_init(&sc); - inner_shake256_inject(&sc, nonce, NONCELEN); - inner_shake256_inject(&sc, m, mlen); - inner_shake256_flip(&sc); - Zf(hash_to_point_vartime)(&sc, hm, LOGN); - - /* - * Initialize a RNG. 
- */ - randombytes(seed, SEEDLEN); - inner_shake256_init(&sc); - inner_shake256_inject(&sc, seed, SEEDLEN); - inner_shake256_flip(&sc); - - /* - * Compute the signature. - */ - sav_cw = set_fpu_cw(2); -#if KG_EXPAND - Zf(sign_tree)(sig, &sc, expanded_key, hm, LOGN, tmp.b); -#else - Zf(sign_dyn)(sig, &sc, f, g, F, G, hm, LOGN, tmp.b); -#endif - set_fpu_cw(sav_cw); - - /* - * Encode the signature and bundle it with the message. Format is: - * signature length 2 bytes, big-endian - * nonce 40 bytes - * message mlen bytes - * signature slen bytes - */ - esig[0] = 0x20 + LOGN; - sig_len = Zf(comp_encode)(esig + 1, CRYPTO_BYTES - 1, sig, LOGN); - if (sig_len == 0) { - return -1; - } - sig_len ++; - memmove(sm + 2 + NONCELEN, m, mlen); - sm[0] = (unsigned char)(sig_len >> 8); - sm[1] = (unsigned char)sig_len; - memcpy(sm + 2, nonce, NONCELEN); - memcpy(sm + 2 + NONCELEN + mlen, esig, sig_len); - *smlen = 2 + NONCELEN + mlen + sig_len; - return 0; -} - -int -crypto_sign_open(unsigned char *m, size_t *mlen, - const unsigned char *sm, size_t smlen, - const unsigned char *pk) -{ - uint16_t *h, *hm; - int16_t *sig; - const unsigned char *esig; - inner_shake256_context sc; - size_t sig_len, msg_len; - - h = (uint16_t *)&tmp.b[2 * N]; - hm = h + N; - sig = (int16_t *)(hm + N); - - /* - * Decode public key. - */ - if (pk[0] != 0x00 + LOGN) { - return -1; - } - if (Zf(modq_decode)(h, LOGN, pk + 1, CRYPTO_PUBLICKEYBYTES - 1) - != CRYPTO_PUBLICKEYBYTES - 1) - { - return -1; - } - Zf(to_ntt_monty)(h, LOGN); - - /* - * Find nonce, signature, message length. - */ - if (smlen < 2 + NONCELEN) { - return -1; - } - sig_len = ((size_t)sm[0] << 8) | (size_t)sm[1]; - if (sig_len > (smlen - 2 - NONCELEN)) { - return -1; - } - msg_len = smlen - 2 - NONCELEN - sig_len; - - /* - * Decode signature. 
- */ - esig = sm + 2 + NONCELEN + msg_len; - if (sig_len < 1 || esig[0] != 0x20 + LOGN) { - return -1; - } - if (Zf(comp_decode)(sig, LOGN, - esig + 1, sig_len - 1) != sig_len - 1) - { - return -1; - } - - /* - * Hash nonce + message into a vector. - */ - inner_shake256_init(&sc); - inner_shake256_inject(&sc, sm + 2, NONCELEN + msg_len); - inner_shake256_flip(&sc); - Zf(hash_to_point_vartime)(&sc, hm, LOGN); - - /* - * Verify signature. - */ - if (!Zf(verify_raw)(hm, sig, h, LOGN, tmp.b)) { - return -1; - } - - /* - * Return plaintext. - */ - memmove(m, sm + 2 + NONCELEN, msg_len); - *mlen = msg_len; - return 0; -} diff --git a/crypto_sign/falcon-512-tree/m4-ct/rng.c b/crypto_sign/falcon-512-tree/m4-ct/rng.c deleted file mode 100644 index d2ecb7af..00000000 --- a/crypto_sign/falcon-512-tree/m4-ct/rng.c +++ /dev/null @@ -1,379 +0,0 @@ -/* - * PRNG and interface to the system RNG. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include - -#include "inner.h" - -// yyyNIST+0 yyyPQCLEAN+0 -/* - * Include relevant system header files. For Win32, this will also need - * linking with advapi32.dll, which we trigger with an appropriate #pragma. - */ -#if FALCON_RAND_GETENTROPY -#include -#endif -#if FALCON_RAND_URANDOM -#include -#if !FALCON_RAND_GETENTROPY -#include -#endif -#include -#include -#endif -#if FALCON_RAND_WIN32 -#include -#include -#pragma comment(lib, "advapi32") -#endif - -/* see inner.h */ -int -Zf(get_seed)(void *seed, size_t len) -{ - (void)seed; - if (len == 0) { - return 1; - } -#if FALCON_RAND_GETENTROPY - if (getentropy(seed, len) == 0) { - return 1; - } -#endif -#if FALCON_RAND_URANDOM - { - int f; - - f = open("/dev/urandom", O_RDONLY); - if (f >= 0) { - while (len > 0) { - ssize_t rlen; - - rlen = read(f, seed, len); - if (rlen < 0) { - if (errno == EINTR) { - continue; - } - break; - } - seed = (uint8_t *)seed + rlen; - len -= (size_t)rlen; - } - close(f); - if (len == 0) { - return 1; - } - } - } -#endif -#if FALCON_RAND_WIN32 - { - HCRYPTPROV hp; - - if (CryptAcquireContext(&hp, 0, 0, PROV_RSA_FULL, - CRYPT_VERIFYCONTEXT | CRYPT_SILENT)) - { - BOOL r; - - r = CryptGenRandom(hp, (DWORD)len, seed); - CryptReleaseContext(hp, 0); - if (r) { - return 1; - } - } - } -#endif - return 0; -} -// yyyNIST- yyyPQCLEAN- - -/* see inner.h */ -void -Zf(prng_init)(prng *p, inner_shake256_context *src) -{ -#if FALCON_LE // yyyLE+1 - inner_shake256_extract(src, p->state.d, 56); -#else // yyyLE+0 - /* - * To ensure reproducibility for a given seed, we - * must enforce little-endian interpretation of - 
* the state words. - */ - uint8_t tmp[56]; - uint64_t th, tl; - int i; - - inner_shake256_extract(src, tmp, 56); - for (i = 0; i < 14; i ++) { - uint32_t w; - - w = (uint32_t)tmp[(i << 2) + 0] - | ((uint32_t)tmp[(i << 2) + 1] << 8) - | ((uint32_t)tmp[(i << 2) + 2] << 16) - | ((uint32_t)tmp[(i << 2) + 3] << 24); - *(uint32_t *)(p->state.d + (i << 2)) = w; - } - tl = *(uint32_t *)(p->state.d + 48); - th = *(uint32_t *)(p->state.d + 52); - *(uint64_t *)(p->state.d + 48) = tl + (th << 32); -#endif // yyyLE- - Zf(prng_refill)(p); -} - -/* - * PRNG based on ChaCha20. - * - * State consists in key (32 bytes) then IV (16 bytes) and block counter - * (8 bytes). Normally, we should not care about local endianness (this - * is for a PRNG), but for the NIST competition we need reproducible KAT - * vectors that work across architectures, so we enforce little-endian - * interpretation where applicable. Moreover, output words are "spread - * out" over the output buffer with the interleaving pattern that is - * naturally obtained from the AVX2 implementation that runs eight - * ChaCha20 instances in parallel. - * - * The block counter is XORed into the first 8 bytes of the IV. - */ -TARGET_AVX2 -void -Zf(prng_refill)(prng *p) -{ -#if FALCON_AVX2 // yyyAVX2+1 - - static const uint32_t CW[] = { - 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 - }; - - uint64_t cc; - size_t u; - int i; - uint32_t *sw; - union { - uint32_t w[16]; - __m256i y[2]; /* for alignment */ - } t; - __m256i state[16], init[16]; - - sw = (uint32_t *)p->state.d; - - /* - * XOR next counter values into state. - */ - cc = *(uint64_t *)(p->state.d + 48); - for (u = 0; u < 8; u ++) { - t.w[u] = (uint32_t)(cc + u); - t.w[u + 8] = (uint32_t)((cc + u) >> 32); - } - *(uint64_t *)(p->state.d + 48) = cc + 8; - - /* - * Load state. 
- */ - for (u = 0; u < 4; u ++) { - state[u] = init[u] = - _mm256_broadcastd_epi32(_mm_cvtsi32_si128(CW[u])); - } - for (u = 0; u < 10; u ++) { - state[u + 4] = init[u + 4] = - _mm256_broadcastd_epi32(_mm_cvtsi32_si128(sw[u])); - } - state[14] = init[14] = _mm256_xor_si256( - _mm256_broadcastd_epi32(_mm_cvtsi32_si128(sw[10])), - _mm256_loadu_si256((__m256i *)&t.w[0])); - state[15] = init[15] = _mm256_xor_si256( - _mm256_broadcastd_epi32(_mm_cvtsi32_si128(sw[11])), - _mm256_loadu_si256((__m256i *)&t.w[8])); - - /* - * Do all rounds. - */ - for (i = 0; i < 10; i ++) { - -#define QROUND(a, b, c, d) do { \ - state[a] = _mm256_add_epi32(state[a], state[b]); \ - state[d] = _mm256_xor_si256(state[d], state[a]); \ - state[d] = _mm256_or_si256( \ - _mm256_slli_epi32(state[d], 16), \ - _mm256_srli_epi32(state[d], 16)); \ - state[c] = _mm256_add_epi32(state[c], state[d]); \ - state[b] = _mm256_xor_si256(state[b], state[c]); \ - state[b] = _mm256_or_si256( \ - _mm256_slli_epi32(state[b], 12), \ - _mm256_srli_epi32(state[b], 20)); \ - state[a] = _mm256_add_epi32(state[a], state[b]); \ - state[d] = _mm256_xor_si256(state[d], state[a]); \ - state[d] = _mm256_or_si256( \ - _mm256_slli_epi32(state[d], 8), \ - _mm256_srli_epi32(state[d], 24)); \ - state[c] = _mm256_add_epi32(state[c], state[d]); \ - state[b] = _mm256_xor_si256(state[b], state[c]); \ - state[b] = _mm256_or_si256( \ - _mm256_slli_epi32(state[b], 7), \ - _mm256_srli_epi32(state[b], 25)); \ - } while (0) - - QROUND( 0, 4, 8, 12); - QROUND( 1, 5, 9, 13); - QROUND( 2, 6, 10, 14); - QROUND( 3, 7, 11, 15); - QROUND( 0, 5, 10, 15); - QROUND( 1, 6, 11, 12); - QROUND( 2, 7, 8, 13); - QROUND( 3, 4, 9, 14); - -#undef QROUND - - } - - /* - * Add initial state back and encode the result in the destination - * buffer. We can dump the AVX2 values "as is" because the non-AVX2 - * code uses a compatible order of values. 
- */ - for (u = 0; u < 16; u ++) { - _mm256_storeu_si256((__m256i *)&p->buf.d[u << 5], - _mm256_add_epi32(state[u], init[u])); - } - -#else // yyyAVX2+0 - - static const uint32_t CW[] = { - 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 - }; - - uint64_t cc; - size_t u; - - /* - * State uses local endianness. Only the output bytes must be - * converted to little endian (if used on a big-endian machine). - */ - cc = *(uint64_t *)(p->state.d + 48); - for (u = 0; u < 8; u ++) { - uint32_t state[16]; - size_t v; - int i; - - memcpy(&state[0], CW, sizeof CW); - memcpy(&state[4], p->state.d, 48); - state[14] ^= (uint32_t)cc; - state[15] ^= (uint32_t)(cc >> 32); - for (i = 0; i < 10; i ++) { - -#define QROUND(a, b, c, d) do { \ - state[a] += state[b]; \ - state[d] ^= state[a]; \ - state[d] = (state[d] << 16) | (state[d] >> 16); \ - state[c] += state[d]; \ - state[b] ^= state[c]; \ - state[b] = (state[b] << 12) | (state[b] >> 20); \ - state[a] += state[b]; \ - state[d] ^= state[a]; \ - state[d] = (state[d] << 8) | (state[d] >> 24); \ - state[c] += state[d]; \ - state[b] ^= state[c]; \ - state[b] = (state[b] << 7) | (state[b] >> 25); \ - } while (0) - - QROUND( 0, 4, 8, 12); - QROUND( 1, 5, 9, 13); - QROUND( 2, 6, 10, 14); - QROUND( 3, 7, 11, 15); - QROUND( 0, 5, 10, 15); - QROUND( 1, 6, 11, 12); - QROUND( 2, 7, 8, 13); - QROUND( 3, 4, 9, 14); - -#undef QROUND - - } - - for (v = 0; v < 4; v ++) { - state[v] += CW[v]; - } - for (v = 4; v < 14; v ++) { - state[v] += ((uint32_t *)p->state.d)[v - 4]; - } - state[14] += ((uint32_t *)p->state.d)[10] - ^ (uint32_t)cc; - state[15] += ((uint32_t *)p->state.d)[11] - ^ (uint32_t)(cc >> 32); - cc ++; - - /* - * We mimic the interleaving that is used in the AVX2 - * implementation. 
- */ - for (v = 0; v < 16; v ++) { -#if FALCON_LE // yyyLE+1 - ((uint32_t *)p->buf.d)[u + (v << 3)] = state[v]; -#else // yyyLE+0 - p->buf.d[(u << 2) + (v << 5) + 0] = - (uint8_t)state[v]; - p->buf.d[(u << 2) + (v << 5) + 1] = - (uint8_t)(state[v] >> 8); - p->buf.d[(u << 2) + (v << 5) + 2] = - (uint8_t)(state[v] >> 16); - p->buf.d[(u << 2) + (v << 5) + 3] = - (uint8_t)(state[v] >> 24); -#endif // yyyLE- - } - } - *(uint64_t *)(p->state.d + 48) = cc; - -#endif // yyyAVX2- - - p->ptr = 0; -} - -/* see inner.h */ -void -Zf(prng_get_bytes)(prng *p, void *dst, size_t len) -{ - uint8_t *buf; - - buf = dst; - while (len > 0) { - size_t clen; - - clen = (sizeof p->buf.d) - p->ptr; - if (clen > len) { - clen = len; - } - memcpy(buf, p->buf.d, clen); - buf += clen; - len -= clen; - p->ptr += clen; - if (p->ptr == sizeof p->buf.d) { - Zf(prng_refill)(p); - } - } -} diff --git a/crypto_sign/falcon-512-tree/m4-ct/sign.c b/crypto_sign/falcon-512-tree/m4-ct/sign.c deleted file mode 100644 index 752fb8ba..00000000 --- a/crypto_sign/falcon-512-tree/m4-ct/sign.c +++ /dev/null @@ -1,1532 +0,0 @@ -/* - * Falcon signature generation. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include "inner.h" - -/* =================================================================== */ - -/* - * Compute degree N from logarithm 'logn'. - */ -#define MKN(logn) ((size_t)1 << (logn)) - -/* =================================================================== */ -/* - * Binary case: - * N = 2^logn - * phi = X^N+1 - */ - -/* - * Get the size of the LDL tree for an input with polynomials of size - * 2^logn. The size is expressed in the number of elements. - */ -static inline unsigned -ffLDL_treesize(unsigned logn) -{ - /* - * For logn = 0 (polynomials are constant), the "tree" is a - * single element. Otherwise, the tree node has size 2^logn, and - * has two child trees for size logn-1 each. Thus, treesize s() - * must fulfill these two relations: - * - * s(0) = 1 - * s(logn) = (2^logn) + 2*s(logn-1) - */ - return (logn + 1) << logn; -} - -/* - * Inner function for ffLDL_fft(). It expects the matrix to be both - * auto-adjoint and quasicyclic; also, it uses the source operands - * as modifiable temporaries. - * - * tmp[] must have room for at least one polynomial. 
- */ -static void -ffLDL_fft_inner(fpr *restrict tree, - fpr *restrict g0, fpr *restrict g1, unsigned logn, fpr *restrict tmp) -{ - size_t n, hn; - - n = MKN(logn); - if (n == 1) { - tree[0] = g0[0]; - return; - } - hn = n >> 1; - - /* - * The LDL decomposition yields L (which is written in the tree) - * and the diagonal of D. Since d00 = g0, we just write d11 - * into tmp. - */ - Zf(poly_LDLmv_fft)(tmp, tree, g0, g1, g0, logn); - - /* - * Split d00 (currently in g0) and d11 (currently in tmp). We - * reuse g0 and g1 as temporary storage spaces: - * d00 splits into g1, g1+hn - * d11 splits into g0, g0+hn - */ - Zf(poly_split_fft)(g1, g1 + hn, g0, logn); - Zf(poly_split_fft)(g0, g0 + hn, tmp, logn); - - /* - * Each split result is the first row of a new auto-adjoint - * quasicyclic matrix for the next recursive step. - */ - ffLDL_fft_inner(tree + n, - g1, g1 + hn, logn - 1, tmp); - ffLDL_fft_inner(tree + n + ffLDL_treesize(logn - 1), - g0, g0 + hn, logn - 1, tmp); -} - -/* - * Compute the ffLDL tree of an auto-adjoint matrix G. The matrix - * is provided as three polynomials (FFT representation). - * - * The "tree" array is filled with the computed tree, of size - * (logn+1)*(2^logn) elements (see ffLDL_treesize()). - * - * Input arrays MUST NOT overlap, except possibly the three unmodified - * arrays g00, g01 and g11. tmp[] should have room for at least three - * polynomials of 2^logn elements each. 
- */ -static void -ffLDL_fft(fpr *restrict tree, const fpr *restrict g00, - const fpr *restrict g01, const fpr *restrict g11, - unsigned logn, fpr *restrict tmp) -{ - size_t n, hn; - fpr *d00, *d11; - - n = MKN(logn); - if (n == 1) { - tree[0] = g00[0]; - return; - } - hn = n >> 1; - d00 = tmp; - d11 = tmp + n; - tmp += n << 1; - - memcpy(d00, g00, n * sizeof *g00); - Zf(poly_LDLmv_fft)(d11, tree, g00, g01, g11, logn); - - Zf(poly_split_fft)(tmp, tmp + hn, d00, logn); - Zf(poly_split_fft)(d00, d00 + hn, d11, logn); - memcpy(d11, tmp, n * sizeof *tmp); - ffLDL_fft_inner(tree + n, - d11, d11 + hn, logn - 1, tmp); - ffLDL_fft_inner(tree + n + ffLDL_treesize(logn - 1), - d00, d00 + hn, logn - 1, tmp); -} - -/* - * Normalize an ffLDL tree: each leaf of value x is replaced with - * sigma / sqrt(x). - */ -static void -ffLDL_binary_normalize(fpr *tree, unsigned logn) -{ - /* - * TODO: make an iterative version. - */ - size_t n; - - n = MKN(logn); - if (n == 1) { - /* - * We actually store in the tree leaf the inverse of - * the value mandated by the specification: this - * saves a division both here and in the sampler. - */ - tree[0] = fpr_mul(fpr_sqrt(tree[0]), fpr_inv_sigma); - } else { - ffLDL_binary_normalize(tree + n, logn - 1); - ffLDL_binary_normalize(tree + n + ffLDL_treesize(logn - 1), - logn - 1); - } -} - -/* =================================================================== */ - -/* - * Convert an integer polynomial (with small values) into the - * representation with complex numbers. 
- */ -static void -smallints_to_fpr(fpr *r, const int8_t *t, unsigned logn) -{ - size_t n, u; - - n = MKN(logn); - for (u = 0; u < n; u ++) { - r[u] = fpr_of(t[u]); - } -} - -/* - * The expanded private key contains: - * - The B0 matrix (four elements) - * - The ffLDL tree - */ - -static inline size_t -skoff_b00(unsigned logn) -{ - (void)logn; - return 0; -} - -static inline size_t -skoff_b01(unsigned logn) -{ - return MKN(logn); -} - -static inline size_t -skoff_b10(unsigned logn) -{ - return 2 * MKN(logn); -} - -static inline size_t -skoff_b11(unsigned logn) -{ - return 3 * MKN(logn); -} - -static inline size_t -skoff_tree(unsigned logn) -{ - return 4 * MKN(logn); -} - -/* see inner.h */ -void -Zf(expand_privkey)(fpr *restrict expanded_key, - const int8_t *f, const int8_t *g, - const int8_t *F, const int8_t *G, - unsigned logn, uint8_t *restrict tmp) -{ - size_t n; - fpr *rf, *rg, *rF, *rG; - fpr *b00, *b01, *b10, *b11; - fpr *g00, *g01, *g11, *gxx; - fpr *tree; - - n = MKN(logn); - b00 = expanded_key + skoff_b00(logn); - b01 = expanded_key + skoff_b01(logn); - b10 = expanded_key + skoff_b10(logn); - b11 = expanded_key + skoff_b11(logn); - tree = expanded_key + skoff_tree(logn); - - /* - * We load the private key elements directly into the B0 matrix, - * since B0 = [[g, -f], [G, -F]]. - */ - rf = b01; - rg = b00; - rF = b11; - rG = b10; - - smallints_to_fpr(rf, f, logn); - smallints_to_fpr(rg, g, logn); - smallints_to_fpr(rF, F, logn); - smallints_to_fpr(rG, G, logn); - - /* - * Compute the FFT for the key elements, and negate f and F. - */ - Zf(FFT)(rf, logn); - Zf(FFT)(rg, logn); - Zf(FFT)(rF, logn); - Zf(FFT)(rG, logn); - Zf(poly_neg)(rf, logn); - Zf(poly_neg)(rF, logn); - - /* - * The Gram matrix is G = B·B*. 
Formulas are: - * g00 = b00*adj(b00) + b01*adj(b01) - * g01 = b00*adj(b10) + b01*adj(b11) - * g10 = b10*adj(b00) + b11*adj(b01) - * g11 = b10*adj(b10) + b11*adj(b11) - * - * For historical reasons, this implementation uses - * g00, g01 and g11 (upper triangle). - */ - g00 = (fpr *)tmp; - g01 = g00 + n; - g11 = g01 + n; - gxx = g11 + n; - - memcpy(g00, b00, n * sizeof *b00); - Zf(poly_mulselfadj_fft)(g00, logn); - memcpy(gxx, b01, n * sizeof *b01); - Zf(poly_mulselfadj_fft)(gxx, logn); - Zf(poly_add)(g00, gxx, logn); - - memcpy(g01, b00, n * sizeof *b00); - Zf(poly_muladj_fft)(g01, b10, logn); - memcpy(gxx, b01, n * sizeof *b01); - Zf(poly_muladj_fft)(gxx, b11, logn); - Zf(poly_add)(g01, gxx, logn); - - memcpy(g11, b10, n * sizeof *b10); - Zf(poly_mulselfadj_fft)(g11, logn); - memcpy(gxx, b11, n * sizeof *b11); - Zf(poly_mulselfadj_fft)(gxx, logn); - Zf(poly_add)(g11, gxx, logn); - - /* - * Compute the Falcon tree. - */ - ffLDL_fft(tree, g00, g01, g11, logn, gxx); - - /* - * Normalize tree. - */ - ffLDL_binary_normalize(tree, logn); -} - -typedef int (*samplerZ)(void *ctx, fpr mu, fpr sigma); - -/* - * Perform Fast Fourier Sampling for target vector t. The Gram matrix - * is provided (G = [[g00, g01], [adj(g01), g11]]). The sampled vector - * is written over (t0,t1). The Gram matrix is modified as well. The - * tmp[] buffer must have room for four polynomials. - */ -TARGET_AVX2 -static void -ffSampling_fft_dyntree(samplerZ samp, void *samp_ctx, - fpr *restrict t0, fpr *restrict t1, - fpr *restrict g00, fpr *restrict g01, fpr *restrict g11, - unsigned logn, fpr *restrict tmp) -{ - size_t n, hn; - fpr *z0, *z1; - - /* - * Deepest level: the LDL tree leaf value is just g00 (the - * array has length only 1 at this point); we normalize it - * with regards to sigma, then use it for sampling. 
- */ - if (logn == 0) { - fpr leaf; - - leaf = g00[0]; - leaf = fpr_mul(fpr_sqrt(leaf), fpr_inv_sigma); - t0[0] = fpr_of(samp(samp_ctx, t0[0], leaf)); - t1[0] = fpr_of(samp(samp_ctx, t1[0], leaf)); - return; - } - - n = (size_t)1 << logn; - hn = n >> 1; - - /* - * Decompose G into LDL. We only need d00 (identical to g00), - * d11, and l10; we do that in place. - */ - Zf(poly_LDL_fft)(g00, g01, g11, logn); - - /* - * Split d00 and d11 and expand them into half-size quasi-cyclic - * Gram matrices. We also save l10 in tmp[]. - */ - Zf(poly_split_fft)(tmp, tmp + hn, g00, logn); - memcpy(g00, tmp, n * sizeof *tmp); - Zf(poly_split_fft)(tmp, tmp + hn, g11, logn); - memcpy(g11, tmp, n * sizeof *tmp); - memcpy(tmp, g01, n * sizeof *g01); - memcpy(g01, g00, hn * sizeof *g00); - memcpy(g01 + hn, g11, hn * sizeof *g00); - - /* - * The half-size Gram matrices for the recursive LDL tree - * building are now: - * - left sub-tree: g00, g00+hn, g01 - * - right sub-tree: g11, g11+hn, g01+hn - * l10 is in tmp[]. - */ - - /* - * We split t1 and use the first recursive call on the two - * halves, using the right sub-tree. The result is merged - * back into tmp + 2*n. - */ - z1 = tmp + n; - Zf(poly_split_fft)(z1, z1 + hn, t1, logn); - ffSampling_fft_dyntree(samp, samp_ctx, z1, z1 + hn, - g11, g11 + hn, g01 + hn, logn - 1, z1 + n); - Zf(poly_merge_fft)(tmp + (n << 1), z1, z1 + hn, logn); - - /* - * Compute tb0 = t0 + (t1 - z1) * l10. - * At that point, l10 is in tmp, t1 is unmodified, and z1 is - * in tmp + (n << 1). The buffer in z1 is free. - * - * In the end, z1 is written over t1, and tb0 is in t0. - */ - memcpy(z1, t1, n * sizeof *t1); - Zf(poly_sub)(z1, tmp + (n << 1), logn); - memcpy(t1, tmp + (n << 1), n * sizeof *tmp); - Zf(poly_mul_fft)(tmp, z1, logn); - Zf(poly_add)(t0, tmp, logn); - - /* - * Second recursive invocation, on the split tb0 (currently in t0) - * and the left sub-tree. 
- */ - z0 = tmp; - Zf(poly_split_fft)(z0, z0 + hn, t0, logn); - ffSampling_fft_dyntree(samp, samp_ctx, z0, z0 + hn, - g00, g00 + hn, g01, logn - 1, z0 + n); - Zf(poly_merge_fft)(t0, z0, z0 + hn, logn); -} - -/* - * Perform Fast Fourier Sampling for target vector t and LDL tree T. - * tmp[] must have size for at least two polynomials of size 2^logn. - */ -TARGET_AVX2 -static void -ffSampling_fft(samplerZ samp, void *samp_ctx, - fpr *restrict z0, fpr *restrict z1, - const fpr *restrict tree, - const fpr *restrict t0, const fpr *restrict t1, unsigned logn, - fpr *restrict tmp) -{ - size_t n, hn; - const fpr *tree0, *tree1; - - /* - * When logn == 2, we inline the last two recursion levels. - */ - if (logn == 2) { -#if FALCON_AVX2 // yyyAVX2+1 - fpr w0, w1, w2, w3, sigma; - __m128d ww0, ww1, wa, wb, wc, wd; - __m128d wy0, wy1, wz0, wz1; - __m128d half, invsqrt8, invsqrt2, neghi, neglo; - int si0, si1, si2, si3; - - tree0 = tree + 4; - tree1 = tree + 8; - - half = _mm_set1_pd(0.5); - invsqrt8 = _mm_set1_pd(0.353553390593273762200422181052); - invsqrt2 = _mm_set1_pd(0.707106781186547524400844362105); - neghi = _mm_set_pd(-0.0, 0.0); - neglo = _mm_set_pd(0.0, -0.0); - - /* - * We split t1 into w*, then do the recursive invocation, - * with output in w*. We finally merge back into z1. 
- */ - ww0 = _mm_loadu_pd(&t1[0].v); - ww1 = _mm_loadu_pd(&t1[2].v); - wa = _mm_unpacklo_pd(ww0, ww1); - wb = _mm_unpackhi_pd(ww0, ww1); - wc = _mm_add_pd(wa, wb); - ww0 = _mm_mul_pd(wc, half); - wc = _mm_sub_pd(wa, wb); - wd = _mm_xor_pd(_mm_permute_pd(wc, 1), neghi); - ww1 = _mm_mul_pd(_mm_add_pd(wc, wd), invsqrt8); - - w2.v = _mm_cvtsd_f64(ww1); - w3.v = _mm_cvtsd_f64(_mm_permute_pd(ww1, 1)); - wa = ww1; - sigma = tree1[3]; - si2 = samp(samp_ctx, w2, sigma); - si3 = samp(samp_ctx, w3, sigma); - ww1 = _mm_set_pd((double)si3, (double)si2); - wa = _mm_sub_pd(wa, ww1); - wb = _mm_loadu_pd(&tree1[0].v); - wc = _mm_mul_pd(wa, wb); - wd = _mm_mul_pd(wa, _mm_permute_pd(wb, 1)); - wa = _mm_unpacklo_pd(wc, wd); - wb = _mm_unpackhi_pd(wc, wd); - ww0 = _mm_add_pd(ww0, _mm_add_pd(wa, _mm_xor_pd(wb, neglo))); - w0.v = _mm_cvtsd_f64(ww0); - w1.v = _mm_cvtsd_f64(_mm_permute_pd(ww0, 1)); - sigma = tree1[2]; - si0 = samp(samp_ctx, w0, sigma); - si1 = samp(samp_ctx, w1, sigma); - ww0 = _mm_set_pd((double)si1, (double)si0); - - wc = _mm_mul_pd( - _mm_set_pd((double)(si2 + si3), (double)(si2 - si3)), - invsqrt2); - wa = _mm_add_pd(ww0, wc); - wb = _mm_sub_pd(ww0, wc); - ww0 = _mm_unpacklo_pd(wa, wb); - ww1 = _mm_unpackhi_pd(wa, wb); - _mm_storeu_pd(&z1[0].v, ww0); - _mm_storeu_pd(&z1[2].v, ww1); - - /* - * Compute tb0 = t0 + (t1 - z1) * L. Value tb0 ends up in w*. - */ - wy0 = _mm_sub_pd(_mm_loadu_pd(&t1[0].v), ww0); - wy1 = _mm_sub_pd(_mm_loadu_pd(&t1[2].v), ww1); - wz0 = _mm_loadu_pd(&tree[0].v); - wz1 = _mm_loadu_pd(&tree[2].v); - ww0 = _mm_sub_pd(_mm_mul_pd(wy0, wz0), _mm_mul_pd(wy1, wz1)); - ww1 = _mm_add_pd(_mm_mul_pd(wy0, wz1), _mm_mul_pd(wy1, wz0)); - ww0 = _mm_add_pd(ww0, _mm_loadu_pd(&t0[0].v)); - ww1 = _mm_add_pd(ww1, _mm_loadu_pd(&t0[2].v)); - - /* - * Second recursive invocation. 
- */ - wa = _mm_unpacklo_pd(ww0, ww1); - wb = _mm_unpackhi_pd(ww0, ww1); - wc = _mm_add_pd(wa, wb); - ww0 = _mm_mul_pd(wc, half); - wc = _mm_sub_pd(wa, wb); - wd = _mm_xor_pd(_mm_permute_pd(wc, 1), neghi); - ww1 = _mm_mul_pd(_mm_add_pd(wc, wd), invsqrt8); - - w2.v = _mm_cvtsd_f64(ww1); - w3.v = _mm_cvtsd_f64(_mm_permute_pd(ww1, 1)); - wa = ww1; - sigma = tree0[3]; - si2 = samp(samp_ctx, w2, sigma); - si3 = samp(samp_ctx, w3, sigma); - ww1 = _mm_set_pd((double)si3, (double)si2); - wa = _mm_sub_pd(wa, ww1); - wb = _mm_loadu_pd(&tree0[0].v); - wc = _mm_mul_pd(wa, wb); - wd = _mm_mul_pd(wa, _mm_permute_pd(wb, 1)); - wa = _mm_unpacklo_pd(wc, wd); - wb = _mm_unpackhi_pd(wc, wd); - ww0 = _mm_add_pd(ww0, _mm_add_pd(wa, _mm_xor_pd(wb, neglo))); - w0.v = _mm_cvtsd_f64(ww0); - w1.v = _mm_cvtsd_f64(_mm_permute_pd(ww0, 1)); - sigma = tree0[2]; - si0 = samp(samp_ctx, w0, sigma); - si1 = samp(samp_ctx, w1, sigma); - ww0 = _mm_set_pd((double)si1, (double)si0); - - wc = _mm_mul_pd( - _mm_set_pd((double)(si2 + si3), (double)(si2 - si3)), - invsqrt2); - wa = _mm_add_pd(ww0, wc); - wb = _mm_sub_pd(ww0, wc); - ww0 = _mm_unpacklo_pd(wa, wb); - ww1 = _mm_unpackhi_pd(wa, wb); - _mm_storeu_pd(&z0[0].v, ww0); - _mm_storeu_pd(&z0[2].v, ww1); - - return; -#else // yyyAVX2+0 - fpr x0, x1, y0, y1, w0, w1, w2, w3, sigma; - fpr a_re, a_im, b_re, b_im, c_re, c_im; - - tree0 = tree + 4; - tree1 = tree + 8; - - /* - * We split t1 into w*, then do the recursive invocation, - * with output in w*. We finally merge back into z1. 
- */ - a_re = t1[0]; - a_im = t1[2]; - b_re = t1[1]; - b_im = t1[3]; - c_re = fpr_add(a_re, b_re); - c_im = fpr_add(a_im, b_im); - w0 = fpr_half(c_re); - w1 = fpr_half(c_im); - c_re = fpr_sub(a_re, b_re); - c_im = fpr_sub(a_im, b_im); - w2 = fpr_mul(fpr_add(c_re, c_im), fpr_invsqrt8); - w3 = fpr_mul(fpr_sub(c_im, c_re), fpr_invsqrt8); - - x0 = w2; - x1 = w3; - sigma = tree1[3]; - w2 = fpr_of(samp(samp_ctx, x0, sigma)); - w3 = fpr_of(samp(samp_ctx, x1, sigma)); - a_re = fpr_sub(x0, w2); - a_im = fpr_sub(x1, w3); - b_re = tree1[0]; - b_im = tree1[1]; - c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im)); - c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re)); - x0 = fpr_add(c_re, w0); - x1 = fpr_add(c_im, w1); - sigma = tree1[2]; - w0 = fpr_of(samp(samp_ctx, x0, sigma)); - w1 = fpr_of(samp(samp_ctx, x1, sigma)); - - a_re = w0; - a_im = w1; - b_re = w2; - b_im = w3; - c_re = fpr_mul(fpr_sub(b_re, b_im), fpr_invsqrt2); - c_im = fpr_mul(fpr_add(b_re, b_im), fpr_invsqrt2); - z1[0] = w0 = fpr_add(a_re, c_re); - z1[2] = w2 = fpr_add(a_im, c_im); - z1[1] = w1 = fpr_sub(a_re, c_re); - z1[3] = w3 = fpr_sub(a_im, c_im); - - /* - * Compute tb0 = t0 + (t1 - z1) * L. Value tb0 ends up in w*. - */ - w0 = fpr_sub(t1[0], w0); - w1 = fpr_sub(t1[1], w1); - w2 = fpr_sub(t1[2], w2); - w3 = fpr_sub(t1[3], w3); - - a_re = w0; - a_im = w2; - b_re = tree[0]; - b_im = tree[2]; - w0 = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im)); - w2 = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re)); - a_re = w1; - a_im = w3; - b_re = tree[1]; - b_im = tree[3]; - w1 = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im)); - w3 = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re)); - - w0 = fpr_add(w0, t0[0]); - w1 = fpr_add(w1, t0[1]); - w2 = fpr_add(w2, t0[2]); - w3 = fpr_add(w3, t0[3]); - - /* - * Second recursive invocation. 
- */ - a_re = w0; - a_im = w2; - b_re = w1; - b_im = w3; - c_re = fpr_add(a_re, b_re); - c_im = fpr_add(a_im, b_im); - w0 = fpr_half(c_re); - w1 = fpr_half(c_im); - c_re = fpr_sub(a_re, b_re); - c_im = fpr_sub(a_im, b_im); - w2 = fpr_mul(fpr_add(c_re, c_im), fpr_invsqrt8); - w3 = fpr_mul(fpr_sub(c_im, c_re), fpr_invsqrt8); - - x0 = w2; - x1 = w3; - sigma = tree0[3]; - w2 = y0 = fpr_of(samp(samp_ctx, x0, sigma)); - w3 = y1 = fpr_of(samp(samp_ctx, x1, sigma)); - a_re = fpr_sub(x0, y0); - a_im = fpr_sub(x1, y1); - b_re = tree0[0]; - b_im = tree0[1]; - c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im)); - c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re)); - x0 = fpr_add(c_re, w0); - x1 = fpr_add(c_im, w1); - sigma = tree0[2]; - w0 = fpr_of(samp(samp_ctx, x0, sigma)); - w1 = fpr_of(samp(samp_ctx, x1, sigma)); - - a_re = w0; - a_im = w1; - b_re = w2; - b_im = w3; - c_re = fpr_mul(fpr_sub(b_re, b_im), fpr_invsqrt2); - c_im = fpr_mul(fpr_add(b_re, b_im), fpr_invsqrt2); - z0[0] = fpr_add(a_re, c_re); - z0[2] = fpr_add(a_im, c_im); - z0[1] = fpr_sub(a_re, c_re); - z0[3] = fpr_sub(a_im, c_im); - - return; -#endif // yyyAVX2- - } - - /* - * Case logn == 1 is reachable only when using Falcon-2 (the - * smallest size for which Falcon is mathematically defined, but - * of course way too insecure to be of any use). 
- */ - if (logn == 1) { - fpr x0, x1, y0, y1, sigma; - fpr a_re, a_im, b_re, b_im, c_re, c_im; - - x0 = t1[0]; - x1 = t1[1]; - sigma = tree[3]; - z1[0] = y0 = fpr_of(samp(samp_ctx, x0, sigma)); - z1[1] = y1 = fpr_of(samp(samp_ctx, x1, sigma)); - a_re = fpr_sub(x0, y0); - a_im = fpr_sub(x1, y1); - b_re = tree[0]; - b_im = tree[1]; - c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im)); - c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re)); - x0 = fpr_add(c_re, t0[0]); - x1 = fpr_add(c_im, t0[1]); - sigma = tree[2]; - z0[0] = fpr_of(samp(samp_ctx, x0, sigma)); - z0[1] = fpr_of(samp(samp_ctx, x1, sigma)); - - return; - } - - /* - * Normal end of recursion is for logn == 0. Since the last - * steps of the recursions were inlined in the blocks above - * (when logn == 1 or 2), this case is not reachable, and is - * retained here only for documentation purposes. - - if (logn == 0) { - fpr x0, x1, sigma; - - x0 = t0[0]; - x1 = t1[0]; - sigma = tree[0]; - z0[0] = fpr_of(samp(samp_ctx, x0, sigma)); - z1[0] = fpr_of(samp(samp_ctx, x1, sigma)); - return; - } - - */ - - /* - * General recursive case (logn >= 3). - */ - - n = (size_t)1 << logn; - hn = n >> 1; - tree0 = tree + n; - tree1 = tree + n + ffLDL_treesize(logn - 1); - - /* - * We split t1 into z1 (reused as temporary storage), then do - * the recursive invocation, with output in tmp. We finally - * merge back into z1. - */ - Zf(poly_split_fft)(z1, z1 + hn, t1, logn); - ffSampling_fft(samp, samp_ctx, tmp, tmp + hn, - tree1, z1, z1 + hn, logn - 1, tmp + n); - Zf(poly_merge_fft)(z1, tmp, tmp + hn, logn); - - /* - * Compute tb0 = t0 + (t1 - z1) * L. Value tb0 ends up in tmp[]. - */ - memcpy(tmp, t1, n * sizeof *t1); - Zf(poly_sub)(tmp, z1, logn); - Zf(poly_mul_fft)(tmp, tree, logn); - Zf(poly_add)(tmp, t0, logn); - - /* - * Second recursive invocation. 
- */ - Zf(poly_split_fft)(z0, z0 + hn, tmp, logn); - ffSampling_fft(samp, samp_ctx, tmp, tmp + hn, - tree0, z0, z0 + hn, logn - 1, tmp + n); - Zf(poly_merge_fft)(z0, tmp, tmp + hn, logn); -} - -/* - * Compute a signature: the signature contains two vectors, s1 and s2. - * The s1 vector is not returned. The squared norm of (s1,s2) is - * computed, and if it is short enough, then s2 is returned into the - * s2[] buffer, and 1 is returned; otherwise, s2[] is untouched and 0 is - * returned; the caller should then try again. This function uses an - * expanded key. - * - * tmp[] must have room for at least six polynomials. - */ -static int -do_sign_tree(samplerZ samp, void *samp_ctx, int16_t *s2, - const fpr *restrict expanded_key, - const uint16_t *hm, - unsigned logn, fpr *restrict tmp) -{ - size_t n, u; - fpr *t0, *t1, *tx, *ty; - const fpr *b00, *b01, *b10, *b11, *tree; - fpr ni; - uint32_t sqn, ng; - int16_t *s1tmp, *s2tmp; - - n = MKN(logn); - t0 = tmp; - t1 = t0 + n; - b00 = expanded_key + skoff_b00(logn); - b01 = expanded_key + skoff_b01(logn); - b10 = expanded_key + skoff_b10(logn); - b11 = expanded_key + skoff_b11(logn); - tree = expanded_key + skoff_tree(logn); - - /* - * Set the target vector to [hm, 0] (hm is the hashed message). - */ - for (u = 0; u < n; u ++) { - t0[u] = fpr_of(hm[u]); - /* This is implicit. - t1[u] = fpr_zero; - */ - } - - /* - * Apply the lattice basis to obtain the real target - * vector (after normalization with regards to modulus). - */ - Zf(FFT)(t0, logn); - ni = fpr_inverse_of_q; - memcpy(t1, t0, n * sizeof *t0); - Zf(poly_mul_fft)(t1, b01, logn); - Zf(poly_mulconst)(t1, fpr_neg(ni), logn); - Zf(poly_mul_fft)(t0, b11, logn); - Zf(poly_mulconst)(t0, ni, logn); - - tx = t1 + n; - ty = tx + n; - - /* - * Apply sampling. Output is written back in [tx, ty]. - */ - ffSampling_fft(samp, samp_ctx, tx, ty, tree, t0, t1, logn, ty + n); - - /* - * Get the lattice point corresponding to that tiny vector. 
- */ - memcpy(t0, tx, n * sizeof *tx); - memcpy(t1, ty, n * sizeof *ty); - Zf(poly_mul_fft)(tx, b00, logn); - Zf(poly_mul_fft)(ty, b10, logn); - Zf(poly_add)(tx, ty, logn); - memcpy(ty, t0, n * sizeof *t0); - Zf(poly_mul_fft)(ty, b01, logn); - - memcpy(t0, tx, n * sizeof *tx); - Zf(poly_mul_fft)(t1, b11, logn); - Zf(poly_add)(t1, ty, logn); - - Zf(iFFT)(t0, logn); - Zf(iFFT)(t1, logn); - - /* - * Compute the signature. - */ - s1tmp = (int16_t *)tx; - sqn = 0; - ng = 0; - for (u = 0; u < n; u ++) { - int32_t z; - - z = (int32_t)hm[u] - (int32_t)fpr_rint(t0[u]); - sqn += (uint32_t)(z * z); - ng |= sqn; - s1tmp[u] = (int16_t)z; - } - sqn |= -(ng >> 31); - - /* - * With "normal" degrees (e.g. 512 or 1024), it is very - * improbable that the computed vector is not short enough; - * however, it may happen in practice for the very reduced - * versions (e.g. degree 16 or below). In that case, the caller - * will loop, and we must not write anything into s2[] because - * s2[] may overlap with the hashed message hm[] and we need - * hm[] for the next iteration. - */ - s2tmp = (int16_t *)tmp; - for (u = 0; u < n; u ++) { - s2tmp[u] = (int16_t)-fpr_rint(t1[u]); - } - if (Zf(is_short_half)(sqn, s2tmp, logn)) { - memcpy(s2, s2tmp, n * sizeof *s2); - memcpy(tmp, s1tmp, n * sizeof *s1tmp); - return 1; - } - return 0; -} - -/* - * Compute a signature: the signature contains two vectors, s1 and s2. - * The s1 vector is not returned. The squared norm of (s1,s2) is - * computed, and if it is short enough, then s2 is returned into the - * s2[] buffer, and 1 is returned; otherwise, s2[] is untouched and 0 is - * returned; the caller should then try again. - * - * tmp[] must have room for at least nine polynomials. 
- */ -static int -do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2, - const int8_t *restrict f, const int8_t *restrict g, - const int8_t *restrict F, const int8_t *restrict G, - const uint16_t *hm, unsigned logn, fpr *restrict tmp) -{ - size_t n, u; - fpr *t0, *t1, *tx, *ty; - fpr *b00, *b01, *b10, *b11, *g00, *g01, *g11; - fpr ni; - uint32_t sqn, ng; - int16_t *s1tmp, *s2tmp; - - n = MKN(logn); - - /* - * Lattice basis is B = [[g, -f], [G, -F]]. We convert it to FFT. - */ - b00 = tmp; - b01 = b00 + n; - b10 = b01 + n; - b11 = b10 + n; - smallints_to_fpr(b01, f, logn); - smallints_to_fpr(b00, g, logn); - smallints_to_fpr(b11, F, logn); - smallints_to_fpr(b10, G, logn); - Zf(FFT)(b01, logn); - Zf(FFT)(b00, logn); - Zf(FFT)(b11, logn); - Zf(FFT)(b10, logn); - Zf(poly_neg)(b01, logn); - Zf(poly_neg)(b11, logn); - - /* - * Compute the Gram matrix G = B·B*. Formulas are: - * g00 = b00*adj(b00) + b01*adj(b01) - * g01 = b00*adj(b10) + b01*adj(b11) - * g10 = b10*adj(b00) + b11*adj(b01) - * g11 = b10*adj(b10) + b11*adj(b11) - * - * For historical reasons, this implementation uses - * g00, g01 and g11 (upper triangle). g10 is not kept - * since it is equal to adj(g01). - * - * We _replace_ the matrix B with the Gram matrix, but we - * must keep b01 and b11 for computing the target vector. 
- */ - t0 = b11 + n; - t1 = t0 + n; - - memcpy(t0, b01, n * sizeof *b01); - Zf(poly_mulselfadj_fft)(t0, logn); // t0 <- b01*adj(b01) - - memcpy(t1, b00, n * sizeof *b00); - Zf(poly_muladj_fft)(t1, b10, logn); // t1 <- b00*adj(b10) - Zf(poly_mulselfadj_fft)(b00, logn); // b00 <- b00*adj(b00) - Zf(poly_add)(b00, t0, logn); // b00 <- g00 - memcpy(t0, b01, n * sizeof *b01); - Zf(poly_muladj_fft)(b01, b11, logn); // b01 <- b01*adj(b11) - Zf(poly_add)(b01, t1, logn); // b01 <- g01 - - Zf(poly_mulselfadj_fft)(b10, logn); // b10 <- b10*adj(b10) - memcpy(t1, b11, n * sizeof *b11); - Zf(poly_mulselfadj_fft)(t1, logn); // t1 <- b11*adj(b11) - Zf(poly_add)(b10, t1, logn); // b10 <- g11 - - /* - * We rename variables to make things clearer. The three elements - * of the Gram matrix uses the first 3*n slots of tmp[], followed - * by b11 and b01 (in that order). - */ - g00 = b00; - g01 = b01; - g11 = b10; - b01 = t0; - t0 = b01 + n; - t1 = t0 + n; - - /* - * Memory layout at that point: - * g00 g01 g11 b11 b01 t0 t1 - */ - - /* - * Set the target vector to [hm, 0] (hm is the hashed message). - */ - for (u = 0; u < n; u ++) { - t0[u] = fpr_of(hm[u]); - /* This is implicit. - t1[u] = fpr_zero; - */ - } - - /* - * Apply the lattice basis to obtain the real target - * vector (after normalization with regards to modulus). - */ - Zf(FFT)(t0, logn); - ni = fpr_inverse_of_q; - memcpy(t1, t0, n * sizeof *t0); - Zf(poly_mul_fft)(t1, b01, logn); - Zf(poly_mulconst)(t1, fpr_neg(ni), logn); - Zf(poly_mul_fft)(t0, b11, logn); - Zf(poly_mulconst)(t0, ni, logn); - - /* - * b01 and b11 can be discarded, so we move back (t0,t1). - * Memory layout is now: - * g00 g01 g11 t0 t1 - */ - memcpy(b11, t0, n * 2 * sizeof *t0); - t0 = g11 + n; - t1 = t0 + n; - - /* - * Apply sampling; result is written over (t0,t1). 
- */ - ffSampling_fft_dyntree(samp, samp_ctx, - t0, t1, g00, g01, g11, logn, t1 + n); - - /* - * We arrange the layout back to: - * b00 b01 b10 b11 t0 t1 - * - * We did not conserve the matrix basis, so we must recompute - * it now. - */ - b00 = tmp; - b01 = b00 + n; - b10 = b01 + n; - b11 = b10 + n; - memmove(b11 + n, t0, n * 2 * sizeof *t0); - t0 = b11 + n; - t1 = t0 + n; - smallints_to_fpr(b01, f, logn); - smallints_to_fpr(b00, g, logn); - smallints_to_fpr(b11, F, logn); - smallints_to_fpr(b10, G, logn); - Zf(FFT)(b01, logn); - Zf(FFT)(b00, logn); - Zf(FFT)(b11, logn); - Zf(FFT)(b10, logn); - Zf(poly_neg)(b01, logn); - Zf(poly_neg)(b11, logn); - tx = t1 + n; - ty = tx + n; - - /* - * Get the lattice point corresponding to that tiny vector. - */ - memcpy(tx, t0, n * sizeof *t0); - memcpy(ty, t1, n * sizeof *t1); - Zf(poly_mul_fft)(tx, b00, logn); - Zf(poly_mul_fft)(ty, b10, logn); - Zf(poly_add)(tx, ty, logn); - memcpy(ty, t0, n * sizeof *t0); - Zf(poly_mul_fft)(ty, b01, logn); - - memcpy(t0, tx, n * sizeof *tx); - Zf(poly_mul_fft)(t1, b11, logn); - Zf(poly_add)(t1, ty, logn); - Zf(iFFT)(t0, logn); - Zf(iFFT)(t1, logn); - - s1tmp = (int16_t *)tx; - sqn = 0; - ng = 0; - for (u = 0; u < n; u ++) { - int32_t z; - - z = (int32_t)hm[u] - (int32_t)fpr_rint(t0[u]); - sqn += (uint32_t)(z * z); - ng |= sqn; - s1tmp[u] = (int16_t)z; - } - sqn |= -(ng >> 31); - - /* - * With "normal" degrees (e.g. 512 or 1024), it is very - * improbable that the computed vector is not short enough; - * however, it may happen in practice for the very reduced - * versions (e.g. degree 16 or below). In that case, the caller - * will loop, and we must not write anything into s2[] because - * s2[] may overlap with the hashed message hm[] and we need - * hm[] for the next iteration. 
- */ - s2tmp = (int16_t *)tmp; - for (u = 0; u < n; u ++) { - s2tmp[u] = (int16_t)-fpr_rint(t1[u]); - } - if (Zf(is_short_half)(sqn, s2tmp, logn)) { - memcpy(s2, s2tmp, n * sizeof *s2); - memcpy(tmp, s1tmp, n * sizeof *s1tmp); - return 1; - } - return 0; -} - -/* - * Sample an integer value along a half-gaussian distribution centered - * on zero and standard deviation 1.8205, with a precision of 72 bits. - */ -TARGET_AVX2 -int -Zf(gaussian0_sampler)(prng *p) -{ -#if FALCON_AVX2 // yyyAVX2+1 - - /* - * High words. - */ - static const union { - uint16_t u16[16]; - __m256i ymm[1]; - } rhi15 = { - { - 0x51FB, 0x2A69, 0x113E, 0x0568, - 0x014A, 0x003B, 0x0008, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000 - } - }; - - static const union { - uint64_t u64[20]; - __m256i ymm[5]; - } rlo57 = { - { - 0x1F42ED3AC391802, 0x12B181F3F7DDB82, - 0x1CDD0934829C1FF, 0x1754377C7994AE4, - 0x1846CAEF33F1F6F, 0x14AC754ED74BD5F, - 0x024DD542B776AE4, 0x1A1FFDC65AD63DA, - 0x01F80D88A7B6428, 0x001C3FDB2040C69, - 0x00012CF24D031FB, 0x00000949F8B091F, - 0x0000003665DA998, 0x00000000EBF6EBB, - 0x0000000002F5D7E, 0x000000000007098, - 0x0000000000000C6, 0x000000000000001, - 0x000000000000000, 0x000000000000000 - } - }; - - uint64_t lo; - unsigned hi; - __m256i xhi, rhi, gthi, eqhi, eqm; - __m256i xlo, gtlo0, gtlo1, gtlo2, gtlo3, gtlo4; - __m128i t, zt; - int r; - - /* - * Get a 72-bit random value and split it into a low part - * (57 bits) and a high part (15 bits) - */ - lo = prng_get_u64(p); - hi = prng_get_u8(p); - hi = (hi << 7) | (unsigned)(lo >> 57); - lo &= 0x1FFFFFFFFFFFFFF; - - /* - * Broadcast the high part and compare it with the relevant - * values. We need both a "greater than" and an "equal" - * comparisons. 
- */ - xhi = _mm256_broadcastw_epi16(_mm_cvtsi32_si128(hi)); - rhi = _mm256_loadu_si256(&rhi15.ymm[0]); - gthi = _mm256_cmpgt_epi16(rhi, xhi); - eqhi = _mm256_cmpeq_epi16(rhi, xhi); - - /* - * The result is the number of 72-bit values (among the list of 19) - * which are greater than the 72-bit random value. We first count - * all non-zero 16-bit elements in the first eight of gthi. Such - * elements have value -1 or 0, so we first negate them. - */ - t = _mm_srli_epi16(_mm256_castsi256_si128(gthi), 15); - zt = _mm_setzero_si128(); - t = _mm_hadd_epi16(t, zt); - t = _mm_hadd_epi16(t, zt); - t = _mm_hadd_epi16(t, zt); - r = _mm_cvtsi128_si32(t); - - /* - * We must look at the low bits for all values for which the - * high bits are an "equal" match; values 8-18 all have the - * same high bits (0). - * On 32-bit systems, 'lo' really is two registers, requiring - * some extra code. - */ -#if defined(__x86_64__) || defined(_M_X64) - xlo = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(*(int64_t *)&lo)); -#else - { - uint32_t e0, e1; - int32_t f0, f1; - - e0 = (uint32_t)lo; - e1 = (uint32_t)(lo >> 32); - f0 = *(int32_t *)&e0; - f1 = *(int32_t *)&e1; - xlo = _mm256_set_epi32(f1, f0, f1, f0, f1, f0, f1, f0); - } -#endif - gtlo0 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[0]), xlo); - gtlo1 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[1]), xlo); - gtlo2 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[2]), xlo); - gtlo3 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[3]), xlo); - gtlo4 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[4]), xlo); - - /* - * Keep only comparison results that correspond to the non-zero - * elements in eqhi. 
- */ - gtlo0 = _mm256_and_si256(gtlo0, _mm256_cvtepi16_epi64( - _mm256_castsi256_si128(eqhi))); - gtlo1 = _mm256_and_si256(gtlo1, _mm256_cvtepi16_epi64( - _mm256_castsi256_si128(_mm256_bsrli_epi128(eqhi, 8)))); - eqm = _mm256_permute4x64_epi64(eqhi, 0xFF); - gtlo2 = _mm256_and_si256(gtlo2, eqm); - gtlo3 = _mm256_and_si256(gtlo3, eqm); - gtlo4 = _mm256_and_si256(gtlo4, eqm); - - /* - * Add all values to count the total number of "-1" elements. - * Since the first eight "high" words are all different, only - * one element (at most) in gtlo0:gtlo1 can be non-zero; however, - * if the high word of the random value is zero, then many - * elements of gtlo2:gtlo3:gtlo4 can be non-zero. - */ - gtlo0 = _mm256_or_si256(gtlo0, gtlo1); - gtlo0 = _mm256_add_epi64( - _mm256_add_epi64(gtlo0, gtlo2), - _mm256_add_epi64(gtlo3, gtlo4)); - t = _mm_add_epi64( - _mm256_castsi256_si128(gtlo0), - _mm256_extracti128_si256(gtlo0, 1)); - t = _mm_add_epi64(t, _mm_srli_si128(t, 8)); - r -= _mm_cvtsi128_si32(t); - - return r; - -#else // yyyAVX2+0 - - static const uint32_t dist[] = { - 10745844u, 3068844u, 3741698u, - 5559083u, 1580863u, 8248194u, - 2260429u, 13669192u, 2736639u, - 708981u, 4421575u, 10046180u, - 169348u, 7122675u, 4136815u, - 30538u, 13063405u, 7650655u, - 4132u, 14505003u, 7826148u, - 417u, 16768101u, 11363290u, - 31u, 8444042u, 8086568u, - 1u, 12844466u, 265321u, - 0u, 1232676u, 13644283u, - 0u, 38047u, 9111839u, - 0u, 870u, 6138264u, - 0u, 14u, 12545723u, - 0u, 0u, 3104126u, - 0u, 0u, 28824u, - 0u, 0u, 198u, - 0u, 0u, 1u - }; - - uint32_t v0, v1, v2, hi; - uint64_t lo; - size_t u; - int z; - - /* - * Get a random 72-bit value, into three 24-bit limbs v0..v2. - */ - lo = prng_get_u64(p); - hi = prng_get_u8(p); - v0 = (uint32_t)lo & 0xFFFFFF; - v1 = (uint32_t)(lo >> 24) & 0xFFFFFF; - v2 = (uint32_t)(lo >> 48) | (hi << 16); - - /* - * Sampled value is z, such that v0..v2 is lower than the first - * z elements of the table. 
- */ - z = 0; - for (u = 0; u < (sizeof dist) / sizeof(dist[0]); u += 3) { - uint32_t w0, w1, w2, cc; - - w0 = dist[u + 2]; - w1 = dist[u + 1]; - w2 = dist[u + 0]; - cc = (v0 - w0) >> 31; - cc = (v1 - w1 - cc) >> 31; - cc = (v2 - w2 - cc) >> 31; - z += (int)cc; - } - return z; - -#endif // yyyAVX2- -} - -/* - * Sample a bit with probability exp(-x) for some x >= 0. - */ -TARGET_AVX2 -static int -BerExp(prng *p, fpr x, fpr ccs) -{ - int s, i; - fpr r; - uint32_t sw, w; - uint64_t z; - - /* - * Reduce x modulo log(2): x = s*log(2) + r, with s an integer, - * and 0 <= r < log(2). Since x >= 0, we can use fpr_trunc(). - */ - s = (int)fpr_trunc(fpr_mul(x, fpr_inv_log2)); - r = fpr_sub(x, fpr_mul(fpr_of(s), fpr_log2)); - - /* - * It may happen (quite rarely) that s >= 64; if sigma = 1.2 - * (the minimum value for sigma), r = 0 and b = 1, then we get - * s >= 64 if the half-Gaussian produced a z >= 13, which happens - * with probability about 0.000000000230383991, which is - * approximatively equal to 2^(-32). In any case, if s >= 64, - * then BerExp will be non-zero with probability less than - * 2^(-64), so we can simply saturate s at 63. - */ - sw = (uint32_t)s; - sw ^= (sw ^ 63) & -((63 - sw) >> 31); - s = (int)sw; - - /* - * Compute exp(-r); we know that 0 <= r < log(2) at this point, so - * we can use fpr_expm_p63(), which yields a result scaled to 2^63. - * We scale it up to 2^64, then right-shift it by s bits because - * we really want exp(-x) = 2^(-s)*exp(-r). - * - * The "-1" operation makes sure that the value fits on 64 bits - * (i.e. if r = 0, we may get 2^64, and we prefer 2^64-1 in that - * case). The bias is negligible since fpr_expm_p63() only computes - * with 51 bits of precision or so. - */ - z = ((fpr_expm_p63(r, ccs) << 1) - 1) >> s; - - /* - * Sample a bit with probability exp(-x). 
Since x = s*log(2) + r, - * exp(-x) = 2^-s * exp(-r), we compare lazily exp(-x) with the - * PRNG output to limit its consumption, the sign of the difference - * yields the expected result. - */ - i = 64; - do { - i -= 8; - w = prng_get_u8(p) - ((uint32_t)(z >> i) & 0xFF); - } while (!w && i > 0); - return (int)(w >> 31); -} - -/* - * The sampler produces a random integer that follows a discrete Gaussian - * distribution, centered on mu, and with standard deviation sigma. The - * provided parameter isigma is equal to 1/sigma. - * - * The value of sigma MUST lie between 1 and 2 (i.e. isigma lies between - * 0.5 and 1); in Falcon, sigma should always be between 1.2 and 1.9. - */ -TARGET_AVX2 -int -Zf(sampler)(void *ctx, fpr mu, fpr isigma) -{ - sampler_context *spc; - int s; - fpr r, dss, ccs; - - spc = ctx; - - /* - * Center is mu. We compute mu = s + r where s is an integer - * and 0 <= r < 1. - */ - s = (int)fpr_floor(mu); - r = fpr_sub(mu, fpr_of(s)); - - /* - * dss = 1/(2*sigma^2) = 0.5*(isigma^2). - */ - dss = fpr_half(fpr_sqr(isigma)); - - /* - * ccs = sigma_min / sigma = sigma_min * isigma. - */ - ccs = fpr_mul(isigma, spc->sigma_min); - - /* - * We now need to sample on center r. - */ - for (;;) { - int z0, z, b; - fpr x; - - /* - * Sample z for a Gaussian distribution. Then get a - * random bit b to turn the sampling into a bimodal - * distribution: if b = 1, we use z+1, otherwise we - * use -z. We thus have two situations: - * - * - b = 1: z >= 1 and sampled against a Gaussian - * centered on 1. - * - b = 0: z <= 0 and sampled against a Gaussian - * centered on 0. - */ - z0 = Zf(gaussian0_sampler)(&spc->p); - b = prng_get_u8(&spc->p) & 1; - z = b + ((b << 1) - 1) * z0; - - /* - * Rejection sampling. We want a Gaussian centered on r; - * but we sampled against a Gaussian centered on b (0 or - * 1). But we know that z is always in the range where - * our sampling distribution is greater than the Gaussian - * distribution, so rejection works. 
- * - * We got z with distribution: - * G(z) = exp(-((z-b)^2)/(2*sigma0^2)) - * We target distribution: - * S(z) = exp(-((z-r)^2)/(2*sigma^2)) - * Rejection sampling works by keeping the value z with - * probability S(z)/G(z), and starting again otherwise. - * This requires S(z) <= G(z), which is the case here. - * Thus, we simply need to keep our z with probability: - * P = exp(-x) - * where: - * x = ((z-r)^2)/(2*sigma^2) - ((z-b)^2)/(2*sigma0^2) - * - * Here, we scale up the Bernouilli distribution, which - * makes rejection more probable, but makes rejection - * rate sufficiently decorrelated from the Gaussian - * center and standard deviation that the whole sampler - * can be said to be constant-time. - */ - x = fpr_mul(fpr_sqr(fpr_sub(fpr_of(z), r)), dss); - x = fpr_sub(x, fpr_mul(fpr_of(z0 * z0), fpr_inv_2sqrsigma0)); - if (BerExp(&spc->p, x, ccs)) { - /* - * Rejection sampling was centered on r, but the - * actual center is mu = s + r. - */ - return s + z; - } - } -} - -/* see inner.h */ -void -Zf(sign_tree)(int16_t *sig, inner_shake256_context *rng, - const fpr *restrict expanded_key, - const uint16_t *hm, unsigned logn, uint8_t *tmp) -{ - fpr *ftmp; - - ftmp = (fpr *)tmp; - for (;;) { - /* - * Signature produces short vectors s1 and s2. The - * signature is acceptable only if the aggregate vector - * s1,s2 is short; we must use the same bound as the - * verifier. - * - * If the signature is acceptable, then we return only s2 - * (the verifier recomputes s1 from s2, the hashed message, - * and the public key). - */ - sampler_context spc; - samplerZ samp; - void *samp_ctx; - - /* - * Normal sampling. We use a fast PRNG seeded from our - * SHAKE context ('rng'). - */ - spc.sigma_min = (logn == 10) - ? fpr_sigma_min_10 - : fpr_sigma_min_9; - Zf(prng_init)(&spc.p, rng); - samp = Zf(sampler); - samp_ctx = &spc; - - /* - * Do the actual signature. 
- */ - if (do_sign_tree(samp, samp_ctx, sig, - expanded_key, hm, logn, ftmp)) - { - break; - } - } -} - -/* see inner.h */ -void -Zf(sign_dyn)(int16_t *sig, inner_shake256_context *rng, - const int8_t *restrict f, const int8_t *restrict g, - const int8_t *restrict F, const int8_t *restrict G, - const uint16_t *hm, unsigned logn, uint8_t *tmp) -{ - fpr *ftmp; - - ftmp = (fpr *)tmp; - for (;;) { - /* - * Signature produces short vectors s1 and s2. The - * signature is acceptable only if the aggregate vector - * s1,s2 is short; we must use the same bound as the - * verifier. - * - * If the signature is acceptable, then we return only s2 - * (the verifier recomputes s1 from s2, the hashed message, - * and the public key). - */ - sampler_context spc; - samplerZ samp; - void *samp_ctx; - - /* - * Normal sampling. We use a fast PRNG seeded from our - * SHAKE context ('rng'). - */ - spc.sigma_min = (logn == 10) - ? fpr_sigma_min_10 - : fpr_sigma_min_9; - Zf(prng_init)(&spc.p, rng); - samp = Zf(sampler); - samp_ctx = &spc; - - /* - * Do the actual signature. - */ - if (do_sign_dyn(samp, samp_ctx, sig, - f, g, F, G, hm, logn, ftmp)) - { - break; - } - } -} diff --git a/crypto_sign/falcon-512-tree/m4-ct/vrfy.c b/crypto_sign/falcon-512-tree/m4-ct/vrfy.c deleted file mode 100644 index c74a3dd3..00000000 --- a/crypto_sign/falcon-512-tree/m4-ct/vrfy.c +++ /dev/null @@ -1,871 +0,0 @@ -/* - * Falcon signature verification. 
- * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include "inner.h" - -/* ===================================================================== */ -/* - * Constants for NTT. - * - * n = 2^logn (2 <= n <= 1024) - * phi = X^n + 1 - * q = 12289 - * q0i = -1/q mod 2^16 - * R = 2^16 mod q - * R2 = 2^32 mod q - */ - -#define Q 12289 -#define Q0I 12287 -#define R 4091 -#define R2 10952 - -/* - * Table for NTT, binary case: - * GMb[x] = R*(g^rev(x)) mod q - * where g = 7 (it is a 2048-th primitive root of 1 modulo q) - * and rev() is the bit-reversal function over 10 bits. 
- */ -static const uint16_t GMb[] = { - 4091, 7888, 11060, 11208, 6960, 4342, 6275, 9759, - 1591, 6399, 9477, 5266, 586, 5825, 7538, 9710, - 1134, 6407, 1711, 965, 7099, 7674, 3743, 6442, - 10414, 8100, 1885, 1688, 1364, 10329, 10164, 9180, - 12210, 6240, 997, 117, 4783, 4407, 1549, 7072, - 2829, 6458, 4431, 8877, 7144, 2564, 5664, 4042, - 12189, 432, 10751, 1237, 7610, 1534, 3983, 7863, - 2181, 6308, 8720, 6570, 4843, 1690, 14, 3872, - 5569, 9368, 12163, 2019, 7543, 2315, 4673, 7340, - 1553, 1156, 8401, 11389, 1020, 2967, 10772, 7045, - 3316, 11236, 5285, 11578, 10637, 10086, 9493, 6180, - 9277, 6130, 3323, 883, 10469, 489, 1502, 2851, - 11061, 9729, 2742, 12241, 4970, 10481, 10078, 1195, - 730, 1762, 3854, 2030, 5892, 10922, 9020, 5274, - 9179, 3604, 3782, 10206, 3180, 3467, 4668, 2446, - 7613, 9386, 834, 7703, 6836, 3403, 5351, 12276, - 3580, 1739, 10820, 9787, 10209, 4070, 12250, 8525, - 10401, 2749, 7338, 10574, 6040, 943, 9330, 1477, - 6865, 9668, 3585, 6633, 12145, 4063, 3684, 7680, - 8188, 6902, 3533, 9807, 6090, 727, 10099, 7003, - 6945, 1949, 9731, 10559, 6057, 378, 7871, 8763, - 8901, 9229, 8846, 4551, 9589, 11664, 7630, 8821, - 5680, 4956, 6251, 8388, 10156, 8723, 2341, 3159, - 1467, 5460, 8553, 7783, 2649, 2320, 9036, 6188, - 737, 3698, 4699, 5753, 9046, 3687, 16, 914, - 5186, 10531, 4552, 1964, 3509, 8436, 7516, 5381, - 10733, 3281, 7037, 1060, 2895, 7156, 8887, 5357, - 6409, 8197, 2962, 6375, 5064, 6634, 5625, 278, - 932, 10229, 8927, 7642, 351, 9298, 237, 5858, - 7692, 3146, 12126, 7586, 2053, 11285, 3802, 5204, - 4602, 1748, 11300, 340, 3711, 4614, 300, 10993, - 5070, 10049, 11616, 12247, 7421, 10707, 5746, 5654, - 3835, 5553, 1224, 8476, 9237, 3845, 250, 11209, - 4225, 6326, 9680, 12254, 4136, 2778, 692, 8808, - 6410, 6718, 10105, 10418, 3759, 7356, 11361, 8433, - 6437, 3652, 6342, 8978, 5391, 2272, 6476, 7416, - 8418, 10824, 11986, 5733, 876, 7030, 2167, 2436, - 3442, 9217, 8206, 4858, 5964, 2746, 7178, 1434, - 7389, 8879, 10661, 11457, 4220, 
1432, 10832, 4328, - 8557, 1867, 9454, 2416, 3816, 9076, 686, 5393, - 2523, 4339, 6115, 619, 937, 2834, 7775, 3279, - 2363, 7488, 6112, 5056, 824, 10204, 11690, 1113, - 2727, 9848, 896, 2028, 5075, 2654, 10464, 7884, - 12169, 5434, 3070, 6400, 9132, 11672, 12153, 4520, - 1273, 9739, 11468, 9937, 10039, 9720, 2262, 9399, - 11192, 315, 4511, 1158, 6061, 6751, 11865, 357, - 7367, 4550, 983, 8534, 8352, 10126, 7530, 9253, - 4367, 5221, 3999, 8777, 3161, 6990, 4130, 11652, - 3374, 11477, 1753, 292, 8681, 2806, 10378, 12188, - 5800, 11811, 3181, 1988, 1024, 9340, 2477, 10928, - 4582, 6750, 3619, 5503, 5233, 2463, 8470, 7650, - 7964, 6395, 1071, 1272, 3474, 11045, 3291, 11344, - 8502, 9478, 9837, 1253, 1857, 6233, 4720, 11561, - 6034, 9817, 3339, 1797, 2879, 6242, 5200, 2114, - 7962, 9353, 11363, 5475, 6084, 9601, 4108, 7323, - 10438, 9471, 1271, 408, 6911, 3079, 360, 8276, - 11535, 9156, 9049, 11539, 850, 8617, 784, 7919, - 8334, 12170, 1846, 10213, 12184, 7827, 11903, 5600, - 9779, 1012, 721, 2784, 6676, 6552, 5348, 4424, - 6816, 8405, 9959, 5150, 2356, 5552, 5267, 1333, - 8801, 9661, 7308, 5788, 4910, 909, 11613, 4395, - 8238, 6686, 4302, 3044, 2285, 12249, 1963, 9216, - 4296, 11918, 695, 4371, 9793, 4884, 2411, 10230, - 2650, 841, 3890, 10231, 7248, 8505, 11196, 6688, - 4059, 6060, 3686, 4722, 11853, 5816, 7058, 6868, - 11137, 7926, 4894, 12284, 4102, 3908, 3610, 6525, - 7938, 7982, 11977, 6755, 537, 4562, 1623, 8227, - 11453, 7544, 906, 11816, 9548, 10858, 9703, 2815, - 11736, 6813, 6979, 819, 8903, 6271, 10843, 348, - 7514, 8339, 6439, 694, 852, 5659, 2781, 3716, - 11589, 3024, 1523, 8659, 4114, 10738, 3303, 5885, - 2978, 7289, 11884, 9123, 9323, 11830, 98, 2526, - 2116, 4131, 11407, 1844, 3645, 3916, 8133, 2224, - 10871, 8092, 9651, 5989, 7140, 8480, 1670, 159, - 10923, 4918, 128, 7312, 725, 9157, 5006, 6393, - 3494, 6043, 10972, 6181, 11838, 3423, 10514, 7668, - 3693, 6658, 6905, 11953, 10212, 11922, 9101, 8365, - 5110, 45, 2400, 1921, 4377, 2720, 1695, 51, - 
2808, 650, 1896, 9997, 9971, 11980, 8098, 4833, - 4135, 4257, 5838, 4765, 10985, 11532, 590, 12198, - 482, 12173, 2006, 7064, 10018, 3912, 12016, 10519, - 11362, 6954, 2210, 284, 5413, 6601, 3865, 10339, - 11188, 6231, 517, 9564, 11281, 3863, 1210, 4604, - 8160, 11447, 153, 7204, 5763, 5089, 9248, 12154, - 11748, 1354, 6672, 179, 5532, 2646, 5941, 12185, - 862, 3158, 477, 7279, 5678, 7914, 4254, 302, - 2893, 10114, 6890, 9560, 9647, 11905, 4098, 9824, - 10269, 1353, 10715, 5325, 6254, 3951, 1807, 6449, - 5159, 1308, 8315, 3404, 1877, 1231, 112, 6398, - 11724, 12272, 7286, 1459, 12274, 9896, 3456, 800, - 1397, 10678, 103, 7420, 7976, 936, 764, 632, - 7996, 8223, 8445, 7758, 10870, 9571, 2508, 1946, - 6524, 10158, 1044, 4338, 2457, 3641, 1659, 4139, - 4688, 9733, 11148, 3946, 2082, 5261, 2036, 11850, - 7636, 12236, 5366, 2380, 1399, 7720, 2100, 3217, - 10912, 8898, 7578, 11995, 2791, 1215, 3355, 2711, - 2267, 2004, 8568, 10176, 3214, 2337, 1750, 4729, - 4997, 7415, 6315, 12044, 4374, 7157, 4844, 211, - 8003, 10159, 9290, 11481, 1735, 2336, 5793, 9875, - 8192, 986, 7527, 1401, 870, 3615, 8465, 2756, - 9770, 2034, 10168, 3264, 6132, 54, 2880, 4763, - 11805, 3074, 8286, 9428, 4881, 6933, 1090, 10038, - 2567, 708, 893, 6465, 4962, 10024, 2090, 5718, - 10743, 780, 4733, 4623, 2134, 2087, 4802, 884, - 5372, 5795, 5938, 4333, 6559, 7549, 5269, 10664, - 4252, 3260, 5917, 10814, 5768, 9983, 8096, 7791, - 6800, 7491, 6272, 1907, 10947, 6289, 11803, 6032, - 11449, 1171, 9201, 7933, 2479, 7970, 11337, 7062, - 8911, 6728, 6542, 8114, 8828, 6595, 3545, 4348, - 4610, 2205, 6999, 8106, 5560, 10390, 9321, 2499, - 2413, 7272, 6881, 10582, 9308, 9437, 3554, 3326, - 5991, 11969, 3415, 12283, 9838, 12063, 4332, 7830, - 11329, 6605, 12271, 2044, 11611, 7353, 11201, 11582, - 3733, 8943, 9978, 1627, 7168, 3935, 5050, 2762, - 7496, 10383, 755, 1654, 12053, 4952, 10134, 4394, - 6592, 7898, 7497, 8904, 12029, 3581, 10748, 5674, - 10358, 4901, 7414, 8771, 710, 6764, 8462, 7193, - 5371, 7274, 
11084, 290, 7864, 6827, 11822, 2509, - 6578, 4026, 5807, 1458, 5721, 5762, 4178, 2105, - 11621, 4852, 8897, 2856, 11510, 9264, 2520, 8776, - 7011, 2647, 1898, 7039, 5950, 11163, 5488, 6277, - 9182, 11456, 633, 10046, 11554, 5633, 9587, 2333, - 7008, 7084, 5047, 7199, 9865, 8997, 569, 6390, - 10845, 9679, 8268, 11472, 4203, 1997, 2, 9331, - 162, 6182, 2000, 3649, 9792, 6363, 7557, 6187, - 8510, 9935, 5536, 9019, 3706, 12009, 1452, 3067, - 5494, 9692, 4865, 6019, 7106, 9610, 4588, 10165, - 6261, 5887, 2652, 10172, 1580, 10379, 4638, 9949 -}; - -/* - * Table for inverse NTT, binary case: - * iGMb[x] = R*((1/g)^rev(x)) mod q - * Since g = 7, 1/g = 8778 mod 12289. - */ -static const uint16_t iGMb[] = { - 4091, 4401, 1081, 1229, 2530, 6014, 7947, 5329, - 2579, 4751, 6464, 11703, 7023, 2812, 5890, 10698, - 3109, 2125, 1960, 10925, 10601, 10404, 4189, 1875, - 5847, 8546, 4615, 5190, 11324, 10578, 5882, 11155, - 8417, 12275, 10599, 7446, 5719, 3569, 5981, 10108, - 4426, 8306, 10755, 4679, 11052, 1538, 11857, 100, - 8247, 6625, 9725, 5145, 3412, 7858, 5831, 9460, - 5217, 10740, 7882, 7506, 12172, 11292, 6049, 79, - 13, 6938, 8886, 5453, 4586, 11455, 2903, 4676, - 9843, 7621, 8822, 9109, 2083, 8507, 8685, 3110, - 7015, 3269, 1367, 6397, 10259, 8435, 10527, 11559, - 11094, 2211, 1808, 7319, 48, 9547, 2560, 1228, - 9438, 10787, 11800, 1820, 11406, 8966, 6159, 3012, - 6109, 2796, 2203, 1652, 711, 7004, 1053, 8973, - 5244, 1517, 9322, 11269, 900, 3888, 11133, 10736, - 4949, 7616, 9974, 4746, 10270, 126, 2921, 6720, - 6635, 6543, 1582, 4868, 42, 673, 2240, 7219, - 1296, 11989, 7675, 8578, 11949, 989, 10541, 7687, - 7085, 8487, 1004, 10236, 4703, 163, 9143, 4597, - 6431, 12052, 2991, 11938, 4647, 3362, 2060, 11357, - 12011, 6664, 5655, 7225, 5914, 9327, 4092, 5880, - 6932, 3402, 5133, 9394, 11229, 5252, 9008, 1556, - 6908, 4773, 3853, 8780, 10325, 7737, 1758, 7103, - 11375, 12273, 8602, 3243, 6536, 7590, 8591, 11552, - 6101, 3253, 9969, 9640, 4506, 3736, 6829, 10822, - 9130, 9948, 
3566, 2133, 3901, 6038, 7333, 6609, - 3468, 4659, 625, 2700, 7738, 3443, 3060, 3388, - 3526, 4418, 11911, 6232, 1730, 2558, 10340, 5344, - 5286, 2190, 11562, 6199, 2482, 8756, 5387, 4101, - 4609, 8605, 8226, 144, 5656, 8704, 2621, 5424, - 10812, 2959, 11346, 6249, 1715, 4951, 9540, 1888, - 3764, 39, 8219, 2080, 2502, 1469, 10550, 8709, - 5601, 1093, 3784, 5041, 2058, 8399, 11448, 9639, - 2059, 9878, 7405, 2496, 7918, 11594, 371, 7993, - 3073, 10326, 40, 10004, 9245, 7987, 5603, 4051, - 7894, 676, 11380, 7379, 6501, 4981, 2628, 3488, - 10956, 7022, 6737, 9933, 7139, 2330, 3884, 5473, - 7865, 6941, 5737, 5613, 9505, 11568, 11277, 2510, - 6689, 386, 4462, 105, 2076, 10443, 119, 3955, - 4370, 11505, 3672, 11439, 750, 3240, 3133, 754, - 4013, 11929, 9210, 5378, 11881, 11018, 2818, 1851, - 4966, 8181, 2688, 6205, 6814, 926, 2936, 4327, - 10175, 7089, 6047, 9410, 10492, 8950, 2472, 6255, - 728, 7569, 6056, 10432, 11036, 2452, 2811, 3787, - 945, 8998, 1244, 8815, 11017, 11218, 5894, 4325, - 4639, 3819, 9826, 7056, 6786, 8670, 5539, 7707, - 1361, 9812, 2949, 11265, 10301, 9108, 478, 6489, - 101, 1911, 9483, 3608, 11997, 10536, 812, 8915, - 637, 8159, 5299, 9128, 3512, 8290, 7068, 7922, - 3036, 4759, 2163, 3937, 3755, 11306, 7739, 4922, - 11932, 424, 5538, 6228, 11131, 7778, 11974, 1097, - 2890, 10027, 2569, 2250, 2352, 821, 2550, 11016, - 7769, 136, 617, 3157, 5889, 9219, 6855, 120, - 4405, 1825, 9635, 7214, 10261, 11393, 2441, 9562, - 11176, 599, 2085, 11465, 7233, 6177, 4801, 9926, - 9010, 4514, 9455, 11352, 11670, 6174, 7950, 9766, - 6896, 11603, 3213, 8473, 9873, 2835, 10422, 3732, - 7961, 1457, 10857, 8069, 832, 1628, 3410, 4900, - 10855, 5111, 9543, 6325, 7431, 4083, 3072, 8847, - 9853, 10122, 5259, 11413, 6556, 303, 1465, 3871, - 4873, 5813, 10017, 6898, 3311, 5947, 8637, 5852, - 3856, 928, 4933, 8530, 1871, 2184, 5571, 5879, - 3481, 11597, 9511, 8153, 35, 2609, 5963, 8064, - 1080, 12039, 8444, 3052, 3813, 11065, 6736, 8454, - 2340, 7651, 1910, 10709, 2117, 9637, 
6402, 6028, - 2124, 7701, 2679, 5183, 6270, 7424, 2597, 6795, - 9222, 10837, 280, 8583, 3270, 6753, 2354, 3779, - 6102, 4732, 5926, 2497, 8640, 10289, 6107, 12127, - 2958, 12287, 10292, 8086, 817, 4021, 2610, 1444, - 5899, 11720, 3292, 2424, 5090, 7242, 5205, 5281, - 9956, 2702, 6656, 735, 2243, 11656, 833, 3107, - 6012, 6801, 1126, 6339, 5250, 10391, 9642, 5278, - 3513, 9769, 3025, 779, 9433, 3392, 7437, 668, - 10184, 8111, 6527, 6568, 10831, 6482, 8263, 5711, - 9780, 467, 5462, 4425, 11999, 1205, 5015, 6918, - 5096, 3827, 5525, 11579, 3518, 4875, 7388, 1931, - 6615, 1541, 8708, 260, 3385, 4792, 4391, 5697, - 7895, 2155, 7337, 236, 10635, 11534, 1906, 4793, - 9527, 7239, 8354, 5121, 10662, 2311, 3346, 8556, - 707, 1088, 4936, 678, 10245, 18, 5684, 960, - 4459, 7957, 226, 2451, 6, 8874, 320, 6298, - 8963, 8735, 2852, 2981, 1707, 5408, 5017, 9876, - 9790, 2968, 1899, 6729, 4183, 5290, 10084, 7679, - 7941, 8744, 5694, 3461, 4175, 5747, 5561, 3378, - 5227, 952, 4319, 9810, 4356, 3088, 11118, 840, - 6257, 486, 6000, 1342, 10382, 6017, 4798, 5489, - 4498, 4193, 2306, 6521, 1475, 6372, 9029, 8037, - 1625, 7020, 4740, 5730, 7956, 6351, 6494, 6917, - 11405, 7487, 10202, 10155, 7666, 7556, 11509, 1546, - 6571, 10199, 2265, 7327, 5824, 11396, 11581, 9722, - 2251, 11199, 5356, 7408, 2861, 4003, 9215, 484, - 7526, 9409, 12235, 6157, 9025, 2121, 10255, 2519, - 9533, 3824, 8674, 11419, 10888, 4762, 11303, 4097, - 2414, 6496, 9953, 10554, 808, 2999, 2130, 4286, - 12078, 7445, 5132, 7915, 245, 5974, 4874, 7292, - 7560, 10539, 9952, 9075, 2113, 3721, 10285, 10022, - 9578, 8934, 11074, 9498, 294, 4711, 3391, 1377, - 9072, 10189, 4569, 10890, 9909, 6923, 53, 4653, - 439, 10253, 7028, 10207, 8343, 1141, 2556, 7601, - 8150, 10630, 8648, 9832, 7951, 11245, 2131, 5765, - 10343, 9781, 2718, 1419, 4531, 3844, 4066, 4293, - 11657, 11525, 11353, 4313, 4869, 12186, 1611, 10892, - 11489, 8833, 2393, 15, 10830, 5003, 17, 565, - 5891, 12177, 11058, 10412, 8885, 3974, 10981, 7130, - 5840, 10482, 
8338, 6035, 6964, 1574, 10936, 2020, - 2465, 8191, 384, 2642, 2729, 5399, 2175, 9396, - 11987, 8035, 4375, 6611, 5010, 11812, 9131, 11427, - 104, 6348, 9643, 6757, 12110, 5617, 10935, 541, - 135, 3041, 7200, 6526, 5085, 12136, 842, 4129, - 7685, 11079, 8426, 1008, 2725, 11772, 6058, 1101, - 1950, 8424, 5688, 6876, 12005, 10079, 5335, 927, - 1770, 273, 8377, 2271, 5225, 10283, 116, 11807, - 91, 11699, 757, 1304, 7524, 6451, 8032, 8154, - 7456, 4191, 309, 2318, 2292, 10393, 11639, 9481, - 12238, 10594, 9569, 7912, 10368, 9889, 12244, 7179, - 3924, 3188, 367, 2077, 336, 5384, 5631, 8596, - 4621, 1775, 8866, 451, 6108, 1317, 6246, 8795, - 5896, 7283, 3132, 11564, 4977, 12161, 7371, 1366, - 12130, 10619, 3809, 5149, 6300, 2638, 4197, 1418, - 10065, 4156, 8373, 8644, 10445, 882, 8158, 10173, - 9763, 12191, 459, 2966, 3166, 405, 5000, 9311, - 6404, 8986, 1551, 8175, 3630, 10766, 9265, 700, - 8573, 9508, 6630, 11437, 11595, 5850, 3950, 4775, - 11941, 1446, 6018, 3386, 11470, 5310, 5476, 553, - 9474, 2586, 1431, 2741, 473, 11383, 4745, 836, - 4062, 10666, 7727, 11752, 5534, 312, 4307, 4351, - 5764, 8679, 8381, 8187, 5, 7395, 4363, 1152, - 5421, 5231, 6473, 436, 7567, 8603, 6229, 8230 -}; - -/* - * Reduce a small signed integer modulo q. The source integer MUST - * be between -q/2 and +q/2. - */ -static inline uint32_t -mq_conv_small(int x) -{ - /* - * If x < 0, the cast to uint32_t will set the high bit to 1. - */ - uint32_t y; - - y = (uint32_t)x; - y += Q & -(y >> 31); - return y; -} - -/* - * Addition modulo q. Operands must be in the 0..q-1 range. - */ -static inline uint32_t -mq_add(uint32_t x, uint32_t y) -{ - /* - * We compute x + y - q. If the result is negative, then the - * high bit will be set, and 'd >> 31' will be equal to 1; - * thus '-(d >> 31)' will be an all-one pattern. Otherwise, - * it will be an all-zero pattern. In other words, this - * implements a conditional addition of q. 
- */ - uint32_t d; - - d = x + y - Q; - d += Q & -(d >> 31); - return d; -} - -/* - * Subtraction modulo q. Operands must be in the 0..q-1 range. - */ -static inline uint32_t -mq_sub(uint32_t x, uint32_t y) -{ - /* - * As in mq_add(), we use a conditional addition to ensure the - * result is in the 0..q-1 range. - */ - uint32_t d; - - d = x - y; - d += Q & -(d >> 31); - return d; -} - -/* - * Division by 2 modulo q. Operand must be in the 0..q-1 range. - */ -static inline uint32_t -mq_rshift1(uint32_t x) -{ - x += Q & -(x & 1); - return (x >> 1); -} - -/* - * Montgomery multiplication modulo q. If we set R = 2^16 mod q, then - * this function computes: x * y / R mod q - * Operands must be in the 0..q-1 range. - */ -static inline uint32_t -mq_montymul(uint32_t x, uint32_t y) -{ - uint32_t z, w; - - /* - * We compute x*y + k*q with a value of k chosen so that the 16 - * low bits of the result are 0. We can then shift the value. - * After the shift, result may still be larger than q, but it - * will be lower than 2*q, so a conditional subtraction works. - */ - - z = x * y; - w = ((z * Q0I) & 0xFFFF) * Q; - - /* - * When adding z and w, the result will have its low 16 bits - * equal to 0. Since x, y and z are lower than q, the sum will - * be no more than (2^15 - 1) * q + (q - 1)^2, which will - * fit on 29 bits. - */ - z = (z + w) >> 16; - - /* - * After the shift, analysis shows that the value will be less - * than 2q. We do a subtraction then conditional subtraction to - * ensure the result is in the expected range. - */ - z -= Q; - z += Q & -(z >> 31); - return z; -} - -/* - * Montgomery squaring (computes (x^2)/R). - */ -static inline uint32_t -mq_montysqr(uint32_t x) -{ - return mq_montymul(x, x); -} - -/* - * Divide x by y modulo q = 12289. - */ -static inline uint32_t -mq_div_12289(uint32_t x, uint32_t y) -{ - /* - * We invert y by computing y^(q-2) mod q. 
- * - * We use the following addition chain for exponent e = 12287: - * - * e0 = 1 - * e1 = 2 * e0 = 2 - * e2 = e1 + e0 = 3 - * e3 = e2 + e1 = 5 - * e4 = 2 * e3 = 10 - * e5 = 2 * e4 = 20 - * e6 = 2 * e5 = 40 - * e7 = 2 * e6 = 80 - * e8 = 2 * e7 = 160 - * e9 = e8 + e2 = 163 - * e10 = e9 + e8 = 323 - * e11 = 2 * e10 = 646 - * e12 = 2 * e11 = 1292 - * e13 = e12 + e9 = 1455 - * e14 = 2 * e13 = 2910 - * e15 = 2 * e14 = 5820 - * e16 = e15 + e10 = 6143 - * e17 = 2 * e16 = 12286 - * e18 = e17 + e0 = 12287 - * - * Additions on exponents are converted to Montgomery - * multiplications. We define all intermediate results as so - * many local variables, and let the C compiler work out which - * must be kept around. - */ - uint32_t y0, y1, y2, y3, y4, y5, y6, y7, y8, y9; - uint32_t y10, y11, y12, y13, y14, y15, y16, y17, y18; - - y0 = mq_montymul(y, R2); - y1 = mq_montysqr(y0); - y2 = mq_montymul(y1, y0); - y3 = mq_montymul(y2, y1); - y4 = mq_montysqr(y3); - y5 = mq_montysqr(y4); - y6 = mq_montysqr(y5); - y7 = mq_montysqr(y6); - y8 = mq_montysqr(y7); - y9 = mq_montymul(y8, y2); - y10 = mq_montymul(y9, y8); - y11 = mq_montysqr(y10); - y12 = mq_montysqr(y11); - y13 = mq_montymul(y12, y9); - y14 = mq_montysqr(y13); - y15 = mq_montysqr(y14); - y16 = mq_montymul(y15, y10); - y17 = mq_montysqr(y16); - y18 = mq_montymul(y17, y0); - - /* - * Final multiplication with x, which is not in Montgomery - * representation, computes the correct division result. - */ - return mq_montymul(y18, x); -} - -/* - * Compute NTT on a ring element. 
- */ -static void -mq_NTT(uint16_t *a, unsigned logn) -{ - size_t n, t, m; - - n = (size_t)1 << logn; - t = n; - for (m = 1; m < n; m <<= 1) { - size_t ht, i, j1; - - ht = t >> 1; - for (i = 0, j1 = 0; i < m; i ++, j1 += t) { - size_t j, j2; - uint32_t s; - - s = GMb[m + i]; - j2 = j1 + ht; - for (j = j1; j < j2; j ++) { - uint32_t u, v; - - u = a[j]; - v = mq_montymul(a[j + ht], s); - a[j] = (uint16_t)mq_add(u, v); - a[j + ht] = (uint16_t)mq_sub(u, v); - } - } - t = ht; - } -} - -/* - * Compute the inverse NTT on a ring element, binary case. - */ -static void -mq_iNTT(uint16_t *a, unsigned logn) -{ - size_t n, t, m; - uint32_t ni; - - n = (size_t)1 << logn; - t = 1; - m = n; - while (m > 1) { - size_t hm, dt, i, j1; - - hm = m >> 1; - dt = t << 1; - for (i = 0, j1 = 0; i < hm; i ++, j1 += dt) { - size_t j, j2; - uint32_t s; - - j2 = j1 + t; - s = iGMb[hm + i]; - for (j = j1; j < j2; j ++) { - uint32_t u, v, w; - - u = a[j]; - v = a[j + t]; - a[j] = (uint16_t)mq_add(u, v); - w = mq_sub(u, v); - a[j + t] = (uint16_t) - mq_montymul(w, s); - } - } - t = dt; - m = hm; - } - - /* - * To complete the inverse NTT, we must now divide all values by - * n (the vector size). We thus need the inverse of n, i.e. we - * need to divide 1 by 2 logn times. But we also want it in - * Montgomery representation, i.e. we also want to multiply it - * by R = 2^16. In the common case, this should be a simple right - * shift. The loop below is generic and works also in corner cases; - * its computation time is negligible. - */ - ni = R; - for (m = n; m > 1; m >>= 1) { - ni = mq_rshift1(ni); - } - for (m = 0; m < n; m ++) { - a[m] = (uint16_t)mq_montymul(a[m], ni); - } -} - -/* - * Convert a polynomial (mod q) to Montgomery representation. 
- */ -static void -mq_poly_tomonty(uint16_t *f, unsigned logn) -{ - size_t u, n; - - n = (size_t)1 << logn; - for (u = 0; u < n; u ++) { - f[u] = (uint16_t)mq_montymul(f[u], R2); - } -} - -/* - * Multiply two polynomials together (NTT representation, and using - * a Montgomery multiplication). Result f*g is written over f. - */ -static void -mq_poly_montymul_ntt(uint16_t *f, const uint16_t *g, unsigned logn) -{ - size_t u, n; - - n = (size_t)1 << logn; - for (u = 0; u < n; u ++) { - f[u] = (uint16_t)mq_montymul(f[u], g[u]); - } -} - -/* - * Subtract polynomial g from polynomial f. - */ -static void -mq_poly_sub(uint16_t *f, const uint16_t *g, unsigned logn) -{ - size_t u, n; - - n = (size_t)1 << logn; - for (u = 0; u < n; u ++) { - f[u] = (uint16_t)mq_sub(f[u], g[u]); - } -} - -/* ===================================================================== */ - -/* see inner.h */ -void -Zf(to_ntt_monty)(uint16_t *h, unsigned logn) -{ - mq_NTT(h, logn); - mq_poly_tomonty(h, logn); -} - -/* see inner.h */ -int -Zf(verify_raw)(const uint16_t *c0, const int16_t *s2, - const uint16_t *h, unsigned logn, uint8_t *tmp) -{ - size_t u, n; - uint16_t *tt; - - n = (size_t)1 << logn; - tt = (uint16_t *)tmp; - - /* - * Reduce s2 elements modulo q ([0..q-1] range). - */ - for (u = 0; u < n; u ++) { - uint32_t w; - - w = (uint32_t)s2[u]; - w += Q & -(w >> 31); - tt[u] = (uint16_t)w; - } - - /* - * Compute -s1 = s2*h - c0 mod phi mod q (in tt[]). - */ - mq_NTT(tt, logn); - mq_poly_montymul_ntt(tt, h, logn); - mq_iNTT(tt, logn); - mq_poly_sub(tt, c0, logn); - - /* - * Normalize -s1 elements into the [-q/2..q/2] range. - */ - for (u = 0; u < n; u ++) { - int32_t w; - - w = (int32_t)tt[u]; - w -= (int32_t)(Q & -(((Q >> 1) - (uint32_t)w) >> 31)); - ((int16_t *)tt)[u] = (int16_t)w; - } - - /* - * Signature is valid if and only if the aggregate (-s1,s2) vector - * is short enough. 
- */ - return Zf(is_short)((int16_t *)tt, s2, logn); -} - -/* see inner.h */ -int -Zf(compute_public)(uint16_t *h, - const int8_t *f, const int8_t *g, unsigned logn, uint8_t *tmp) -{ - size_t u, n; - uint16_t *tt; - - n = (size_t)1 << logn; - tt = (uint16_t *)tmp; - for (u = 0; u < n; u ++) { - tt[u] = (uint16_t)mq_conv_small(f[u]); - h[u] = (uint16_t)mq_conv_small(g[u]); - } - mq_NTT(h, logn); - mq_NTT(tt, logn); - for (u = 0; u < n; u ++) { - if (tt[u] == 0) { - return 0; - } - h[u] = (uint16_t)mq_div_12289(h[u], tt[u]); - } - mq_iNTT(h, logn); - return 1; -} - -/* see inner.h */ -int -Zf(complete_private)(int8_t *G, - const int8_t *f, const int8_t *g, const int8_t *F, - unsigned logn, uint8_t *tmp) -{ - size_t u, n; - uint16_t *t1, *t2; - - n = (size_t)1 << logn; - t1 = (uint16_t *)tmp; - t2 = t1 + n; - for (u = 0; u < n; u ++) { - t1[u] = (uint16_t)mq_conv_small(g[u]); - t2[u] = (uint16_t)mq_conv_small(F[u]); - } - mq_NTT(t1, logn); - mq_NTT(t2, logn); - mq_poly_tomonty(t1, logn); - mq_poly_montymul_ntt(t1, t2, logn); - for (u = 0; u < n; u ++) { - t2[u] = (uint16_t)mq_conv_small(f[u]); - } - mq_NTT(t2, logn); - for (u = 0; u < n; u ++) { - if (t2[u] == 0) { - return 0; - } - t1[u] = (uint16_t)mq_div_12289(t1[u], t2[u]); - } - mq_iNTT(t1, logn); - for (u = 0; u < n; u ++) { - uint32_t w; - int32_t gi; - - w = t1[u]; - w -= (Q & ~-((w - (Q >> 1)) >> 31)); - gi = *(int32_t *)&w; - if (gi < -127 || gi > +127) { - return 0; - } - G[u] = (int8_t)gi; - } - return 1; -} - -/* see inner.h */ -int -Zf(is_invertible)( - const int16_t *s2, unsigned logn, uint8_t *tmp) -{ - size_t u, n; - uint16_t *tt; - uint32_t r; - - n = (size_t)1 << logn; - tt = (uint16_t *)tmp; - for (u = 0; u < n; u ++) { - uint32_t w; - - w = (uint32_t)s2[u]; - w += Q & -(w >> 31); - tt[u] = (uint16_t)w; - } - mq_NTT(tt, logn); - r = 0; - for (u = 0; u < n; u ++) { - r |= (uint32_t)(tt[u] - 1); - } - return (int)(1u - (r >> 31)); -} - -/* see inner.h */ -int -Zf(verify_recover)(uint16_t *h, - const 
uint16_t *c0, const int16_t *s1, const int16_t *s2, - unsigned logn, uint8_t *tmp) -{ - size_t u, n; - uint16_t *tt; - uint32_t r; - - n = (size_t)1 << logn; - - /* - * Reduce elements of s1 and s2 modulo q; then write s2 into tt[] - * and c0 - s1 into h[]. - */ - tt = (uint16_t *)tmp; - for (u = 0; u < n; u ++) { - uint32_t w; - - w = (uint32_t)s2[u]; - w += Q & -(w >> 31); - tt[u] = (uint16_t)w; - - w = (uint32_t)s1[u]; - w += Q & -(w >> 31); - w = mq_sub(c0[u], w); - h[u] = (uint16_t)w; - } - - /* - * Compute h = (c0 - s1) / s2. If one of the coefficients of s2 - * is zero (in NTT representation) then the operation fails. We - * keep that information into a flag so that we do not deviate - * from strict constant-time processing; if all coefficients of - * s2 are non-zero, then the high bit of r will be zero. - */ - mq_NTT(tt, logn); - mq_NTT(h, logn); - r = 0; - for (u = 0; u < n; u ++) { - r |= (uint32_t)(tt[u] - 1); - h[u] = (uint16_t)mq_div_12289(h[u], tt[u]); - } - mq_iNTT(h, logn); - - /* - * Signature is acceptable if and only if it is short enough, - * and s2 was invertible mod phi mod q. The caller must still - * check that the rebuilt public key matches the expected - * value (e.g. through a hash). 
- */ - r = ~r & (uint32_t)-Zf(is_short)(s1, s2, logn); - return (int)(r >> 31); -} - -/* see inner.h */ -int -Zf(count_nttzero)(const int16_t *sig, unsigned logn, uint8_t *tmp) -{ - uint16_t *s2; - size_t u, n; - uint32_t r; - - n = (size_t)1 << logn; - s2 = (uint16_t *)tmp; - for (u = 0; u < n; u ++) { - uint32_t w; - - w = (uint32_t)sig[u]; - w += Q & -(w >> 31); - s2[u] = (uint16_t)w; - } - mq_NTT(s2, logn); - r = 0; - for (u = 0; u < n; u ++) { - uint32_t w; - - w = (uint32_t)s2[u] - 1u; - r += (w >> 31); - } - return (int)r; -} diff --git a/crypto_sign/falcon-512/m4-ct/README.txt b/crypto_sign/falcon-512/m4-ct/README.txt deleted file mode 100644 index 7bedf7f1..00000000 --- a/crypto_sign/falcon-512/m4-ct/README.txt +++ /dev/null @@ -1,137 +0,0 @@ -Falcon implementation for PQM4 (or even mupq in general). - - -There are multiple variants. Each variant is selected with the choice of -api.h (four choices: api512dyn.h, api512tree.h, api1024dyn.h, -api1024tree.h), and additional compile-time macro that are documented in -config.h and can be set either in config.h, or through command-line -flags passed to the C compiler. - -Choice of api.h: - - api512dyn.h - "Normal" Falcon-512. Private key is reasonably compact. The - Falcon LDL tree is internally recomputed for each signature. - - api512tree.h - Falcon-512 is key expansion. The Falcon LDL tree is computed - as part of the keygen, and returned as private key. This - speeds up signature generation, but also greatly enlarges - the private key size. - - api1024dyn.h - "Normal" Falcon-1024. - - api1024tree.h - Falcon-1024 with key expansion. - -Compile-time options (config.h): - - FALCON_FPEMU - Set to 1 to enable use of the internal constant-time emulation - of floating-point operations. - - FALCON_FPNATIVE - Set to 1 to use the native 'double' type and floating-point - operations. 
On architectures that lack a FPU, this will use the - compiler-provided floating-point emulation routines, which are - usually not constant-time (and sometimes return values which - do not follow IEEE-754 rounding rules). - - FALCON_ASM_CORTEXM4 - Set to 1 to use the M4 assembly routine for the constant-time - emulation of floating-point operations. These are faster than - the generic routines in C activated by FALCON_FPEMU. - -There is some internal autodetection that tries to select the right -values automatically, but it's safer to explicitly select things: - - To use the native 'double' type: - -DFALCON_FPNATIVE=1 - - To use the generic FP emulation code: - -DFALCON_FPEMU=1 -DFALCON_ASM_CORTEXM4=0 - - To use the M4 assembly code for FP emulation: - -DFALCON_FPEMU=1 -DFALCON_ASM_CORTEXM4=1 - -The code relying on the native 'double' type requires an implementation -that follows IEEE-754 rules with a 64-bit type. It works on 64-bit x86 -and PowerPC / POWER systems. On 32-bit x86, it tends to fail because the -80387 FPU is used with more precision; on such a system, use -'-msse2 -mfpmath=sse' to force use of the SSE2 unit (this might be the -default on some systems, e.g. Darwin / macOS). - - -IMPORTANT NOTES -=============== - - * The PQM4 API is implemented in pqm4.c. Since the M4 stack is usually - small (usual default is 4 kB), temporary buffers are statically - allocated. This implies that the crypto_sign_keypair(), crypto_sign() - and crypto_sign_open() functions are not thread-safe or reentrant. - Also, the static allocation is "forever". - - See the comments for the 'tmp' variable in pqm4.c; this gives the - relevant sizes. - - * When using expanded keys, the private key contains 64-bit values - (floating-point, i.e. 'double' or 'uint64_t' depending on the kind - of floating-point emulation that is used). On many systems, this - implies some alignment requirements. I.e. 
crypto_sign_keypair() and - crypto_sign() then require the 'sk' pointer to be suitably aligned. - On an ARM Cortex M4, 32-bit alignment is required (while the basic - RAM access opcodes tolerate unaligned accesses, the 'ldm' and 'stm' - opcodes need 32-bit aligned pointers). - - * When using the native 'double' type, the code has a dependency on - the sqrt() function. On x86, the relevant SSE2 opcode is inlined, - but the library function is still (potentially) invoked in case the - operand is negative, so that proper error management is performed. - This case does not happen in Falcon, but the library function is - still referenced, and explicitly linking with '-lm' may be - necessary. - - * When using the native 'double' type, do _NOT_ enable -ffast-math. - The internal rounding function relies on the usual trick: - when x >= 0, round(x) = (x + 2**52) - 2**52 - - This trick works only as long as each addition is rounded as per - the IEEE-754 rules to the exact precision of the 64-bit type. - When -ffast-math is enabled, the compiler may assume commutativity - and "optimize" that expression into 'round(x) = x', which does not - work at all. - - -TESTS -===== - -In the 'tests/' directory is a generator for known-answer tests, and the -expected file. The code comes from the NIST, but was modified to avoid a -dependency on OpenSSL. When compiling the C source file against the -selected Falcon implementation, an executable is produced, that, when -executed, generates an '*.req' and an '*.rsp' files. The .req file is -redundant (the .rsp file contains all the information, and some more). - -The expected .rsp files are provided as: - KAT512dyn.rsp Falcon-512, no expanded key - KAT512tree.rsp Falcon-512, with expanded key - KAT1024dyn.rsp Falcon-1024, no expanded key - KAT1024tree.rsp Falcon-1024, with expanded key - - -Normally, all computations are exact and the files are exactly -reproducible. 
However, some discrepancies may occur with the '*tree' -files in the following cases: - - - On big-endian architectures, the bytes in sk[] will be in a - different order. This is a side effect of putting the raw bytes - of the expanded key in sk[] (this could be fixed with some - reencoding pass, but this was not implemented yet). - - - If a non-exact IEEE-754 implementation is used, some of the - low bits of the values may be changed. This may happen if the - underlying implementation is not strictly faithful to rounding. - -As long as only the 'sk' lines are changed, then the public keys -and signature values are unimpacted. diff --git a/crypto_sign/falcon-512/m4-ct/api.h b/crypto_sign/falcon-512/m4-ct/api.h deleted file mode 100644 index 9275eaf9..00000000 --- a/crypto_sign/falcon-512/m4-ct/api.h +++ /dev/null @@ -1,17 +0,0 @@ -#include - -#define CRYPTO_SECRETKEYBYTES 1281 -#define CRYPTO_PUBLICKEYBYTES 897 -#define CRYPTO_BYTES 690 - -#define CRYPTO_ALGNAME "Falcon-512" - -int crypto_sign_keypair(unsigned char *pk, unsigned char *sk); - -int crypto_sign(unsigned char *sm, size_t *smlen, - const unsigned char *m, size_t mlen, - const unsigned char *sk); - -int crypto_sign_open(unsigned char *m, size_t *mlen, - const unsigned char *sm, size_t smlen, - const unsigned char *pk); diff --git a/crypto_sign/falcon-512/m4-ct/codec.c b/crypto_sign/falcon-512/m4-ct/codec.c deleted file mode 100644 index 5bd61424..00000000 --- a/crypto_sign/falcon-512/m4-ct/codec.c +++ /dev/null @@ -1,559 +0,0 @@ -/* - * Encoding/decoding of keys and signatures. 
- * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include "inner.h" - -/* see inner.h */ -size_t -Zf(modq_encode)( - void *out, size_t max_out_len, - const uint16_t *x, unsigned logn) -{ - size_t n, out_len, u; - uint8_t *buf; - uint32_t acc; - int acc_len; - - n = (size_t)1 << logn; - for (u = 0; u < n; u ++) { - if (x[u] >= 12289) { - return 0; - } - } - out_len = ((n * 14) + 7) >> 3; - if (out == NULL) { - return out_len; - } - if (out_len > max_out_len) { - return 0; - } - buf = out; - acc = 0; - acc_len = 0; - for (u = 0; u < n; u ++) { - acc = (acc << 14) | x[u]; - acc_len += 14; - while (acc_len >= 8) { - acc_len -= 8; - *buf ++ = (uint8_t)(acc >> acc_len); - } - } - if (acc_len > 0) { - *buf = (uint8_t)(acc << (8 - acc_len)); - } - return out_len; -} - -/* see inner.h */ -size_t -Zf(modq_decode)( - uint16_t *x, unsigned logn, - const void *in, size_t max_in_len) -{ - size_t n, in_len, u; - const uint8_t *buf; - uint32_t acc; - int acc_len; - - n = (size_t)1 << logn; - in_len = ((n * 14) + 7) >> 3; - if (in_len > max_in_len) { - return 0; - } - buf = in; - acc = 0; - acc_len = 0; - u = 0; - while (u < n) { - acc = (acc << 8) | (*buf ++); - acc_len += 8; - if (acc_len >= 14) { - unsigned w; - - acc_len -= 14; - w = (acc >> acc_len) & 0x3FFF; - if (w >= 12289) { - return 0; - } - x[u ++] = (uint16_t)w; - } - } - if ((acc & (((uint32_t)1 << acc_len) - 1)) != 0) { - return 0; - } - return in_len; -} - -/* see inner.h */ -size_t -Zf(trim_i16_encode)( - void *out, size_t max_out_len, - const int16_t *x, unsigned logn, unsigned bits) -{ - size_t n, u, out_len; - int minv, maxv; - uint8_t *buf; - uint32_t acc, mask; - unsigned acc_len; - - n = (size_t)1 << logn; - maxv = (1 << (bits - 1)) - 1; - minv = -maxv; - for (u = 0; u < n; u ++) { - if (x[u] < minv || x[u] > maxv) { - return 0; - } - } - out_len = ((n * bits) + 7) >> 3; - if (out == NULL) { - return out_len; - } - if (out_len > max_out_len) { 
- return 0; - } - buf = out; - acc = 0; - acc_len = 0; - mask = ((uint32_t)1 << bits) - 1; - for (u = 0; u < n; u ++) { - acc = (acc << bits) | ((uint16_t)x[u] & mask); - acc_len += bits; - while (acc_len >= 8) { - acc_len -= 8; - *buf ++ = (uint8_t)(acc >> acc_len); - } - } - if (acc_len > 0) { - *buf ++ = (uint8_t)(acc << (8 - acc_len)); - } - return out_len; -} - -/* see inner.h */ -size_t -Zf(trim_i16_decode)( - int16_t *x, unsigned logn, unsigned bits, - const void *in, size_t max_in_len) -{ - size_t n, in_len; - const uint8_t *buf; - size_t u; - uint32_t acc, mask1, mask2; - unsigned acc_len; - - n = (size_t)1 << logn; - in_len = ((n * bits) + 7) >> 3; - if (in_len > max_in_len) { - return 0; - } - buf = in; - u = 0; - acc = 0; - acc_len = 0; - mask1 = ((uint32_t)1 << bits) - 1; - mask2 = (uint32_t)1 << (bits - 1); - while (u < n) { - acc = (acc << 8) | *buf ++; - acc_len += 8; - while (acc_len >= bits && u < n) { - uint32_t w; - - acc_len -= bits; - w = (acc >> acc_len) & mask1; - w |= -(w & mask2); - if (w == -mask2) { - /* - * The -2^(bits-1) value is forbidden. - */ - return 0; - } - w |= -(w & mask2); - x[u ++] = (int16_t)*(int32_t *)&w; - } - } - if ((acc & (((uint32_t)1 << acc_len) - 1)) != 0) { - /* - * Extra bits in the last byte must be zero. 
- */ - return 0; - } - return in_len; -} - -/* see inner.h */ -size_t -Zf(trim_i8_encode)( - void *out, size_t max_out_len, - const int8_t *x, unsigned logn, unsigned bits) -{ - size_t n, u, out_len; - int minv, maxv; - uint8_t *buf; - uint32_t acc, mask; - unsigned acc_len; - - n = (size_t)1 << logn; - maxv = (1 << (bits - 1)) - 1; - minv = -maxv; - for (u = 0; u < n; u ++) { - if (x[u] < minv || x[u] > maxv) { - return 0; - } - } - out_len = ((n * bits) + 7) >> 3; - if (out == NULL) { - return out_len; - } - if (out_len > max_out_len) { - return 0; - } - buf = out; - acc = 0; - acc_len = 0; - mask = ((uint32_t)1 << bits) - 1; - for (u = 0; u < n; u ++) { - acc = (acc << bits) | ((uint8_t)x[u] & mask); - acc_len += bits; - while (acc_len >= 8) { - acc_len -= 8; - *buf ++ = (uint8_t)(acc >> acc_len); - } - } - if (acc_len > 0) { - *buf ++ = (uint8_t)(acc << (8 - acc_len)); - } - return out_len; -} - -/* see inner.h */ -size_t -Zf(trim_i8_decode)( - int8_t *x, unsigned logn, unsigned bits, - const void *in, size_t max_in_len) -{ - size_t n, in_len; - const uint8_t *buf; - size_t u; - uint32_t acc, mask1, mask2; - unsigned acc_len; - - n = (size_t)1 << logn; - in_len = ((n * bits) + 7) >> 3; - if (in_len > max_in_len) { - return 0; - } - buf = in; - u = 0; - acc = 0; - acc_len = 0; - mask1 = ((uint32_t)1 << bits) - 1; - mask2 = (uint32_t)1 << (bits - 1); - while (u < n) { - acc = (acc << 8) | *buf ++; - acc_len += 8; - while (acc_len >= bits && u < n) { - uint32_t w; - - acc_len -= bits; - w = (acc >> acc_len) & mask1; - w |= -(w & mask2); - if (w == -mask2) { - /* - * The -2^(bits-1) value is forbidden. - */ - return 0; - } - x[u ++] = (int8_t)*(int32_t *)&w; - } - } - if ((acc & (((uint32_t)1 << acc_len) - 1)) != 0) { - /* - * Extra bits in the last byte must be zero. 
- */ - return 0; - } - return in_len; -} - -/* see inner.h */ -size_t -Zf(comp_encode)( - void *out, size_t max_out_len, - const int16_t *x, unsigned logn) -{ - uint8_t *buf; - size_t n, u, v; - uint32_t acc; - unsigned acc_len; - - n = (size_t)1 << logn; - buf = out; - - /* - * Make sure that all values are within the -2047..+2047 range. - */ - for (u = 0; u < n; u ++) { - if (x[u] < -2047 || x[u] > +2047) { - return 0; - } - } - - acc = 0; - acc_len = 0; - v = 0; - for (u = 0; u < n; u ++) { - int t; - unsigned w; - - /* - * Get sign and absolute value of next integer; push the - * sign bit. - */ - acc <<= 1; - t = x[u]; - if (t < 0) { - t = -t; - acc |= 1; - } - w = (unsigned)t; - - /* - * Push the low 7 bits of the absolute value. - */ - acc <<= 7; - acc |= w & 127u; - w >>= 7; - - /* - * We pushed exactly 8 bits. - */ - acc_len += 8; - - /* - * Push as many zeros as necessary, then a one. Since the - * absolute value is at most 2047, w can only range up to - * 15 at this point, thus we will add at most 16 bits - * here. With the 8 bits above and possibly up to 7 bits - * from previous iterations, we may go up to 31 bits, which - * will fit in the accumulator, which is an uint32_t. - */ - acc <<= (w + 1); - acc |= 1; - acc_len += w + 1; - - /* - * Produce all full bytes. - */ - while (acc_len >= 8) { - acc_len -= 8; - if (buf != NULL) { - if (v >= max_out_len) { - return 0; - } - buf[v] = (uint8_t)(acc >> acc_len); - } - v ++; - } - } - - /* - * Flush remaining bits (if any). 
- */ - if (acc_len > 0) { - if (buf != NULL) { - if (v >= max_out_len) { - return 0; - } - buf[v] = (uint8_t)(acc << (8 - acc_len)); - } - v ++; - } - - return v; -} - -/* see inner.h */ -size_t -Zf(comp_decode)( - int16_t *x, unsigned logn, - const void *in, size_t max_in_len) -{ - const uint8_t *buf; - size_t n, u, v; - uint32_t acc; - unsigned acc_len; - - n = (size_t)1 << logn; - buf = in; - acc = 0; - acc_len = 0; - v = 0; - for (u = 0; u < n; u ++) { - unsigned b, s, m; - - /* - * Get next eight bits: sign and low seven bits of the - * absolute value. - */ - if (v >= max_in_len) { - return 0; - } - acc = (acc << 8) | (uint32_t)buf[v ++]; - b = acc >> acc_len; - s = b & 128; - m = b & 127; - - /* - * Get next bits until a 1 is reached. - */ - for (;;) { - if (acc_len == 0) { - if (v >= max_in_len) { - return 0; - } - acc = (acc << 8) | (uint32_t)buf[v ++]; - acc_len = 8; - } - acc_len --; - if (((acc >> acc_len) & 1) != 0) { - break; - } - m += 128; - if (m > 2047) { - return 0; - } - } - x[u] = (int16_t)(s ? -(int)m : (int)m); - } - return v; -} - -/* - * Key elements and signatures are polynomials with small integer - * coefficients. Here are some statistics gathered over many - * generated key pairs (10000 or more for each degree): - * - * log(n) n max(f,g) std(f,g) max(F,G) std(F,G) - * 1 2 129 56.31 143 60.02 - * 2 4 123 40.93 160 46.52 - * 3 8 97 28.97 159 38.01 - * 4 16 100 21.48 154 32.50 - * 5 32 71 15.41 151 29.36 - * 6 64 59 11.07 138 27.77 - * 7 128 39 7.91 144 27.00 - * 8 256 32 5.63 148 26.61 - * 9 512 22 4.00 137 26.46 - * 10 1024 15 2.84 146 26.41 - * - * We want a compact storage format for private key, and, as part of - * key generation, we are allowed to reject some keys which would - * otherwise be fine (this does not induce any noticeable vulnerability - * as long as we reject only a small proportion of possible keys). 
- * Hence, we enforce at key generation time maximum values for the - * elements of f, g, F and G, so that their encoding can be expressed - * in fixed-width values. Limits have been chosen so that generated - * keys are almost always within bounds, thus not impacting neither - * security or performance. - * - * IMPORTANT: the code assumes that all coefficients of f, g, F and G - * ultimately fit in the -127..+127 range. Thus, none of the elements - * of max_fg_bits[] and max_FG_bits[] shall be greater than 8. - */ - -const uint8_t Zf(max_fg_bits)[] = { - 0, /* unused */ - 8, - 8, - 8, - 8, - 8, - 7, - 7, - 6, - 6, - 5 -}; - -const uint8_t Zf(max_FG_bits)[] = { - 0, /* unused */ - 8, - 8, - 8, - 8, - 8, - 8, - 8, - 8, - 8, - 8 -}; - -/* - * When generating a new key pair, we can always reject keys which - * feature an abnormally large coefficient. This can also be done for - * signatures, albeit with some care: in case the signature process is - * used in a derandomized setup (explicitly seeded with the message and - * private key), we have to follow the specification faithfully, and the - * specification only enforces a limit on the L2 norm of the signature - * vector. The limit on the L2 norm implies that the absolute value of - * a coefficient of the signature cannot be more than the following: - * - * log(n) n max sig coeff (theoretical) - * 1 2 412 - * 2 4 583 - * 3 8 824 - * 4 16 1166 - * 5 32 1649 - * 6 64 2332 - * 7 128 3299 - * 8 256 4665 - * 9 512 6598 - * 10 1024 9331 - * - * However, the largest observed signature coefficients during our - * experiments was 1077 (in absolute value), hence we can assume that, - * with overwhelming probability, signature coefficients will fit - * in -2047..2047, i.e. 12 bits. 
- */ - -const uint8_t Zf(max_sig_bits)[] = { - 0, /* unused */ - 10, - 11, - 11, - 12, - 12, - 12, - 12, - 12, - 12, - 12 -}; diff --git a/crypto_sign/falcon-512/m4-ct/common.c b/crypto_sign/falcon-512/m4-ct/common.c deleted file mode 100644 index ef30028b..00000000 --- a/crypto_sign/falcon-512/m4-ct/common.c +++ /dev/null @@ -1,297 +0,0 @@ -/* - * Support functions for signatures (hash-to-point, norm). - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include "inner.h" - -/* see inner.h */ -void -Zf(hash_to_point_vartime)( - inner_shake256_context *sc, - uint16_t *x, unsigned logn) -{ - /* - * This is the straightforward per-the-spec implementation. 
It - * is not constant-time, thus it might reveal information on the - * plaintext (at least, enough to check the plaintext against a - * list of potential plaintexts) in a scenario where the - * attacker does not have access to the signature value or to - * the public key, but knows the nonce (without knowledge of the - * nonce, the hashed output cannot be matched against potential - * plaintexts). - */ - size_t n; - - n = (size_t)1 << logn; - while (n > 0) { - uint8_t buf[2]; - uint32_t w; - - inner_shake256_extract(sc, (void *)buf, sizeof buf); - w = ((unsigned)buf[0] << 8) | (unsigned)buf[1]; - if (w < 61445) { - while (w >= 12289) { - w -= 12289; - } - *x ++ = (uint16_t)w; - n --; - } - } -} - -/* see inner.h */ -void -Zf(hash_to_point_ct)( - inner_shake256_context *sc, - uint16_t *x, unsigned logn, uint8_t *tmp) -{ - /* - * Each 16-bit sample is a value in 0..65535. The value is - * kept if it falls in 0..61444 (because 61445 = 5*12289) - * and rejected otherwise; thus, each sample has probability - * about 0.93758 of being selected. - * - * We want to oversample enough to be sure that we will - * have enough values with probability at least 1 - 2^(-256). - * Depending on degree N, this leads to the following - * required oversampling: - * - * logn n oversampling - * 1 2 65 - * 2 4 67 - * 3 8 71 - * 4 16 77 - * 5 32 86 - * 6 64 100 - * 7 128 122 - * 8 256 154 - * 9 512 205 - * 10 1024 287 - * - * If logn >= 7, then the provided temporary buffer is large - * enough. Otherwise, we use a stack buffer of 63 entries - * (i.e. 126 bytes) for the values that do not fit in tmp[]. - */ - - static const uint16_t overtab[] = { - 0, /* unused */ - 65, - 67, - 71, - 77, - 86, - 100, - 122, - 154, - 205, - 287 - }; - - unsigned n, n2, u, m, p, over; - uint16_t *tt1, tt2[63]; - - /* - * We first generate m 16-bit value. Values 0..n-1 go to x[]. - * Values n..2*n-1 go to tt1[]. Values 2*n and later go to tt2[]. 
- * We also reduce modulo q the values; rejected values are set - * to 0xFFFF. - */ - n = 1U << logn; - n2 = n << 1; - over = overtab[logn]; - m = n + over; - tt1 = (uint16_t *)tmp; - for (u = 0; u < m; u ++) { - uint8_t buf[2]; - uint32_t w, wr; - - inner_shake256_extract(sc, buf, sizeof buf); - w = ((uint32_t)buf[0] << 8) | (uint32_t)buf[1]; - wr = w - ((uint32_t)24578 & (((w - 24578) >> 31) - 1)); - wr = wr - ((uint32_t)24578 & (((wr - 24578) >> 31) - 1)); - wr = wr - ((uint32_t)12289 & (((wr - 12289) >> 31) - 1)); - wr |= ((w - 61445) >> 31) - 1; - if (u < n) { - x[u] = (uint16_t)wr; - } else if (u < n2) { - tt1[u - n] = (uint16_t)wr; - } else { - tt2[u - n2] = (uint16_t)wr; - } - } - - /* - * Now we must "squeeze out" the invalid values. We do this in - * a logarithmic sequence of passes; each pass computes where a - * value should go, and moves it down by 'p' slots if necessary, - * where 'p' uses an increasing powers-of-two scale. It can be - * shown that in all cases where the loop decides that a value - * has to be moved down by p slots, the destination slot is - * "free" (i.e. contains an invalid value). - */ - for (p = 1; p <= over; p <<= 1) { - unsigned v; - - /* - * In the loop below: - * - * - v contains the index of the final destination of - * the value; it is recomputed dynamically based on - * whether values are valid or not. - * - * - u is the index of the value we consider ("source"); - * its address is s. - * - * - The loop may swap the value with the one at index - * u-p. The address of the swap destination is d. - */ - v = 0; - for (u = 0; u < m; u ++) { - uint16_t *s, *d; - unsigned j, sv, dv, mk; - - if (u < n) { - s = &x[u]; - } else if (u < n2) { - s = &tt1[u - n]; - } else { - s = &tt2[u - n2]; - } - sv = *s; - - /* - * The value in sv should ultimately go to - * address v, i.e. jump back by u-v slots. - */ - j = u - v; - - /* - * We increment v for the next iteration, but - * only if the source value is valid. 
The mask - * 'mk' is -1 if the value is valid, 0 otherwise, - * so we _subtract_ mk. - */ - mk = (sv >> 15) - 1U; - v -= mk; - - /* - * In this loop we consider jumps by p slots; if - * u < p then there is nothing more to do. - */ - if (u < p) { - continue; - } - - /* - * Destination for the swap: value at address u-p. - */ - if ((u - p) < n) { - d = &x[u - p]; - } else if ((u - p) < n2) { - d = &tt1[(u - p) - n]; - } else { - d = &tt2[(u - p) - n2]; - } - dv = *d; - - /* - * The swap should be performed only if the source - * is valid AND the jump j has its 'p' bit set. - */ - mk &= -(((j & p) + 0x1FF) >> 9); - - *s = (uint16_t)(sv ^ (mk & (sv ^ dv))); - *d = (uint16_t)(dv ^ (mk & (sv ^ dv))); - } - } -} - -/* see inner.h */ -int -Zf(is_short)( - const int16_t *s1, const int16_t *s2, unsigned logn) -{ - /* - * We use the l2-norm. Code below uses only 32-bit operations to - * compute the square of the norm with saturation to 2^32-1 if - * the value exceeds 2^31-1. - */ - size_t n, u; - uint32_t s, ng; - - n = (size_t)1 << logn; - s = 0; - ng = 0; - for (u = 0; u < n; u ++) { - int32_t z; - - z = s1[u]; - s += (uint32_t)(z * z); - ng |= s; - z = s2[u]; - s += (uint32_t)(z * z); - ng |= s; - } - s |= -(ng >> 31); - - /* - * Acceptance bound on the l2-norm is: - * 1.2*1.55*sqrt(q)*sqrt(2*N) - * Value 7085 is floor((1.2^2)*(1.55^2)*2*1024). - */ - return s < (((uint32_t)7085 * (uint32_t)12289) >> (10 - logn)); -} - -/* see inner.h */ -int -Zf(is_short_half)( - uint32_t sqn, const int16_t *s2, unsigned logn) -{ - size_t n, u; - uint32_t ng; - - n = (size_t)1 << logn; - ng = -(sqn >> 31); - for (u = 0; u < n; u ++) { - int32_t z; - - z = s2[u]; - sqn += (uint32_t)(z * z); - ng |= sqn; - } - sqn |= -(ng >> 31); - - /* - * Acceptance bound on the l2-norm is: - * 1.2*1.55*sqrt(q)*sqrt(2*N) - * Value 7085 is floor((1.2^2)*(1.55^2)*2*1024). 
- */ - return sqn < (((uint32_t)7085 * (uint32_t)12289) >> (10 - logn)); -} diff --git a/crypto_sign/falcon-512/m4-ct/config.h b/crypto_sign/falcon-512/m4-ct/config.h deleted file mode 100644 index cd78727e..00000000 --- a/crypto_sign/falcon-512/m4-ct/config.h +++ /dev/null @@ -1,205 +0,0 @@ -/* - * Manual configuration file for the Falcon implementation. Here can - * be set some compilation-time options. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#ifndef FALCON_CONFIG_H__ -#define FALCON_CONFIG_H__ - -/* - * Each option is a macro which should be defined to either 1 or 0. 
- * If any of the options below is left undefined, then a default value - * will be used by the code, possibly using compile-time autodetection - * from compiler-defined macros. - * - * Explicitly setting a parameter can be done by uncommenting/modifying - * its definition below, in this file, or equivalently by setting it as - * a compiler flag. - */ - -/* - * Use the native 'double' C type for floating-point computations. Exact - * reproducibility of all tests requires that type to faithfully follow - * IEEE-754 "round-to-nearest" rules. - * - * Native double support will use the CPU hardware and/or - * compiler-provided functions; the latter is typically NOT - * constant-time, while the former MAY be constant-time, or not. On - * recent x86 CPU in 64-bit mode, SSE2 opcodes are used and they provide - * constant-time operations for all the operations used in Falcon, - * except for some special cases of divisions and square roots, but it - * can be shown that theses cases imply only negligible leak of - * information that cannot be leveraged into a full attack. - * - * If neither FALCON_FPNATIVE nor FALCON_FPEMU is defined, then use of - * the native 'double' C type is the default behaviour unless - * FALCON_ASM_CORTEXM4 is defined to 1, in which case the emulated code - * will be used. - * -#define FALCON_FPNATIVE 1 - */ - -/* - * Use emulated floating-point implementation. - * - * Emulation uses only integer operations with uint32_t and uint64_t - * types. This is constant-time, provided that the underlying platform - * offers constant-time opcodes for the following operations: - * - * - Multiplication of two 32-bit unsigned integers into a 64-bit result. - * - Left-shift or right-shift of a 32-bit unsigned integer by a - * potentially secret shift count in the 0..31 range. - * - * Notably, the ARM Cortex M3 does not fulfill the first condition, - * while the Pentium IV does not fulfill the second. 
- * - * If neither FALCON_FPNATIVE nor FALCON_FPEMU is defined, then use of - * the native 'double' C type is the default behaviour unless - * FALCON_ASM_CORTEXM4 is defined to 1, in which case the emulated code - * will be used. - * -#define FALCON_FPEMU 1 - */ - -/* - * Enable use of assembly for ARM Cortex-M4 CPU. By default, such - * support will be used based on some autodection on the compiler - * version and target architecture. Define this variable to 1 to force - * use of the assembly code, or 0 to disable it regardless of the - * autodetection. - * - * When FALCON_ASM_CORTEXM4 is enabled (whether defined explicitly or - * autodetected), emulated floating-point code will be used, unless - * FALCON_FPNATIVE or FALCON_FPEMU is explicitly set to override the - * choice. Emulated code with ARM assembly is constant-time and provides - * better performance than emulated code with plain C. - * - * The assembly code for the M4 can also work on a Cortex-M3. If the - * compiler is instructed to target the M3 (e.g. '-mcpu=cortex-m3' with - * GCC) then FALCON_ASM_CORTEXM4 won't be autodetected, but it can be - * enabled explicitly. Take care, though, that the M3 multiplication - * opcode (multiplication of two 32-bit unsigned integers with a 64-bit - * result) is NOT constant-time. - * -#define FALCON_ASM_CORTEXM4 1 - */ - -#define FALCON_ASM_CORTEXM4 1 - -/* - * Enable use of AVX2 intrinsics. If enabled, then the code will compile - * only when targeting x86 with a compiler that supports AVX2 intrinsics - * (tested with GCC 7.4.0, Clang 6.0.0, and MSVC 2015, both in 32-bit - * and 64-bit modes), and run only on systems that offer the AVX2 - * opcodes. Some operations leverage AVX2 for better performance. - * -#define FALCON_AVX2 1 - */ - -/* - * Enable use of FMA intrinsics. This setting has any effect only if - * FALCON_AVX2 is also enabled. The FMA intrinsics are normally available - * on any x86 CPU that also has AVX2. 
Note that setting this option will - * slightly modify the values of expanded private keys, but will normally - * not change the values of non-expanded private keys, public keys or - * signatures, for a given keygen/sign seed (non-expanded private keys - * and signatures might theoretically change, but only with low probability, - * less than 2^(-40); produced signatures are still safe and interoperable). - * -#define FALCON_FMA 1 - */ - -/* - * Assert that the platform uses little-endian encoding. If enabled, - * then encoding and decoding of aligned multibyte values will be - * slightly faster (especially for hashing and random number - * generation). If not defined explicitly, then autodetection is - * applied. - * -#define FALCON_LE 1 - */ - -/* - * Assert that the platform tolerates accesses to unaligned multibyte - * values. If enabled, then some operations are slightly faster. Note - * that ARM Cortex M4 do _not_ fully tolerate unaligned accesses; for - * such systems, this option should not be enabled. If not defined - * explicitly, then autodetection is applied. - * -#define FALCON_UNALIGNED 1 - */ - -/* - * Use a PRNG based on ChaCha20 and seeded with SHAKE256, instead of - * SHAKE256 directly, for key pair generation purposes. This speeds up - * key pair generation, especially on platforms where SHAKE256 is - * comparatively slow: on the ARM Cortex M4, average key generation time - * is reduced by 19% with this setting; on a recent x86 Skylake, the - * reduction is smaller (less than 8%). - * - * However, this setting changes the private/public key pair obtained - * from a given seed, thus preventing reproducibility of the - * known-answer tests vectors. For compatibility with existing KAT - * vectors (e.g. in PQClean, pqm4 and NIST implementations), this - * setting is not enabled by default. - * -#define FALCON_KG_CHACHA20 1 - */ - -/* - * Use an explicit OS-provided source of randomness for seeding (for the - * Zf(get_seed)() function implementation). 
Three possible sources are - * defined: - * - * - getentropy() system call - * - /dev/urandom special file - * - CryptGenRandom() function call - * - * More than one source may be enabled, in which case they will be tried - * in the order above, until a success is reached. - * - * By default, sources are enabled at compile-time based on these - * conditions: - * - * - getentropy(): target is one of: Linux with Glibc-2.25+, FreeBSD 12+, - * or OpenBSD. - * - /dev/urandom: target is a Unix-like system (including Linux, - * FreeBSD, NetBSD, OpenBSD, DragonFly, macOS, Android, Solaris, AIX). - * - CryptGenRandom(): target is Windows (Win32 or Win64). - * - * On most small embedded systems, none will be enabled and Zf(get_seed)() - * will always return 0. Applications will need to provide their own seeds. - * -#define FALCON_RAND_GETENTROPY 1 -#define FALCON_RAND_URANDOM 1 -#define FALCON_RAND_WIN32 1 - */ - -#endif diff --git a/crypto_sign/falcon-512/m4-ct/fft.c b/crypto_sign/falcon-512/m4-ct/fft.c deleted file mode 100644 index b1904b24..00000000 --- a/crypto_sign/falcon-512/m4-ct/fft.c +++ /dev/null @@ -1,1412 +0,0 @@ -/* - * FFT code. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include "inner.h" - -/* - * Rules for complex number macros: - * -------------------------------- - * - * Operand order is: destination, source1, source2... - * - * Each operand is a real and an imaginary part. - * - * All overlaps are allowed. - */ - -/* - * Addition of two complex numbers (d = a + b). - */ -#define FPC_ADD(d_re, d_im, a_re, a_im, b_re, b_im) do { \ - fpr fpct_re, fpct_im; \ - fpct_re = fpr_add(a_re, b_re); \ - fpct_im = fpr_add(a_im, b_im); \ - (d_re) = fpct_re; \ - (d_im) = fpct_im; \ - } while (0) - -/* - * Subtraction of two complex numbers (d = a - b). - */ -#define FPC_SUB(d_re, d_im, a_re, a_im, b_re, b_im) do { \ - fpr fpct_re, fpct_im; \ - fpct_re = fpr_sub(a_re, b_re); \ - fpct_im = fpr_sub(a_im, b_im); \ - (d_re) = fpct_re; \ - (d_im) = fpct_im; \ - } while (0) - -/* - * Multplication of two complex numbers (d = a * b). 
- */ -#define FPC_MUL(d_re, d_im, a_re, a_im, b_re, b_im) do { \ - fpr fpct_a_re, fpct_a_im; \ - fpr fpct_b_re, fpct_b_im; \ - fpr fpct_d_re, fpct_d_im; \ - fpct_a_re = (a_re); \ - fpct_a_im = (a_im); \ - fpct_b_re = (b_re); \ - fpct_b_im = (b_im); \ - fpct_d_re = fpr_sub( \ - fpr_mul(fpct_a_re, fpct_b_re), \ - fpr_mul(fpct_a_im, fpct_b_im)); \ - fpct_d_im = fpr_add( \ - fpr_mul(fpct_a_re, fpct_b_im), \ - fpr_mul(fpct_a_im, fpct_b_re)); \ - (d_re) = fpct_d_re; \ - (d_im) = fpct_d_im; \ - } while (0) - -/* - * Squaring of a complex number (d = a * a). - */ -#define FPC_SQR(d_re, d_im, a_re, a_im) do { \ - fpr fpct_a_re, fpct_a_im; \ - fpr fpct_d_re, fpct_d_im; \ - fpct_a_re = (a_re); \ - fpct_a_im = (a_im); \ - fpct_d_re = fpr_sub(fpr_sqr(fpct_a_re), fpr_sqr(fpct_a_im)); \ - fpct_d_im = fpr_double(fpr_mul(fpct_a_re, fpct_a_im)); \ - (d_re) = fpct_d_re; \ - (d_im) = fpct_d_im; \ - } while (0) - -/* - * Inversion of a complex number (d = 1 / a). - */ -#define FPC_INV(d_re, d_im, a_re, a_im) do { \ - fpr fpct_a_re, fpct_a_im; \ - fpr fpct_d_re, fpct_d_im; \ - fpr fpct_m; \ - fpct_a_re = (a_re); \ - fpct_a_im = (a_im); \ - fpct_m = fpr_add(fpr_sqr(fpct_a_re), fpr_sqr(fpct_a_im)); \ - fpct_m = fpr_inv(fpct_m); \ - fpct_d_re = fpr_mul(fpct_a_re, fpct_m); \ - fpct_d_im = fpr_mul(fpr_neg(fpct_a_im), fpct_m); \ - (d_re) = fpct_d_re; \ - (d_im) = fpct_d_im; \ - } while (0) - -/* - * Division of complex numbers (d = a / b). 
- */ -#define FPC_DIV(d_re, d_im, a_re, a_im, b_re, b_im) do { \ - fpr fpct_a_re, fpct_a_im; \ - fpr fpct_b_re, fpct_b_im; \ - fpr fpct_d_re, fpct_d_im; \ - fpr fpct_m; \ - fpct_a_re = (a_re); \ - fpct_a_im = (a_im); \ - fpct_b_re = (b_re); \ - fpct_b_im = (b_im); \ - fpct_m = fpr_add(fpr_sqr(fpct_b_re), fpr_sqr(fpct_b_im)); \ - fpct_m = fpr_inv(fpct_m); \ - fpct_b_re = fpr_mul(fpct_b_re, fpct_m); \ - fpct_b_im = fpr_mul(fpr_neg(fpct_b_im), fpct_m); \ - fpct_d_re = fpr_sub( \ - fpr_mul(fpct_a_re, fpct_b_re), \ - fpr_mul(fpct_a_im, fpct_b_im)); \ - fpct_d_im = fpr_add( \ - fpr_mul(fpct_a_re, fpct_b_im), \ - fpr_mul(fpct_a_im, fpct_b_re)); \ - (d_re) = fpct_d_re; \ - (d_im) = fpct_d_im; \ - } while (0) - -/* - * Let w = exp(i*pi/N); w is a primitive 2N-th root of 1. We define the - * values w_j = w^(2j+1) for all j from 0 to N-1: these are the roots - * of X^N+1 in the field of complex numbers. A crucial property is that - * w_{N-1-j} = conj(w_j) = 1/w_j for all j. - * - * FFT representation of a polynomial f (taken modulo X^N+1) is the - * set of values f(w_j). Since f is real, conj(f(w_j)) = f(conj(w_j)), - * thus f(w_{N-1-j}) = conj(f(w_j)). We thus store only half the values, - * for j = 0 to N/2-1; the other half can be recomputed easily when (if) - * needed. A consequence is that FFT representation has the same size - * as normal representation: N/2 complex numbers use N real numbers (each - * complex number is the combination of a real and an imaginary part). - * - * We use a specific ordering which makes computations easier. Let rev() - * be the bit-reversal function over log(N) bits. For j in 0..N/2-1, we - * store the real and imaginary parts of f(w_j) in slots: - * - * Re(f(w_j)) -> slot rev(j)/2 - * Im(f(w_j)) -> slot rev(j)/2+N/2 - * - * (Note that rev(j) is even for j < N/2.) 
- */ - -/* see inner.h */ -TARGET_AVX2 -void -Zf(FFT)(fpr *f, unsigned logn) -{ - /* - * FFT algorithm in bit-reversal order uses the following - * iterative algorithm: - * - * t = N - * for m = 1; m < N; m *= 2: - * ht = t/2 - * for i1 = 0; i1 < m; i1 ++: - * j1 = i1 * t - * s = GM[m + i1] - * for j = j1; j < (j1 + ht); j ++: - * x = f[j] - * y = s * f[j + ht] - * f[j] = x + y - * f[j + ht] = x - y - * t = ht - * - * GM[k] contains w^rev(k) for primitive root w = exp(i*pi/N). - * - * In the description above, f[] is supposed to contain complex - * numbers. In our in-memory representation, the real and - * imaginary parts of f[k] are in array slots k and k+N/2. - * - * We only keep the first half of the complex numbers. We can - * see that after the first iteration, the first and second halves - * of the array of complex numbers have separate lives, so we - * simply ignore the second part. - */ - - unsigned u; - size_t t, n, hn, m; - - /* - * First iteration: compute f[j] + i * f[j+N/2] for all j < N/2 - * (because GM[1] = w^rev(1) = w^(N/2) = i). - * In our chosen representation, this is a no-op: everything is - * already where it should be. - */ - - /* - * Subsequent iterations are truncated to use only the first - * half of values. 
- */ - n = (size_t)1 << logn; - hn = n >> 1; - t = hn; - for (u = 1, m = 2; u < logn; u ++, m <<= 1) { - size_t ht, hm, i1, j1; - - ht = t >> 1; - hm = m >> 1; - for (i1 = 0, j1 = 0; i1 < hm; i1 ++, j1 += t) { - size_t j, j2; - - j2 = j1 + ht; -#if FALCON_AVX2 // yyyAVX2+1 - if (ht >= 4) { - __m256d s_re, s_im; - - s_re = _mm256_set1_pd( - fpr_gm_tab[((m + i1) << 1) + 0].v); - s_im = _mm256_set1_pd( - fpr_gm_tab[((m + i1) << 1) + 1].v); - for (j = j1; j < j2; j += 4) { - __m256d x_re, x_im, y_re, y_im; - __m256d z_re, z_im; - - x_re = _mm256_loadu_pd(&f[j].v); - x_im = _mm256_loadu_pd(&f[j + hn].v); - z_re = _mm256_loadu_pd(&f[j+ht].v); - z_im = _mm256_loadu_pd(&f[j+ht + hn].v); - y_re = FMSUB(z_re, s_re, - _mm256_mul_pd(z_im, s_im)); - y_im = FMADD(z_re, s_im, - _mm256_mul_pd(z_im, s_re)); - _mm256_storeu_pd(&f[j].v, - _mm256_add_pd(x_re, y_re)); - _mm256_storeu_pd(&f[j + hn].v, - _mm256_add_pd(x_im, y_im)); - _mm256_storeu_pd(&f[j + ht].v, - _mm256_sub_pd(x_re, y_re)); - _mm256_storeu_pd(&f[j + ht + hn].v, - _mm256_sub_pd(x_im, y_im)); - } - } else { - fpr s_re, s_im; - - s_re = fpr_gm_tab[((m + i1) << 1) + 0]; - s_im = fpr_gm_tab[((m + i1) << 1) + 1]; - for (j = j1; j < j2; j ++) { - fpr x_re, x_im, y_re, y_im; - - x_re = f[j]; - x_im = f[j + hn]; - y_re = f[j + ht]; - y_im = f[j + ht + hn]; - FPC_MUL(y_re, y_im, - y_re, y_im, s_re, s_im); - FPC_ADD(f[j], f[j + hn], - x_re, x_im, y_re, y_im); - FPC_SUB(f[j + ht], f[j + ht + hn], - x_re, x_im, y_re, y_im); - } - } -#else // yyyAVX2+0 - fpr s_re, s_im; - - s_re = fpr_gm_tab[((m + i1) << 1) + 0]; - s_im = fpr_gm_tab[((m + i1) << 1) + 1]; - for (j = j1; j < j2; j ++) { - fpr x_re, x_im, y_re, y_im; - - x_re = f[j]; - x_im = f[j + hn]; - y_re = f[j + ht]; - y_im = f[j + ht + hn]; - FPC_MUL(y_re, y_im, y_re, y_im, s_re, s_im); - FPC_ADD(f[j], f[j + hn], - x_re, x_im, y_re, y_im); - FPC_SUB(f[j + ht], f[j + ht + hn], - x_re, x_im, y_re, y_im); - } -#endif // yyyAVX2- - } - t = ht; - } -} - -/* see inner.h */ 
-TARGET_AVX2 -void -Zf(iFFT)(fpr *f, unsigned logn) -{ - /* - * Inverse FFT algorithm in bit-reversal order uses the following - * iterative algorithm: - * - * t = 1 - * for m = N; m > 1; m /= 2: - * hm = m/2 - * dt = t*2 - * for i1 = 0; i1 < hm; i1 ++: - * j1 = i1 * dt - * s = iGM[hm + i1] - * for j = j1; j < (j1 + t); j ++: - * x = f[j] - * y = f[j + t] - * f[j] = x + y - * f[j + t] = s * (x - y) - * t = dt - * for i1 = 0; i1 < N; i1 ++: - * f[i1] = f[i1] / N - * - * iGM[k] contains (1/w)^rev(k) for primitive root w = exp(i*pi/N) - * (actually, iGM[k] = 1/GM[k] = conj(GM[k])). - * - * In the main loop (not counting the final division loop), in - * all iterations except the last, the first and second half of f[] - * (as an array of complex numbers) are separate. In our chosen - * representation, we do not keep the second half. - * - * The last iteration recombines the recomputed half with the - * implicit half, and should yield only real numbers since the - * target polynomial is real; moreover, s = i at that step. - * Thus, when considering x and y: - * y = conj(x) since the final f[j] must be real - * Therefore, f[j] is filled with 2*Re(x), and f[j + t] is - * filled with 2*Im(x). - * But we already have Re(x) and Im(x) in array slots j and j+t - * in our chosen representation. That last iteration is thus a - * simple doubling of the values in all the array. - * - * We make the last iteration a no-op by tweaking the final - * division into a division by N/2, not N. 
- */ - size_t u, n, hn, t, m; - - n = (size_t)1 << logn; - t = 1; - m = n; - hn = n >> 1; - for (u = logn; u > 1; u --) { - size_t hm, dt, i1, j1; - - hm = m >> 1; - dt = t << 1; - for (i1 = 0, j1 = 0; j1 < hn; i1 ++, j1 += dt) { - size_t j, j2; - - j2 = j1 + t; -#if FALCON_AVX2 // yyyAVX2+1 - if (t >= 4) { - __m256d s_re, s_im; - - s_re = _mm256_set1_pd( - fpr_gm_tab[((hm + i1) << 1) + 0].v); - s_im = _mm256_set1_pd( - fpr_gm_tab[((hm + i1) << 1) + 1].v); - for (j = j1; j < j2; j += 4) { - __m256d x_re, x_im, y_re, y_im; - __m256d z_re, z_im; - - x_re = _mm256_loadu_pd(&f[j].v); - x_im = _mm256_loadu_pd(&f[j + hn].v); - y_re = _mm256_loadu_pd(&f[j+t].v); - y_im = _mm256_loadu_pd(&f[j+t + hn].v); - _mm256_storeu_pd(&f[j].v, - _mm256_add_pd(x_re, y_re)); - _mm256_storeu_pd(&f[j + hn].v, - _mm256_add_pd(x_im, y_im)); - x_re = _mm256_sub_pd(y_re, x_re); - x_im = _mm256_sub_pd(x_im, y_im); - z_re = FMSUB(x_im, s_im, - _mm256_mul_pd(x_re, s_re)); - z_im = FMADD(x_re, s_im, - _mm256_mul_pd(x_im, s_re)); - _mm256_storeu_pd(&f[j+t].v, z_re); - _mm256_storeu_pd(&f[j+t + hn].v, z_im); - } - } else { - fpr s_re, s_im; - - s_re = fpr_gm_tab[((hm + i1) << 1)+0]; - s_im = fpr_neg(fpr_gm_tab[((hm + i1) << 1)+1]); - for (j = j1; j < j2; j ++) { - fpr x_re, x_im, y_re, y_im; - - x_re = f[j]; - x_im = f[j + hn]; - y_re = f[j + t]; - y_im = f[j + t + hn]; - FPC_ADD(f[j], f[j + hn], - x_re, x_im, y_re, y_im); - FPC_SUB(x_re, x_im, - x_re, x_im, y_re, y_im); - FPC_MUL(f[j + t], f[j + t + hn], - x_re, x_im, s_re, s_im); - } - } -#else // yyyAVX2+0 - fpr s_re, s_im; - - s_re = fpr_gm_tab[((hm + i1) << 1) + 0]; - s_im = fpr_neg(fpr_gm_tab[((hm + i1) << 1) + 1]); - for (j = j1; j < j2; j ++) { - fpr x_re, x_im, y_re, y_im; - - x_re = f[j]; - x_im = f[j + hn]; - y_re = f[j + t]; - y_im = f[j + t + hn]; - FPC_ADD(f[j], f[j + hn], - x_re, x_im, y_re, y_im); - FPC_SUB(x_re, x_im, x_re, x_im, y_re, y_im); - FPC_MUL(f[j + t], f[j + t + hn], - x_re, x_im, s_re, s_im); - } -#endif // yyyAVX2- - } 
- t = dt; - m = hm; - } - - /* - * Last iteration is a no-op, provided that we divide by N/2 - * instead of N. We need to make a special case for logn = 0. - */ - if (logn > 0) { - fpr ni; - - ni = fpr_p2_tab[logn]; - for (u = 0; u < n; u ++) { - f[u] = fpr_mul(f[u], ni); - } - } -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_add)( - fpr *restrict a, const fpr *restrict b, unsigned logn) -{ - size_t n, u; - - n = (size_t)1 << logn; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 4) { - for (u = 0; u < n; u += 4) { - _mm256_storeu_pd(&a[u].v, - _mm256_add_pd( - _mm256_loadu_pd(&a[u].v), - _mm256_loadu_pd(&b[u].v))); - } - } else { - for (u = 0; u < n; u ++) { - a[u] = fpr_add(a[u], b[u]); - } - } -#else // yyyAVX2+0 - for (u = 0; u < n; u ++) { - a[u] = fpr_add(a[u], b[u]); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_sub)( - fpr *restrict a, const fpr *restrict b, unsigned logn) -{ - size_t n, u; - - n = (size_t)1 << logn; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 4) { - for (u = 0; u < n; u += 4) { - _mm256_storeu_pd(&a[u].v, - _mm256_sub_pd( - _mm256_loadu_pd(&a[u].v), - _mm256_loadu_pd(&b[u].v))); - } - } else { - for (u = 0; u < n; u ++) { - a[u] = fpr_sub(a[u], b[u]); - } - } -#else // yyyAVX2+0 - for (u = 0; u < n; u ++) { - a[u] = fpr_sub(a[u], b[u]); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_neg)(fpr *a, unsigned logn) -{ - size_t n, u; - - n = (size_t)1 << logn; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 4) { - __m256d s; - - s = _mm256_set1_pd(-0.0); - for (u = 0; u < n; u += 4) { - _mm256_storeu_pd(&a[u].v, - _mm256_xor_pd(_mm256_loadu_pd(&a[u].v), s)); - } - } else { - for (u = 0; u < n; u ++) { - a[u] = fpr_neg(a[u]); - } - } -#else // yyyAVX2+0 - for (u = 0; u < n; u ++) { - a[u] = fpr_neg(a[u]); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_adj_fft)(fpr *a, unsigned logn) -{ - size_t n, u; - - n = (size_t)1 << logn; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { 
- __m256d s; - - s = _mm256_set1_pd(-0.0); - for (u = (n >> 1); u < n; u += 4) { - _mm256_storeu_pd(&a[u].v, - _mm256_xor_pd(_mm256_loadu_pd(&a[u].v), s)); - } - } else { - for (u = (n >> 1); u < n; u ++) { - a[u] = fpr_neg(a[u]); - } - } -#else // yyyAVX2+0 - for (u = (n >> 1); u < n; u ++) { - a[u] = fpr_neg(a[u]); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_mul_fft)( - fpr *restrict a, const fpr *restrict b, unsigned logn) -{ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - for (u = 0; u < hn; u += 4) { - __m256d a_re, a_im, b_re, b_im, c_re, c_im; - - a_re = _mm256_loadu_pd(&a[u].v); - a_im = _mm256_loadu_pd(&a[u + hn].v); - b_re = _mm256_loadu_pd(&b[u].v); - b_im = _mm256_loadu_pd(&b[u + hn].v); - c_re = FMSUB( - a_re, b_re, _mm256_mul_pd(a_im, b_im)); - c_im = FMADD( - a_re, b_im, _mm256_mul_pd(a_im, b_re)); - _mm256_storeu_pd(&a[u].v, c_re); - _mm256_storeu_pd(&a[u + hn].v, c_im); - } - } else { - for (u = 0; u < hn; u ++) { - fpr a_re, a_im, b_re, b_im; - - a_re = a[u]; - a_im = a[u + hn]; - b_re = b[u]; - b_im = b[u + hn]; - FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im); - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - fpr a_re, a_im, b_re, b_im; - - a_re = a[u]; - a_im = a[u + hn]; - b_re = b[u]; - b_im = b[u + hn]; - FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_muladj_fft)( - fpr *restrict a, const fpr *restrict b, unsigned logn) -{ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - for (u = 0; u < hn; u += 4) { - __m256d a_re, a_im, b_re, b_im, c_re, c_im; - - a_re = _mm256_loadu_pd(&a[u].v); - a_im = _mm256_loadu_pd(&a[u + hn].v); - b_re = _mm256_loadu_pd(&b[u].v); - b_im = _mm256_loadu_pd(&b[u + hn].v); - c_re = FMADD( - a_re, b_re, _mm256_mul_pd(a_im, b_im)); - c_im = FMSUB( - a_im, b_re, _mm256_mul_pd(a_re, 
b_im)); - _mm256_storeu_pd(&a[u].v, c_re); - _mm256_storeu_pd(&a[u + hn].v, c_im); - } - } else { - for (u = 0; u < hn; u ++) { - fpr a_re, a_im, b_re, b_im; - - a_re = a[u]; - a_im = a[u + hn]; - b_re = b[u]; - b_im = fpr_neg(b[u + hn]); - FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im); - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - fpr a_re, a_im, b_re, b_im; - - a_re = a[u]; - a_im = a[u + hn]; - b_re = b[u]; - b_im = fpr_neg(b[u + hn]); - FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_mulselfadj_fft)(fpr *a, unsigned logn) -{ - /* - * Since each coefficient is multiplied with its own conjugate, - * the result contains only real values. - */ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - __m256d zero; - - zero = _mm256_setzero_pd(); - for (u = 0; u < hn; u += 4) { - __m256d a_re, a_im; - - a_re = _mm256_loadu_pd(&a[u].v); - a_im = _mm256_loadu_pd(&a[u + hn].v); - _mm256_storeu_pd(&a[u].v, - FMADD(a_re, a_re, - _mm256_mul_pd(a_im, a_im))); - _mm256_storeu_pd(&a[u + hn].v, zero); - } - } else { - for (u = 0; u < hn; u ++) { - fpr a_re, a_im; - - a_re = a[u]; - a_im = a[u + hn]; - a[u] = fpr_add(fpr_sqr(a_re), fpr_sqr(a_im)); - a[u + hn] = fpr_zero; - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - fpr a_re, a_im; - - a_re = a[u]; - a_im = a[u + hn]; - a[u] = fpr_add(fpr_sqr(a_re), fpr_sqr(a_im)); - a[u + hn] = fpr_zero; - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_mulconst)(fpr *a, fpr x, unsigned logn) -{ - size_t n, u; - - n = (size_t)1 << logn; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 4) { - __m256d x4; - - x4 = _mm256_set1_pd(x.v); - for (u = 0; u < n; u += 4) { - _mm256_storeu_pd(&a[u].v, - _mm256_mul_pd(x4, _mm256_loadu_pd(&a[u].v))); - } - } else { - for (u = 0; u < n; u ++) { - a[u] = fpr_mul(a[u], x); - } - } -#else // yyyAVX2+0 - for (u = 0; u < n; u ++) { - 
a[u] = fpr_mul(a[u], x); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_div_fft)( - fpr *restrict a, const fpr *restrict b, unsigned logn) -{ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - __m256d one; - - one = _mm256_set1_pd(1.0); - for (u = 0; u < hn; u += 4) { - __m256d a_re, a_im, b_re, b_im, c_re, c_im, t; - - a_re = _mm256_loadu_pd(&a[u].v); - a_im = _mm256_loadu_pd(&a[u + hn].v); - b_re = _mm256_loadu_pd(&b[u].v); - b_im = _mm256_loadu_pd(&b[u + hn].v); - t = _mm256_div_pd(one, - FMADD(b_re, b_re, - _mm256_mul_pd(b_im, b_im))); - b_re = _mm256_mul_pd(b_re, t); - b_im = _mm256_mul_pd(b_im, t); - c_re = FMADD( - a_re, b_re, _mm256_mul_pd(a_im, b_im)); - c_im = FMSUB( - a_im, b_re, _mm256_mul_pd(a_re, b_im)); - _mm256_storeu_pd(&a[u].v, c_re); - _mm256_storeu_pd(&a[u + hn].v, c_im); - } - } else { - for (u = 0; u < hn; u ++) { - fpr a_re, a_im, b_re, b_im; - - a_re = a[u]; - a_im = a[u + hn]; - b_re = b[u]; - b_im = b[u + hn]; - FPC_DIV(a[u], a[u + hn], a_re, a_im, b_re, b_im); - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - fpr a_re, a_im, b_re, b_im; - - a_re = a[u]; - a_im = a[u + hn]; - b_re = b[u]; - b_im = b[u + hn]; - FPC_DIV(a[u], a[u + hn], a_re, a_im, b_re, b_im); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_invnorm2_fft)(fpr *restrict d, - const fpr *restrict a, const fpr *restrict b, unsigned logn) -{ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - __m256d one; - - one = _mm256_set1_pd(1.0); - for (u = 0; u < hn; u += 4) { - __m256d a_re, a_im, b_re, b_im, dv; - - a_re = _mm256_loadu_pd(&a[u].v); - a_im = _mm256_loadu_pd(&a[u + hn].v); - b_re = _mm256_loadu_pd(&b[u].v); - b_im = _mm256_loadu_pd(&b[u + hn].v); - dv = _mm256_div_pd(one, - _mm256_add_pd( - FMADD(a_re, a_re, - _mm256_mul_pd(a_im, a_im)), - FMADD(b_re, b_re, - _mm256_mul_pd(b_im, b_im)))); - 
_mm256_storeu_pd(&d[u].v, dv); - } - } else { - for (u = 0; u < hn; u ++) { - fpr a_re, a_im; - fpr b_re, b_im; - - a_re = a[u]; - a_im = a[u + hn]; - b_re = b[u]; - b_im = b[u + hn]; - d[u] = fpr_inv(fpr_add( - fpr_add(fpr_sqr(a_re), fpr_sqr(a_im)), - fpr_add(fpr_sqr(b_re), fpr_sqr(b_im)))); - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - fpr a_re, a_im; - fpr b_re, b_im; - - a_re = a[u]; - a_im = a[u + hn]; - b_re = b[u]; - b_im = b[u + hn]; - d[u] = fpr_inv(fpr_add( - fpr_add(fpr_sqr(a_re), fpr_sqr(a_im)), - fpr_add(fpr_sqr(b_re), fpr_sqr(b_im)))); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_add_muladj_fft)(fpr *restrict d, - const fpr *restrict F, const fpr *restrict G, - const fpr *restrict f, const fpr *restrict g, unsigned logn) -{ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - for (u = 0; u < hn; u += 4) { - __m256d F_re, F_im, G_re, G_im; - __m256d f_re, f_im, g_re, g_im; - __m256d a_re, a_im, b_re, b_im; - - F_re = _mm256_loadu_pd(&F[u].v); - F_im = _mm256_loadu_pd(&F[u + hn].v); - G_re = _mm256_loadu_pd(&G[u].v); - G_im = _mm256_loadu_pd(&G[u + hn].v); - f_re = _mm256_loadu_pd(&f[u].v); - f_im = _mm256_loadu_pd(&f[u + hn].v); - g_re = _mm256_loadu_pd(&g[u].v); - g_im = _mm256_loadu_pd(&g[u + hn].v); - - a_re = FMADD(F_re, f_re, - _mm256_mul_pd(F_im, f_im)); - a_im = FMSUB(F_im, f_re, - _mm256_mul_pd(F_re, f_im)); - b_re = FMADD(G_re, g_re, - _mm256_mul_pd(G_im, g_im)); - b_im = FMSUB(G_im, g_re, - _mm256_mul_pd(G_re, g_im)); - _mm256_storeu_pd(&d[u].v, - _mm256_add_pd(a_re, b_re)); - _mm256_storeu_pd(&d[u + hn].v, - _mm256_add_pd(a_im, b_im)); - } - } else { - for (u = 0; u < hn; u ++) { - fpr F_re, F_im, G_re, G_im; - fpr f_re, f_im, g_re, g_im; - fpr a_re, a_im, b_re, b_im; - - F_re = F[u]; - F_im = F[u + hn]; - G_re = G[u]; - G_im = G[u + hn]; - f_re = f[u]; - f_im = f[u + hn]; - g_re = g[u]; - g_im = g[u + hn]; - - FPC_MUL(a_re, a_im, F_re, F_im, 
f_re, fpr_neg(f_im)); - FPC_MUL(b_re, b_im, G_re, G_im, g_re, fpr_neg(g_im)); - d[u] = fpr_add(a_re, b_re); - d[u + hn] = fpr_add(a_im, b_im); - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - fpr F_re, F_im, G_re, G_im; - fpr f_re, f_im, g_re, g_im; - fpr a_re, a_im, b_re, b_im; - - F_re = F[u]; - F_im = F[u + hn]; - G_re = G[u]; - G_im = G[u + hn]; - f_re = f[u]; - f_im = f[u + hn]; - g_re = g[u]; - g_im = g[u + hn]; - - FPC_MUL(a_re, a_im, F_re, F_im, f_re, fpr_neg(f_im)); - FPC_MUL(b_re, b_im, G_re, G_im, g_re, fpr_neg(g_im)); - d[u] = fpr_add(a_re, b_re); - d[u + hn] = fpr_add(a_im, b_im); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_mul_autoadj_fft)( - fpr *restrict a, const fpr *restrict b, unsigned logn) -{ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - for (u = 0; u < hn; u += 4) { - __m256d a_re, a_im, bv; - - a_re = _mm256_loadu_pd(&a[u].v); - a_im = _mm256_loadu_pd(&a[u + hn].v); - bv = _mm256_loadu_pd(&b[u].v); - _mm256_storeu_pd(&a[u].v, - _mm256_mul_pd(a_re, bv)); - _mm256_storeu_pd(&a[u + hn].v, - _mm256_mul_pd(a_im, bv)); - } - } else { - for (u = 0; u < hn; u ++) { - a[u] = fpr_mul(a[u], b[u]); - a[u + hn] = fpr_mul(a[u + hn], b[u]); - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - a[u] = fpr_mul(a[u], b[u]); - a[u + hn] = fpr_mul(a[u + hn], b[u]); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_div_autoadj_fft)( - fpr *restrict a, const fpr *restrict b, unsigned logn) -{ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - __m256d one; - - one = _mm256_set1_pd(1.0); - for (u = 0; u < hn; u += 4) { - __m256d ib, a_re, a_im; - - ib = _mm256_div_pd(one, _mm256_loadu_pd(&b[u].v)); - a_re = _mm256_loadu_pd(&a[u].v); - a_im = _mm256_loadu_pd(&a[u + hn].v); - _mm256_storeu_pd(&a[u].v, _mm256_mul_pd(a_re, ib)); - _mm256_storeu_pd(&a[u + hn].v, 
_mm256_mul_pd(a_im, ib)); - } - } else { - for (u = 0; u < hn; u ++) { - fpr ib; - - ib = fpr_inv(b[u]); - a[u] = fpr_mul(a[u], ib); - a[u + hn] = fpr_mul(a[u + hn], ib); - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - fpr ib; - - ib = fpr_inv(b[u]); - a[u] = fpr_mul(a[u], ib); - a[u + hn] = fpr_mul(a[u + hn], ib); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_LDL_fft)( - const fpr *restrict g00, - fpr *restrict g01, fpr *restrict g11, unsigned logn) -{ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - __m256d one; - - one = _mm256_set1_pd(1.0); - for (u = 0; u < hn; u += 4) { - __m256d g00_re, g00_im, g01_re, g01_im, g11_re, g11_im; - __m256d t, mu_re, mu_im, xi_re, xi_im; - - g00_re = _mm256_loadu_pd(&g00[u].v); - g00_im = _mm256_loadu_pd(&g00[u + hn].v); - g01_re = _mm256_loadu_pd(&g01[u].v); - g01_im = _mm256_loadu_pd(&g01[u + hn].v); - g11_re = _mm256_loadu_pd(&g11[u].v); - g11_im = _mm256_loadu_pd(&g11[u + hn].v); - - t = _mm256_div_pd(one, - FMADD(g00_re, g00_re, - _mm256_mul_pd(g00_im, g00_im))); - g00_re = _mm256_mul_pd(g00_re, t); - g00_im = _mm256_mul_pd(g00_im, t); - mu_re = FMADD(g01_re, g00_re, - _mm256_mul_pd(g01_im, g00_im)); - mu_im = FMSUB(g01_re, g00_im, - _mm256_mul_pd(g01_im, g00_re)); - xi_re = FMSUB(mu_re, g01_re, - _mm256_mul_pd(mu_im, g01_im)); - xi_im = FMADD(mu_im, g01_re, - _mm256_mul_pd(mu_re, g01_im)); - _mm256_storeu_pd(&g11[u].v, - _mm256_sub_pd(g11_re, xi_re)); - _mm256_storeu_pd(&g11[u + hn].v, - _mm256_add_pd(g11_im, xi_im)); - _mm256_storeu_pd(&g01[u].v, mu_re); - _mm256_storeu_pd(&g01[u + hn].v, mu_im); - } - } else { - for (u = 0; u < hn; u ++) { - fpr g00_re, g00_im, g01_re, g01_im, g11_re, g11_im; - fpr mu_re, mu_im; - - g00_re = g00[u]; - g00_im = g00[u + hn]; - g01_re = g01[u]; - g01_im = g01[u + hn]; - g11_re = g11[u]; - g11_im = g11[u + hn]; - FPC_DIV(mu_re, mu_im, g01_re, g01_im, g00_re, g00_im); - FPC_MUL(g01_re, g01_im, 
- mu_re, mu_im, g01_re, fpr_neg(g01_im)); - FPC_SUB(g11[u], g11[u + hn], - g11_re, g11_im, g01_re, g01_im); - g01[u] = mu_re; - g01[u + hn] = fpr_neg(mu_im); - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - fpr g00_re, g00_im, g01_re, g01_im, g11_re, g11_im; - fpr mu_re, mu_im; - - g00_re = g00[u]; - g00_im = g00[u + hn]; - g01_re = g01[u]; - g01_im = g01[u + hn]; - g11_re = g11[u]; - g11_im = g11[u + hn]; - FPC_DIV(mu_re, mu_im, g01_re, g01_im, g00_re, g00_im); - FPC_MUL(g01_re, g01_im, mu_re, mu_im, g01_re, fpr_neg(g01_im)); - FPC_SUB(g11[u], g11[u + hn], g11_re, g11_im, g01_re, g01_im); - g01[u] = mu_re; - g01[u + hn] = fpr_neg(mu_im); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_LDLmv_fft)( - fpr *restrict d11, fpr *restrict l10, - const fpr *restrict g00, const fpr *restrict g01, - const fpr *restrict g11, unsigned logn) -{ - size_t n, hn, u; - - n = (size_t)1 << logn; - hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - __m256d one; - - one = _mm256_set1_pd(1.0); - for (u = 0; u < hn; u += 4) { - __m256d g00_re, g00_im, g01_re, g01_im, g11_re, g11_im; - __m256d t, mu_re, mu_im, xi_re, xi_im; - - g00_re = _mm256_loadu_pd(&g00[u].v); - g00_im = _mm256_loadu_pd(&g00[u + hn].v); - g01_re = _mm256_loadu_pd(&g01[u].v); - g01_im = _mm256_loadu_pd(&g01[u + hn].v); - g11_re = _mm256_loadu_pd(&g11[u].v); - g11_im = _mm256_loadu_pd(&g11[u + hn].v); - - t = _mm256_div_pd(one, - FMADD(g00_re, g00_re, - _mm256_mul_pd(g00_im, g00_im))); - g00_re = _mm256_mul_pd(g00_re, t); - g00_im = _mm256_mul_pd(g00_im, t); - mu_re = FMADD(g01_re, g00_re, - _mm256_mul_pd(g01_im, g00_im)); - mu_im = FMSUB(g01_re, g00_im, - _mm256_mul_pd(g01_im, g00_re)); - xi_re = FMSUB(mu_re, g01_re, - _mm256_mul_pd(mu_im, g01_im)); - xi_im = FMADD(mu_im, g01_re, - _mm256_mul_pd(mu_re, g01_im)); - _mm256_storeu_pd(&d11[u].v, - _mm256_sub_pd(g11_re, xi_re)); - _mm256_storeu_pd(&d11[u + hn].v, - _mm256_add_pd(g11_im, xi_im)); - _mm256_storeu_pd(&l10[u].v, 
mu_re); - _mm256_storeu_pd(&l10[u + hn].v, mu_im); - } - } else { - for (u = 0; u < hn; u ++) { - fpr g00_re, g00_im, g01_re, g01_im, g11_re, g11_im; - fpr mu_re, mu_im; - - g00_re = g00[u]; - g00_im = g00[u + hn]; - g01_re = g01[u]; - g01_im = g01[u + hn]; - g11_re = g11[u]; - g11_im = g11[u + hn]; - FPC_DIV(mu_re, mu_im, g01_re, g01_im, g00_re, g00_im); - FPC_MUL(g01_re, g01_im, - mu_re, mu_im, g01_re, fpr_neg(g01_im)); - FPC_SUB(d11[u], d11[u + hn], - g11_re, g11_im, g01_re, g01_im); - l10[u] = mu_re; - l10[u + hn] = fpr_neg(mu_im); - } - } -#else // yyyAVX2+0 - for (u = 0; u < hn; u ++) { - fpr g00_re, g00_im, g01_re, g01_im, g11_re, g11_im; - fpr mu_re, mu_im; - - g00_re = g00[u]; - g00_im = g00[u + hn]; - g01_re = g01[u]; - g01_im = g01[u + hn]; - g11_re = g11[u]; - g11_im = g11[u + hn]; - FPC_DIV(mu_re, mu_im, g01_re, g01_im, g00_re, g00_im); - FPC_MUL(g01_re, g01_im, mu_re, mu_im, g01_re, fpr_neg(g01_im)); - FPC_SUB(d11[u], d11[u + hn], g11_re, g11_im, g01_re, g01_im); - l10[u] = mu_re; - l10[u + hn] = fpr_neg(mu_im); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_split_fft)( - fpr *restrict f0, fpr *restrict f1, - const fpr *restrict f, unsigned logn) -{ - /* - * The FFT representation we use is in bit-reversed order - * (element i contains f(w^(rev(i))), where rev() is the - * bit-reversal function over the ring degree. This changes - * indexes with regards to the Falcon specification. 
- */ - size_t n, hn, qn, u; - - n = (size_t)1 << logn; - hn = n >> 1; - qn = hn >> 1; - -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - __m256d half, sv; - - half = _mm256_set1_pd(0.5); - sv = _mm256_set_pd(-0.0, 0.0, -0.0, 0.0); - for (u = 0; u < qn; u += 2) { - __m256d ab_re, ab_im, ff0, ff1, ff2, ff3, gmt; - - ab_re = _mm256_loadu_pd(&f[(u << 1)].v); - ab_im = _mm256_loadu_pd(&f[(u << 1) + hn].v); - ff0 = _mm256_mul_pd(_mm256_hadd_pd(ab_re, ab_im), half); - ff0 = _mm256_permute4x64_pd(ff0, 0xD8); - _mm_storeu_pd(&f0[u].v, - _mm256_extractf128_pd(ff0, 0)); - _mm_storeu_pd(&f0[u + qn].v, - _mm256_extractf128_pd(ff0, 1)); - - ff1 = _mm256_mul_pd(_mm256_hsub_pd(ab_re, ab_im), half); - gmt = _mm256_loadu_pd(&fpr_gm_tab[(u + hn) << 1].v); - ff2 = _mm256_shuffle_pd(ff1, ff1, 0x5); - ff3 = _mm256_hadd_pd( - _mm256_mul_pd(ff1, gmt), - _mm256_xor_pd(_mm256_mul_pd(ff2, gmt), sv)); - ff3 = _mm256_permute4x64_pd(ff3, 0xD8); - _mm_storeu_pd(&f1[u].v, - _mm256_extractf128_pd(ff3, 0)); - _mm_storeu_pd(&f1[u + qn].v, - _mm256_extractf128_pd(ff3, 1)); - } - } else { - f0[0] = f[0]; - f1[0] = f[hn]; - - for (u = 0; u < qn; u ++) { - fpr a_re, a_im, b_re, b_im; - fpr t_re, t_im; - - a_re = f[(u << 1) + 0]; - a_im = f[(u << 1) + 0 + hn]; - b_re = f[(u << 1) + 1]; - b_im = f[(u << 1) + 1 + hn]; - - FPC_ADD(t_re, t_im, a_re, a_im, b_re, b_im); - f0[u] = fpr_half(t_re); - f0[u + qn] = fpr_half(t_im); - - FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im); - FPC_MUL(t_re, t_im, t_re, t_im, - fpr_gm_tab[((u + hn) << 1) + 0], - fpr_neg(fpr_gm_tab[((u + hn) << 1) + 1])); - f1[u] = fpr_half(t_re); - f1[u + qn] = fpr_half(t_im); - } - } -#else // yyyAVX2+0 - /* - * We process complex values by pairs. For logn = 1, there is only - * one complex value (the other one is the implicit conjugate), - * so we add the two lines below because the loop will be - * skipped. 
- */ - f0[0] = f[0]; - f1[0] = f[hn]; - - for (u = 0; u < qn; u ++) { - fpr a_re, a_im, b_re, b_im; - fpr t_re, t_im; - - a_re = f[(u << 1) + 0]; - a_im = f[(u << 1) + 0 + hn]; - b_re = f[(u << 1) + 1]; - b_im = f[(u << 1) + 1 + hn]; - - FPC_ADD(t_re, t_im, a_re, a_im, b_re, b_im); - f0[u] = fpr_half(t_re); - f0[u + qn] = fpr_half(t_im); - - FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im); - FPC_MUL(t_re, t_im, t_re, t_im, - fpr_gm_tab[((u + hn) << 1) + 0], - fpr_neg(fpr_gm_tab[((u + hn) << 1) + 1])); - f1[u] = fpr_half(t_re); - f1[u + qn] = fpr_half(t_im); - } -#endif // yyyAVX2- -} - -/* see inner.h */ -TARGET_AVX2 -void -Zf(poly_merge_fft)( - fpr *restrict f, - const fpr *restrict f0, const fpr *restrict f1, unsigned logn) -{ - size_t n, hn, qn, u; - - n = (size_t)1 << logn; - hn = n >> 1; - qn = hn >> 1; - -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 16) { - for (u = 0; u < qn; u += 4) { - __m256d a_re, a_im, b_re, b_im, c_re, c_im; - __m256d gm1, gm2, g_re, g_im; - __m256d t_re, t_im, u_re, u_im; - __m256d tu1_re, tu2_re, tu1_im, tu2_im; - - a_re = _mm256_loadu_pd(&f0[u].v); - a_im = _mm256_loadu_pd(&f0[u + qn].v); - c_re = _mm256_loadu_pd(&f1[u].v); - c_im = _mm256_loadu_pd(&f1[u + qn].v); - - gm1 = _mm256_loadu_pd(&fpr_gm_tab[(u + hn) << 1].v); - gm2 = _mm256_loadu_pd(&fpr_gm_tab[(u + 2 + hn) << 1].v); - g_re = _mm256_unpacklo_pd(gm1, gm2); - g_im = _mm256_unpackhi_pd(gm1, gm2); - g_re = _mm256_permute4x64_pd(g_re, 0xD8); - g_im = _mm256_permute4x64_pd(g_im, 0xD8); - - b_re = FMSUB( - c_re, g_re, _mm256_mul_pd(c_im, g_im)); - b_im = FMADD( - c_re, g_im, _mm256_mul_pd(c_im, g_re)); - - t_re = _mm256_add_pd(a_re, b_re); - t_im = _mm256_add_pd(a_im, b_im); - u_re = _mm256_sub_pd(a_re, b_re); - u_im = _mm256_sub_pd(a_im, b_im); - - tu1_re = _mm256_unpacklo_pd(t_re, u_re); - tu2_re = _mm256_unpackhi_pd(t_re, u_re); - tu1_im = _mm256_unpacklo_pd(t_im, u_im); - tu2_im = _mm256_unpackhi_pd(t_im, u_im); - _mm256_storeu_pd(&f[(u << 1)].v, - _mm256_permute2f128_pd(tu1_re, 
tu2_re, 0x20)); - _mm256_storeu_pd(&f[(u << 1) + 4].v, - _mm256_permute2f128_pd(tu1_re, tu2_re, 0x31)); - _mm256_storeu_pd(&f[(u << 1) + hn].v, - _mm256_permute2f128_pd(tu1_im, tu2_im, 0x20)); - _mm256_storeu_pd(&f[(u << 1) + 4 + hn].v, - _mm256_permute2f128_pd(tu1_im, tu2_im, 0x31)); - } - } else { - f[0] = f0[0]; - f[hn] = f1[0]; - - for (u = 0; u < qn; u ++) { - fpr a_re, a_im, b_re, b_im; - fpr t_re, t_im; - - a_re = f0[u]; - a_im = f0[u + qn]; - FPC_MUL(b_re, b_im, f1[u], f1[u + qn], - fpr_gm_tab[((u + hn) << 1) + 0], - fpr_gm_tab[((u + hn) << 1) + 1]); - FPC_ADD(t_re, t_im, a_re, a_im, b_re, b_im); - f[(u << 1) + 0] = t_re; - f[(u << 1) + 0 + hn] = t_im; - FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im); - f[(u << 1) + 1] = t_re; - f[(u << 1) + 1 + hn] = t_im; - } - } -#else // yyyAVX2+0 - /* - * An extra copy to handle the special case logn = 1. - */ - f[0] = f0[0]; - f[hn] = f1[0]; - - for (u = 0; u < qn; u ++) { - fpr a_re, a_im, b_re, b_im; - fpr t_re, t_im; - - a_re = f0[u]; - a_im = f0[u + qn]; - FPC_MUL(b_re, b_im, f1[u], f1[u + qn], - fpr_gm_tab[((u + hn) << 1) + 0], - fpr_gm_tab[((u + hn) << 1) + 1]); - FPC_ADD(t_re, t_im, a_re, a_im, b_re, b_im); - f[(u << 1) + 0] = t_re; - f[(u << 1) + 0 + hn] = t_im; - FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im); - f[(u << 1) + 1] = t_re; - f[(u << 1) + 1 + hn] = t_im; - } -#endif // yyyAVX2- -} diff --git a/crypto_sign/falcon-512/m4-ct/fpr.c b/crypto_sign/falcon-512/m4-ct/fpr.c deleted file mode 100644 index eb23a44b..00000000 --- a/crypto_sign/falcon-512/m4-ct/fpr.c +++ /dev/null @@ -1,3460 +0,0 @@ -/* - * Floating-point operations. - * - * This file implements the non-inline functions declared in - * fpr.h, as well as the constants for FFT / iFFT. 
- * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include "inner.h" - -#if FALCON_FPEMU // yyyFPEMU+1 - -/* - * Normalize a provided unsigned integer to the 2^63..2^64-1 range by - * left-shifting it if necessary. The exponent e is adjusted accordingly - * (i.e. if the value was left-shifted by n bits, then n is subtracted - * from e). If source m is 0, then it remains 0, but e is altered. - * Both m and e must be simple variables (no expressions allowed). 
- */ -#define FPR_NORM64(m, e) do { \ - uint32_t nt; \ - \ - (e) -= 63; \ - \ - nt = (uint32_t)((m) >> 32); \ - nt = (nt | -nt) >> 31; \ - (m) ^= ((m) ^ ((m) << 32)) & ((uint64_t)nt - 1); \ - (e) += (int)(nt << 5); \ - \ - nt = (uint32_t)((m) >> 48); \ - nt = (nt | -nt) >> 31; \ - (m) ^= ((m) ^ ((m) << 16)) & ((uint64_t)nt - 1); \ - (e) += (int)(nt << 4); \ - \ - nt = (uint32_t)((m) >> 56); \ - nt = (nt | -nt) >> 31; \ - (m) ^= ((m) ^ ((m) << 8)) & ((uint64_t)nt - 1); \ - (e) += (int)(nt << 3); \ - \ - nt = (uint32_t)((m) >> 60); \ - nt = (nt | -nt) >> 31; \ - (m) ^= ((m) ^ ((m) << 4)) & ((uint64_t)nt - 1); \ - (e) += (int)(nt << 2); \ - \ - nt = (uint32_t)((m) >> 62); \ - nt = (nt | -nt) >> 31; \ - (m) ^= ((m) ^ ((m) << 2)) & ((uint64_t)nt - 1); \ - (e) += (int)(nt << 1); \ - \ - nt = (uint32_t)((m) >> 63); \ - (m) ^= ((m) ^ ((m) << 1)) & ((uint64_t)nt - 1); \ - (e) += (int)(nt); \ - } while (0) - -#if FALCON_ASM_CORTEXM4 // yyyASM_CORTEXM4+1 - -__attribute__((naked)) -fpr -fpr_scaled(int64_t i __attribute__((unused)), int sc __attribute__((unused))) -{ - __asm__ ( - "push { r4, r5, r6, lr }\n\t" - "\n\t" - "@ Input i is in r0:r1, and sc in r2.\n\t" - "@ Extract the sign bit, and compute the absolute value.\n\t" - "@ -> sign bit in r3, with value 0 or -1\n\t" - "asrs r3, r1, #31\n\t" - "eors r0, r3\n\t" - "eors r1, r3\n\t" - "subs r0, r3\n\t" - "sbcs r1, r3\n\t" - "\n\t" - "@ Scale exponent to account for the encoding; if the source is\n\t" - "@ zero or if the scaled exponent is negative, it is set to 32.\n\t" - "addw r2, r2, #1022\n\t" - "orrs r4, r0, r1\n\t" - "bics r4, r4, r2, asr #31\n\t" - "rsbs r5, r4, #0\n\t" - "orrs r4, r5\n\t" - "ands r2, r2, r4, asr #31\n\t" - "adds r2, #32\n\t" - "\n\t" - "@ Normalize value to a full 64-bit width, by shifting it left.\n\t" - "@ The shift count is subtracted from the exponent (in r2).\n\t" - "@ If the mantissa is 0, the exponent is set to 0.\n\t" - "\n\t" - "@ If top word is 0, replace with low word; otherwise, add 32 
to\n\t" - "@ the exponent.\n\t" - "rsbs r4, r1, #0\n\t" - "orrs r4, r1\n\t" - "eors r5, r0, r1\n\t" - "bics r5, r5, r4, asr #31\n\t" - "eors r1, r5\n\t" - "ands r0, r0, r4, asr #31\n\t" - "lsrs r4, r4, #31\n\t" - "adds r2, r2, r4, lsl #5\n\t" - "\n\t" - "@ Count leading zeros of r1 to finish the shift.\n\t" - "clz r4, r1\n\t" - "subs r2, r4\n\t" - "rsbs r5, r4, #32\n\t" - "lsls r1, r4\n\t" - "lsrs r5, r0, r5\n\t" - "lsls r0, r4\n\t" - "orrs r1, r5\n\t" - "\n\t" - "@ Clear the top bit; we know it's a 1 (unless the whole mantissa\n\t" - "@ was zero, but then it's still OK to clear it)\n\t" - "bfc r1, #31, #1\n\t" - "\n\t" - "@ Now shift right the value by 11 bits; this puts the value in\n\t" - "@ the 2^52..2^53-1 range. We also keep a copy of the pre-shift\n\t" - "@ low bits in r5.\n\t" - "movs r5, r0\n\t" - "lsrs r0, #11\n\t" - "orrs r0, r0, r1, lsl #21\n\t" - "lsrs r1, #11\n\t" - "\n\t" - "@ Also plug the exponent at the right place. This must be done\n\t" - "@ now so that, in case the rounding creates a carry, that carry\n\t" - "@ adds to the exponent, which would be exactly what we want at\n\t" - "@ that point.\n\t" - "orrs r1, r1, r2, lsl #20\n\t" - "\n\t" - "@ Rounding: we must add 1 to the mantissa in the following cases:\n\t" - "@ - bits 11 to 9 of r5 are '011', '110' or '111'\n\t" - "@ - bits 11 to 9 of r5 are '010' and one of the\n\t" - "@ bits 0 to 8 is non-zero\n\t" - "ubfx r6, r5, #0, #9\n\t" - "addw r6, r6, #511\n\t" - "orrs r5, r6\n\t" - "\n\t" - "ubfx r5, r5, #9, #3\n\t" - "movs r6, #0xC8\n\t" - "lsrs r6, r5\n\t" - "ands r6, #1\n\t" - "adds r0, r6\n\t" - "adcs r1, #0\n\t" - "\n\t" - "@ Put back the sign.\n\t" - "orrs r1, r1, r3, lsl #31\n\t" - "\n\t" - "pop { r4, r5, r6, pc}\n\t" - ); -} - -#else // yyyASM_CORTEXM4+0 - -fpr -fpr_scaled(int64_t i, int sc) -{ - /* - * To convert from int to float, we have to do the following: - * 1. Get the absolute value of the input, and its sign - * 2. Shift right or left the value as appropriate - * 3. 
Pack the result - * - * We can assume that the source integer is not -2^63. - */ - int s, e; - uint32_t t; - uint64_t m; - - /* - * Extract sign bit. - * We have: -i = 1 + ~i - */ - s = (int)((uint64_t)i >> 63); - i ^= -(int64_t)s; - i += s; - - /* - * For now we suppose that i != 0. - * Otherwise, we set m to i and left-shift it as much as needed - * to get a 1 in the top bit. We can do that in a logarithmic - * number of conditional shifts. - */ - m = (uint64_t)i; - e = 9 + sc; - FPR_NORM64(m, e); - - /* - * Now m is in the 2^63..2^64-1 range. We must divide it by 512; - * if one of the dropped bits is a 1, this should go into the - * "sticky bit". - */ - m |= ((uint32_t)m & 0x1FF) + 0x1FF; - m >>= 9; - - /* - * Corrective action: if i = 0 then all of the above was - * incorrect, and we clamp e and m down to zero. - */ - t = (uint32_t)((uint64_t)(i | -i) >> 63); - m &= -(uint64_t)t; - e &= -(int)t; - - /* - * Assemble back everything. The FPR() function will handle cases - * where e is too low. - */ - return FPR(s, e, m); -} - -#endif // yyyASM_CORTEXM4- - -#if FALCON_ASM_CORTEXM4 // yyyASM_CORTEXM4+1 - -// yyyPQCLEAN+0 -#if 0 -/* Debug code -- To get a printout of registers from a specific point - in ARM Cortex M4 assembly code, uncomment this code and add a - "bl DEBUG" call where wished for. 
*/ - -void -print_regs(uint32_t *rr, uint32_t flags) -{ - int i; - extern int printf(const char *fmt, ...); - - printf("\nRegs:\n"); - for (i = 0; i < 7; i ++) { - int j; - - j = i + 7; - printf(" %2d = %08X %2d = %08X\n", i, rr[i], j, rr[j]); - } - printf(" flags = %08X ", flags); - if ((flags >> 31) & 1) { - printf("N"); - } - if ((flags >> 30) & 1) { - printf("Z"); - } - if ((flags >> 29) & 1) { - printf("C"); - } - if ((flags >> 28) & 1) { - printf("V"); - } - if ((flags >> 27) & 1) { - printf("Q"); - } - printf("\n"); -} - -__attribute__((naked)) -void -DEBUG(void) -{ - __asm__ ( - "push { r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, lr }\n\t" - "mov r0, sp\n\t" - "mrs r1, apsr\n\t" - "bl print_regs\n\t" - "pop { r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, pc }\n\t" - ); -} -#endif -// yyyPQCLEAN- - -__attribute__((naked)) -fpr -fpr_add(fpr x __attribute__((unused)), fpr y __attribute__((unused))) -{ - __asm__ ( - "push { r4, r5, r6, r7, r8, r10, r11, lr }\n\t" - "\n\t" - "@ Make sure that the first operand (x) has the larger absolute\n\t" - "@ value. 
This guarantees that the exponent of y is less than\n\t" - "@ or equal to the exponent of x, and, if they are equal, then\n\t" - "@ the mantissa of y will not be greater than the mantissa of x.\n\t" - "@ However, if absolute values are equal and the sign of x is 1,\n\t" - "@ then we want to also swap the values.\n\t" - "ubfx r4, r1, #0, #31 @ top word without sign bit\n\t" - "ubfx r5, r3, #0, #31 @ top word without sign bit\n\t" - "subs r7, r0, r2 @ difference in r7:r4\n\t" - "sbcs r4, r5\n\t" - "orrs r7, r4\n\t" - "rsbs r5, r7, #0\n\t" - "orrs r7, r5 @ bit 31 of r7 is 0 iff difference is zero\n\t" - "bics r6, r1, r7\n\t" - "orrs r6, r4 @ bit 31 of r6 is 1 iff the swap must be done\n\t" - "\n\t" - "@ Conditional swap\n\t" - "eors r4, r0, r2\n\t" - "eors r5, r1, r3\n\t" - "ands r4, r4, r6, asr #31\n\t" - "ands r5, r5, r6, asr #31\n\t" - "eors r0, r4\n\t" - "eors r1, r5\n\t" - "eors r2, r4\n\t" - "eors r3, r5\n\t" - "\n\t" - "@ Extract mantissa of x into r0:r1, exponent in r4, sign in r5\n\t" - "ubfx r4, r1, #20, #11 @ Exponent in r4 (without sign)\n\t" - "addw r5, r4, #2047 @ Get a carry to test r4 for zero\n\t" - "lsrs r5, #11 @ r5 is the mantissa implicit high bit\n\t" - "bfc r1, #20, #11 @ Clear exponent bits (not the sign)\n\t" - "orrs r1, r1, r5, lsl #20 @ Set mantissa high bit\n\t" - "asrs r5, r1, #31 @ Get sign bit (sign-extended)\n\t" - "bfc r1, #31, #1 @ Clear the sign bit\n\t" - "\n\t" - "@ Extract mantissa of y into r2:r3, exponent in r6, sign in r7\n\t" - "ubfx r6, r3, #20, #11 @ Exponent in r6 (without sign)\n\t" - "addw r7, r6, #2047 @ Get a carry to test r6 for zero\n\t" - "lsrs r7, #11 @ r7 is the mantissa implicit high bit\n\t" - "bfc r3, #20, #11 @ Clear exponent bits (not the sign)\n\t" - "orrs r3, r3, r7, lsl #20 @ Set mantissa high bit\n\t" - "asrs r7, r3, #31 @ Get sign bit (sign-extended)\n\t" - "bfc r3, #31, #1 @ Clear the sign bit\n\t" - "\n\t" - "@ Scale mantissas up by three bits.\n\t" - "lsls r1, #3\n\t" - "orrs r1, r1, r0, lsr #29\n\t" - 
"lsls r0, #3\n\t" - "lsls r3, #3\n\t" - "orrs r3, r3, r2, lsr #29\n\t" - "lsls r2, #3\n\t" - "\n\t" - "@ x: exponent=r4, sign=r5, mantissa=r0:r1 (scaled up 3 bits)\n\t" - "@ y: exponent=r6, sign=r7, mantissa=r2:r3 (scaled up 3 bits)\n\t" - "\n\t" - "@ At that point, the exponent of x (in r4) is larger than that\n\t" - "@ of y (in r6). The difference is the amount of shifting that\n\t" - "@ should be done on y. If that amount is larger than 59 then\n\t" - "@ we clamp y to 0. We won't need y's exponent beyond that point,\n\t" - "@ so we store that shift count in r6.\n\t" - "subs r6, r4, r6\n\t" - "subs r8, r6, #60\n\t" - "ands r2, r2, r8, asr #31\n\t" - "ands r3, r3, r8, asr #31\n\t" - "\n\t" - "@ Shift right r2:r3 by r6 bits. The shift count is in the 0..59\n\t" - "@ range. r11 will be non-zero if and only if some non-zero bits\n\t" - "@ were dropped.\n\t" - "subs r8, r6, #32\n\t" - "bics r11, r2, r8, asr #31\n\t" - "ands r2, r2, r8, asr #31\n\t" - "bics r10, r3, r8, asr #31\n\t" - "orrs r2, r2, r10\n\t" - "ands r3, r3, r8, asr #31\n\t" - "ands r6, r6, #31\n\t" - "rsbs r8, r6, #32\n\t" - "lsls r10, r2, r8\n\t" - "orrs r11, r11, r10\n\t" - "lsrs r2, r2, r6\n\t" - "lsls r10, r3, r8\n\t" - "orrs r2, r2, r10\n\t" - "lsrs r3, r3, r6\n\t" - "\n\t" - "@ If r11 is non-zero then some non-zero bit was dropped and the\n\t" - "@ low bit of r2 must be forced to 1 ('sticky bit').\n\t" - "rsbs r6, r11, #0\n\t" - "orrs r6, r6, r11\n\t" - "orrs r2, r2, r6, lsr #31\n\t" - "\n\t" - "@ x: exponent=r4, sign=r5, mantissa=r0:r1 (scaled up 3 bits)\n\t" - "@ y: sign=r7, value=r2:r3 (scaled to same exponent as x)\n\t" - "\n\t" - "@ If x and y don't have the same sign, then we should negate r2:r3\n\t" - "@ (i.e. subtract the mantissa instead of adding it). Signs of x\n\t" - "@ and y are in r5 and r7, as full-width words. 
We won't need r7\n\t" - "@ afterwards.\n\t" - "eors r7, r5 @ r7 = -1 if y must be negated, 0 otherwise\n\t" - "eors r2, r7\n\t" - "eors r3, r7\n\t" - "subs r2, r7\n\t" - "sbcs r3, r7\n\t" - "\n\t" - "@ r2:r3 has been shifted, we can add to r0:r1.\n\t" - "adds r0, r2\n\t" - "adcs r1, r3\n\t" - "\n\t" - "@ result: exponent=r4, sign=r5, mantissa=r0:r1 (scaled up 3 bits)\n\t" - "\n\t" - "@ Normalize the result with some left-shifting to full 64-bit\n\t" - "@ width. Shift count goes to r2, and exponent (r4) is adjusted.\n\t" - "clz r2, r0\n\t" - "clz r3, r1\n\t" - "sbfx r6, r3, #5, #1\n\t" - "ands r2, r6\n\t" - "adds r2, r2, r3\n\t" - "subs r4, r4, r2\n\t" - "\n\t" - "@ Shift r0:r1 to the left by r2 bits.\n\t" - "subs r7, r2, #32\n\t" - "lsls r7, r0, r7\n\t" - "lsls r1, r1, r2\n\t" - "rsbs r6, r2, #32\n\t" - "orrs r1, r1, r7\n\t" - "lsrs r6, r0, r6\n\t" - "orrs r1, r1, r6\n\t" - "lsls r0, r0, r2\n\t" - "\n\t" - "@ The exponent of x was in r4. The left-shift operation has\n\t" - "@ subtracted some value from it, 8 in case the result has the\n\t" - "@ same exponent as x. However, the high bit of the mantissa will\n\t" - "@ add 1 to the exponent, so we only add back 7 (the exponent is\n\t" - "@ added in because rounding might have produced a carry, which\n\t" - "@ should then spill into the exponent).\n\t" - "adds r4, #7\n\t" - "\n\t" - "@ If the mantissa new mantissa is non-zero, then its bit 63 is\n\t" - "@ non-zero (thanks to the normalizing shift). Otherwise, that bit\n\t" - "@ is zero, and we should then set the exponent to zero as well.\n\t" - "ands r4, r4, r1, asr #31\n\t" - "\n\t" - "@ Shrink back the value to a 52-bit mantissa. 
This requires\n\t" - "@ right-shifting by 11 bits; we keep a copy of the pre-shift\n\t" - "@ low word in r3.\n\t" - "movs r3, r0\n\t" - "lsrs r0, #11\n\t" - "orrs r0, r0, r1, lsl #21\n\t" - "lsrs r1, #11\n\t" - "\n\t" - "@ Apply rounding.\n\t" - "ubfx r6, r3, #0, #9\n\t" - "addw r6, r6, #511\n\t" - "orrs r3, r6\n\t" - "ubfx r3, r3, #9, #3\n\t" - "movs r6, #0xC8\n\t" - "lsrs r6, r3\n\t" - "ands r6, #1\n\t" - "adds r0, r6\n\t" - "adcs r1, #0\n\t" - "\n\t" - "@Plug in the exponent with an addition.\n\t" - "adds r1, r1, r4, lsl #20\n\t" - "\n\t" - "@ If the new exponent is negative or zero, then it underflowed\n\t" - "@ and we must clear the whole mantissa and exponent.\n\t" - "rsbs r4, r4, #0\n\t" - "ands r0, r0, r4, asr #31\n\t" - "ands r1, r1, r4, asr #31\n\t" - "\n\t" - "@ Put back the sign. This is the sign of x: thanks to the\n\t" - "@ conditional swap at the start, this is always correct.\n\t" - "bfi r1, r5, #31, #1\n\t" - "\n\t" - "pop { r4, r5, r6, r7, r8, r10, r11, pc }\n\t" - ); -} - -#else // yyyASM_CORTEXM4+0 - -fpr -fpr_add(fpr x, fpr y) -{ - uint64_t m, xu, yu, za; - uint32_t cs; - int ex, ey, sx, sy, cc; - - /* - * Make sure that the first operand (x) has the larger absolute - * value. This guarantees that the exponent of y is less than - * or equal to the exponent of x, and, if they are equal, then - * the mantissa of y will not be greater than the mantissa of x. - * - * After this swap, the result will have the sign x, except in - * the following edge case: abs(x) = abs(y), and x and y have - * opposite sign bits; in that case, the result shall be +0 - * even if the sign bit of x is 1. To handle this case properly, - * we do the swap is abs(x) = abs(y) AND the sign of x is 1. - */ - m = ((uint64_t)1 << 63) - 1; - za = (x & m) - (y & m); - cs = (uint32_t)(za >> 63) - | ((1U - (uint32_t)(-za >> 63)) & (uint32_t)(x >> 63)); - m = (x ^ y) & -(uint64_t)cs; - x ^= m; - y ^= m; - - /* - * Extract sign bits, exponents and mantissas. 
The mantissas are - * scaled up to 2^55..2^56-1, and the exponent is unbiased. If - * an operand is zero, its mantissa is set to 0 at this step, and - * its exponent will be -1078. - */ - ex = (int)(x >> 52); - sx = ex >> 11; - ex &= 0x7FF; - m = (uint64_t)(uint32_t)((ex + 0x7FF) >> 11) << 52; - xu = ((x & (((uint64_t)1 << 52) - 1)) | m) << 3; - ex -= 1078; - ey = (int)(y >> 52); - sy = ey >> 11; - ey &= 0x7FF; - m = (uint64_t)(uint32_t)((ey + 0x7FF) >> 11) << 52; - yu = ((y & (((uint64_t)1 << 52) - 1)) | m) << 3; - ey -= 1078; - - /* - * x has the larger exponent; hence, we only need to right-shift y. - * If the shift count is larger than 59 bits then we clamp the - * value to zero. - */ - cc = ex - ey; - yu &= -(uint64_t)((uint32_t)(cc - 60) >> 31); - cc &= 63; - - /* - * The lowest bit of yu is "sticky". - */ - m = fpr_ulsh(1, cc) - 1; - yu |= (yu & m) + m; - yu = fpr_ursh(yu, cc); - - /* - * If the operands have the same sign, then we add the mantissas; - * otherwise, we subtract the mantissas. - */ - xu += yu - ((yu << 1) & -(uint64_t)(sx ^ sy)); - - /* - * The result may be smaller, or slightly larger. We normalize - * it to the 2^63..2^64-1 range (if xu is zero, then it stays - * at zero). - */ - FPR_NORM64(xu, ex); - - /* - * Scale down the value to 2^54..s^55-1, handling the last bit - * as sticky. - */ - xu |= ((uint32_t)xu & 0x1FF) + 0x1FF; - xu >>= 9; - ex += 9; - - /* - * In general, the result has the sign of x. However, if the - * result is exactly zero, then the following situations may - * be encountered: - * x > 0, y = -x -> result should be +0 - * x < 0, y = -x -> result should be +0 - * x = +0, y = +0 -> result should be +0 - * x = -0, y = +0 -> result should be +0 - * x = +0, y = -0 -> result should be +0 - * x = -0, y = -0 -> result should be -0 - * - * But at the conditional swap step at the start of the - * function, we ensured that if abs(x) = abs(y) and the - * sign of x was 1, then x and y were swapped. 
Thus, the - * two following cases cannot actually happen: - * x < 0, y = -x - * x = -0, y = +0 - * In all other cases, the sign bit of x is conserved, which - * is what the FPR() function does. The FPR() function also - * properly clamps values to zero when the exponent is too - * low, but does not alter the sign in that case. - */ - return FPR(sx, ex, xu); -} - -#endif // yyyASM_CORTEXM4- - -#if FALCON_ASM_CORTEXM4 // yyyASM_CORTEXM4+1 - -__attribute__((naked)) -fpr -fpr_mul(fpr x __attribute__((unused)), fpr y __attribute__((unused))) -{ - __asm__ ( - "push { r4, r5, r6, r7, r8, r10, r11, lr }\n\t" - "\n\t" - "@ Extract mantissas: x.m = r4:r5, y.m = r6:r7\n\t" - "@ r4 and r6 contain only 25 bits each.\n\t" - "bics r4, r0, #0xFE000000\n\t" - "lsls r5, r1, #7\n\t" - "orrs r5, r5, r0, lsr #25\n\t" - "orrs r5, r5, #0x08000000\n\t" - "bics r5, r5, #0xF0000000\n\t" - "bics r6, r2, #0xFE000000\n\t" - "lsls r7, r3, #7\n\t" - "orrs r7, r7, r2, lsr #25\n\t" - "orrs r7, r7, #0x08000000\n\t" - "bics r7, r7, #0xF0000000\n\t" - "\n\t" - "@ Perform product. Values are in the 2^52..2^53-1 range, so\n\t" - "@ the product is at most 106-bit long. Of the low 50 bits,\n\t" - "@ we only want to know if they are all zeros or not. 
Here,\n\t" - "@ we get the top 56 bits in r10:r11, and r8 will be non-zero\n\t" - "@ if and only if at least one of the low 50 bits is non-zero.\n\t" - "umull r8, r10, r4, r6 @ x0*y0\n\t" - "lsls r10, #7\n\t" - "orrs r10, r10, r8, lsr #25\n\t" - "eors r11, r11\n\t" - "umlal r10, r11, r4, r7 @ x0*y1\n\t" - "umlal r10, r11, r5, r6 @ x1*y0\n\t" - "orrs r8, r8, r10, lsl #7\n\t" - "lsrs r10, #25\n\t" - "orrs r10, r10, r11, lsl #7\n\t" - "eors r11, r11\n\t" - "umlal r10, r11, r5, r7 @ x1*y1\n\t" - "\n\t" - "@ Now r0, r2, r4, r5, r6 and r7 are free.\n\t" - "@ If any of the low 50 bits was non-zero, then we force the\n\t" - "@ low bit of r10 to 1.\n\t" - "rsbs r4, r8, #0\n\t" - "orrs r8, r8, r4\n\t" - "orrs r10, r10, r8, lsr #31\n\t" - "\n\t" - "@ r8 is free.\n\t" - "@ r10:r11 contains the product in the 2^54..2^56-1 range. We\n\t" - "@ normalize it to 2^54..2^55-1 (into r6:r7) with a conditional\n\t" - "@ shift (low bit is sticky). r5 contains -1 if the shift was done,\n\t" - "@ 0 otherwise.\n\t" - "ands r6, r10, #1\n\t" - "lsrs r5, r11, #23\n\t" - "rsbs r5, r5, #0\n\t" - "orrs r6, r6, r10, lsr #1\n\t" - "orrs r6, r6, r11, lsl #31\n\t" - "lsrs r7, r11, #1\n\t" - "eors r10, r10, r6\n\t" - "eors r11, r11, r7\n\t" - "bics r10, r10, r5\n\t" - "bics r11, r11, r5\n\t" - "eors r6, r6, r10\n\t" - "eors r7, r7, r11\n\t" - "\n\t" - "@ Compute aggregate exponent: ex + ey - 1023 + w\n\t" - "@ (where w = 1 if the conditional shift was done, 0 otherwise)\n\t" - "@ But we subtract 1 because the injection of the mantissa high\n\t" - "@ bit will increment the exponent by 1.\n\t" - "lsls r0, r1, #1\n\t" - "lsls r2, r3, #1\n\t" - "lsrs r0, #21\n\t" - "addw r4, r0, #0x7FF @ save ex + 2047 in r4\n\t" - "lsrs r2, #21\n\t" - "addw r8, r2, #0x7FF @ save ey + 2047 in r8\n\t" - "adds r2, r0\n\t" - "subw r2, r2, #1024\n\t" - "subs r2, r5\n\t" - "\n\t" - "@ r5 is free.\n\t" - "@ Also, if either of the source exponents is 0, or the result\n\t" - "@ exponent is 0 or negative, then the result is zero 
and the\n\t" - "@ mantissa and the exponent shall be clamped to zero. Since\n\t" - "@ r2 contains the result exponent minus 1, we test on r2\n\t" - "@ being strictly negative.\n\t" - "ands r4, r8 @ if bit 11 = 0 then one of the exponents was 0\n\t" - "mvns r5, r2\n\t" - "ands r5, r5, r4, lsl #20\n\t" - "ands r2, r2, r5, asr #31\n\t" - "ands r6, r6, r5, asr #31\n\t" - "ands r7, r7, r5, asr #31\n\t" - "\n\t" - "@ Sign is the XOR of the sign of the operands. This is true in\n\t" - "@ all cases, including very small results (exponent underflow)\n\t" - "@ and zeros.\n\t" - "eors r1, r3\n\t" - "bfc r1, #0, #31\n\t" - "\n\t" - "@ Plug in the exponent.\n\t" - "bfi r1, r2, #20, #11\n\t" - "\n\t" - "@ r2 and r3 are free.\n\t" - "@ Shift back to the normal 53-bit mantissa, with rounding.\n\t" - "@ Mantissa goes into r0:r1. For r1, we must use an addition\n\t" - "@ because the rounding may have triggered a carry, that should\n\t" - "@ be added to the exponent.\n\t" - "movs r4, r6\n\t" - "lsrs r0, r6, #2\n\t" - "orrs r0, r0, r7, lsl #30\n\t" - "adds r1, r1, r7, lsr #2\n\t" - "ands r4, #0x7\n\t" - "movs r3, #0xC8\n\t" - "lsrs r3, r4\n\t" - "ands r3, #1\n\t" - "adds r0, r3\n\t" - "adcs r1, #0\n\t" - "\n\t" - "pop { r4, r5, r6, r7, r8, r10, r11, pc }\n\t" - ); -} - -#else // yyyASM_CORTEXM4+0 - -fpr -fpr_mul(fpr x, fpr y) -{ - uint64_t xu, yu, w, zu, zv; - uint32_t x0, x1, y0, y1, z0, z1, z2; - int ex, ey, d, e, s; - - /* - * Extract absolute values as scaled unsigned integers. We - * don't extract exponents yet. - */ - xu = (x & (((uint64_t)1 << 52) - 1)) | ((uint64_t)1 << 52); - yu = (y & (((uint64_t)1 << 52) - 1)) | ((uint64_t)1 << 52); - - /* - * We have two 53-bit integers to multiply; we need to split - * each into a lower half and a upper half. Moreover, we - * prefer to have lower halves to be of 25 bits each, for - * reasons explained later on. 
- */ - x0 = (uint32_t)xu & 0x01FFFFFF; - x1 = (uint32_t)(xu >> 25); - y0 = (uint32_t)yu & 0x01FFFFFF; - y1 = (uint32_t)(yu >> 25); - w = (uint64_t)x0 * (uint64_t)y0; - z0 = (uint32_t)w & 0x01FFFFFF; - z1 = (uint32_t)(w >> 25); - w = (uint64_t)x0 * (uint64_t)y1; - z1 += (uint32_t)w & 0x01FFFFFF; - z2 = (uint32_t)(w >> 25); - w = (uint64_t)x1 * (uint64_t)y0; - z1 += (uint32_t)w & 0x01FFFFFF; - z2 += (uint32_t)(w >> 25); - zu = (uint64_t)x1 * (uint64_t)y1; - z2 += (z1 >> 25); - z1 &= 0x01FFFFFF; - zu += z2; - - /* - * Since xu and yu are both in the 2^52..2^53-1 range, the - * product is in the 2^104..2^106-1 range. We first reassemble - * it and round it into the 2^54..2^56-1 range; the bottom bit - * is made "sticky". Since the low limbs z0 and z1 are 25 bits - * each, we just take the upper part (zu), and consider z0 and - * z1 only for purposes of stickiness. - * (This is the reason why we chose 25-bit limbs above.) - */ - zu |= ((z0 | z1) + 0x01FFFFFF) >> 25; - - /* - * We normalize zu to the 2^54..s^55-1 range: it could be one - * bit too large at this point. This is done with a conditional - * right-shift that takes into account the sticky bit. - */ - zv = (zu >> 1) | (zu & 1); - w = zu >> 55; - zu ^= (zu ^ zv) & -w; - - /* - * Get the aggregate scaling factor: - * - * - Each exponent is biased by 1023. - * - * - Integral mantissas are scaled by 2^52, hence an - * extra 52 bias for each exponent. - * - * - However, we right-shifted z by 50 bits, and then - * by 0 or 1 extra bit (depending on the value of w). - * - * In total, we must add the exponents, then subtract - * 2 * (1023 + 52), then add 50 + w. - */ - ex = (int)((x >> 52) & 0x7FF); - ey = (int)((y >> 52) & 0x7FF); - e = ex + ey - 2100 + (int)w; - - /* - * Sign bit is the XOR of the operand sign bits. - */ - s = (int)((x ^ y) >> 63); - - /* - * Corrective actions for zeros: if either of the operands is - * zero, then the computations above were wrong. Test for zero - * is whether ex or ey is zero. 
We just have to set the mantissa - * (zu) to zero, the FPR() function will normalize e. - */ - d = ((ex + 0x7FF) & (ey + 0x7FF)) >> 11; - zu &= -(uint64_t)d; - - /* - * FPR() packs the result and applies proper rounding. - */ - return FPR(s, e, zu); -} - -#endif // yyyASM_CORTEXM4- - -#if FALCON_ASM_CORTEXM4 // yyyASM_CORTEXM4+1 - -__attribute__((naked)) -fpr -fpr_div(fpr x __attribute__((unused)), fpr y __attribute__((unused))) -{ - __asm__ ( - "push { r4, r5, r6, r7, r8, r10, r11, lr }\n\t" - - "@ Extract mantissas of x and y, in r0:r4 and r2:r5, respectively.\n\t" - "@ We don't touch r1 and r3 as they contain the exponents and\n\t" - "@ signs, which we'll need later on.\n\t" - "ubfx r4, r1, #0, #20\n\t" - "ubfx r5, r3, #0, #20\n\t" - "orrs r4, r4, #0x00100000\n\t" - "orrs r5, r5, #0x00100000\n\t" - "\n\t" - "@ Perform bit-by-bit division. We want a 56-bit result in r8:r10\n\t" - "@ (low bit is 0). Bits come from the carry flag and are\n\t" - "@ injected with rrx, i.e. in position 31; we thus get bits in\n\t" - "@ the reverse order. 
Bits accumulate in r8; after the first 24\n\t" - "@ bits, we move the quotient bits to r10.\n\t" - "eors r8, r8\n\t" - "\n\t" - -#define DIVSTEP \ - "subs r6, r0, r2\n\t" \ - "sbcs r7, r4, r5\n\t" \ - "rrx r8, r8\n\t" \ - "ands r6, r2, r8, asr #31\n\t" \ - "ands r7, r5, r8, asr #31\n\t" \ - "subs r0, r6\n\t" \ - "sbcs r4, r7\n\t" \ - "adds r0, r0, r0\n\t" \ - "adcs r4, r4, r4\n\t" - -#define DIVSTEP4 DIVSTEP DIVSTEP DIVSTEP DIVSTEP -#define DIVSTEP8 DIVSTEP4 DIVSTEP4 - - DIVSTEP8 - DIVSTEP8 - DIVSTEP8 - - "\n\t" - "@ We have the first 24 bits of the quotient, move them to r10.\n\t" - "rbit r10, r8\n\t" - "\n\t" - - DIVSTEP8 - DIVSTEP8 - DIVSTEP8 - DIVSTEP4 DIVSTEP DIVSTEP DIVSTEP - -#undef DIVSTEP -#undef DIVSTEP4 -#undef DIVSTEP8 - - "\n\t" - "@ Lowest bit will be set if remainder is non-zero at this point\n\t" - "@ (this is the 'sticky' bit).\n\t" - "subs r0, #1\n\t" - "sbcs r4, #0\n\t" - "rrx r8, r8\n\t" - "\n\t" - "@ We now have the next (low) 32 bits of the quotient.\n\t" - "rbit r8, r8\n\t" - "\n\t" - "@ Since both operands had their top bit set, we know that the\n\t" - "@ result at this point is in 2^54..2^56-1. We scale it down\n\t" - "@ to 2^54..2^55-1 with a conditional shift. We also write the\n\t" - "@ result in r4:r5. 
If the shift is done, r6 will contain -1.\n\t" - "ands r4, r8, #1\n\t" - "lsrs r6, r10, #23\n\t" - "rsbs r6, r6, #0\n\t" - "orrs r4, r4, r8, lsr #1\n\t" - "orrs r4, r4, r10, lsl #31\n\t" - "lsrs r5, r10, #1\n\t" - "eors r8, r8, r4\n\t" - "eors r10, r10, r5\n\t" - "bics r8, r8, r6\n\t" - "bics r10, r10, r6\n\t" - "eors r4, r4, r8\n\t" - "eors r5, r5, r10\n\t" - "\n\t" - "@ Compute aggregate exponent: ex - ey + 1022 + w\n\t" - "@ (where w = 1 if the conditional shift was done, 0 otherwise)\n\t" - "@ But we subtract 1 because the injection of the mantissa high\n\t" - "@ bit will increment the exponent by 1.\n\t" - "lsls r0, r1, #1\n\t" - "lsls r2, r3, #1\n\t" - "lsrs r0, r0, #21\n\t" - "addw r7, r0, #0x7FF @ save ex + 2047 in r7\n\t" - "subs r0, r0, r2, lsr #21\n\t" - "addw r0, r0, #1021\n\t" - "subs r0, r6\n\t" - "\n\t" - "@ If the x operand was zero, then the computation was wrong and\n\t" - "@ the result is zero. Also, if the result exponent is zero or\n\t" - "@ negative, then the mantissa shall be clamped to zero. Since r0\n\t" - "@ contains the result exponent minus 1, we test on r0 being\n\t" - "@ strictly negative.\n\t" - "mvns r2, r0\n\t" - "ands r2, r2, r7, lsl #20\n\t" - "ands r0, r0, r2, asr #31\n\t" - "ands r4, r4, r2, asr #31\n\t" - "ands r5, r5, r2, asr #31\n\t" - "\n\t" - "@ Sign is the XOR of the sign of the operands. This is true in\n\t" - "@ all cases, including very small results (exponent underflow)\n\t" - "@ and zeros.\n\t" - "eors r1, r3\n\t" - "bfc r1, #0, #31\n\t" - "\n\t" - "@ Plug in the exponent.\n\t" - "bfi r1, r0, #20, #11\n\t" - "\n\t" - "@ Shift back to the normal 53-bit mantissa, with rounding.\n\t" - "@ Mantissa goes into r0:r1. 
For r1, we must use an addition\n\t" - "@ because the rounding may have triggered a carry, that should\n\t" - "@ be added to the exponent.\n\t" - "movs r6, r4\n\t" - "lsrs r0, r4, #2\n\t" - "orrs r0, r0, r5, lsl #30\n\t" - "adds r1, r1, r5, lsr #2\n\t" - "ands r6, #0x7\n\t" - "movs r3, #0xC8\n\t" - "lsrs r3, r6\n\t" - "ands r3, #1\n\t" - "adds r0, r3\n\t" - "adcs r1, #0\n\t" - "\n\t" - "pop { r4, r5, r6, r7, r8, r10, r11, pc }\n\t" - ); -} - -#else // yyyASM_CORTEXM4+0 - -fpr -fpr_div(fpr x, fpr y) -{ - uint64_t xu, yu, q, q2, w; - int i, ex, ey, e, d, s; - - /* - * Extract mantissas of x and y (unsigned). - */ - xu = (x & (((uint64_t)1 << 52) - 1)) | ((uint64_t)1 << 52); - yu = (y & (((uint64_t)1 << 52) - 1)) | ((uint64_t)1 << 52); - - /* - * Perform bit-by-bit division of xu by yu. We run it for 55 bits. - */ - q = 0; - for (i = 0; i < 55; i ++) { - /* - * If yu is less than or equal xu, then subtract it and - * push a 1 in the quotient; otherwise, leave xu unchanged - * and push a 0. - */ - uint64_t b; - - b = ((xu - yu) >> 63) - 1; - xu -= b & yu; - q |= b & 1; - xu <<= 1; - q <<= 1; - } - - /* - * We got 55 bits in the quotient, followed by an extra zero. We - * want that 56th bit to be "sticky": it should be a 1 if and - * only if the remainder (xu) is non-zero. - */ - q |= (xu | -xu) >> 63; - - /* - * Quotient is at most 2^56-1. Its top bit may be zero, but in - * that case the next-to-top bit will be a one, since the - * initial xu and yu were both in the 2^52..2^53-1 range. - * We perform a conditional shift to normalize q to the - * 2^54..2^55-1 range (with the bottom bit being sticky). - */ - q2 = (q >> 1) | (q & 1); - w = q >> 55; - q ^= (q ^ q2) & -w; - - /* - * Extract exponents to compute the scaling factor: - * - * - Each exponent is biased and we scaled them up by - * 52 bits; but these biases will cancel out. - * - * - The division loop produced a 55-bit shifted result, - * so we must scale it down by 55 bits. 
- * - * - If w = 1, we right-shifted the integer by 1 bit, - * hence we must add 1 to the scaling. - */ - ex = (int)((x >> 52) & 0x7FF); - ey = (int)((y >> 52) & 0x7FF); - e = ex - ey - 55 + (int)w; - - /* - * Sign is the XOR of the signs of the operands. - */ - s = (int)((x ^ y) >> 63); - - /* - * Corrective actions for zeros: if x = 0, then the computation - * is wrong, and we must clamp e and q to 0. We do not care - * about the case y = 0 (as per assumptions in this module, - * the caller does not perform divisions by zero). - */ - d = (ex + 0x7FF) >> 11; - s &= d; - e &= -d; - q &= -(uint64_t)d; - - /* - * FPR() packs the result and applies proper rounding. - */ - return FPR(s, e, q); -} - -#endif // yyyASM_CORTEXM4- - -#if FALCON_ASM_CORTEXM4 // yyyASM_CORTEXM4+1 - -__attribute__((naked)) -fpr -fpr_sqrt(fpr x __attribute__((unused))) -{ - __asm__ ( - "push { r4, r5, r6, r7, r8, r10, r11, lr }\n\t" - "\n\t" - "@ Extract mantissa (r0:r1) and exponent (r2). We assume that the\n\t" - "@ sign is positive. 
If the source is zero, then the mantissa is\n\t" - "@ set to 0.\n\t" - "lsrs r2, r1, #20\n\t" - "bfc r1, #20, #12\n\t" - "addw r3, r2, #0x7FF\n\t" - "subw r2, r2, #1023\n\t" - "lsrs r3, r3, #11\n\t" - "orrs r1, r1, r3, lsl #20\n\t" - "\n\t" - "@ If the exponent is odd, then multiply mantissa by 2 and subtract\n\t" - "@ 1 from the exponent.\n\t" - "ands r3, r2, #1\n\t" - "subs r2, r2, r3\n\t" - "rsbs r3, r3, #0\n\t" - "ands r4, r1, r3\n\t" - "ands r3, r0\n\t" - "adds r0, r3\n\t" - "adcs r1, r4\n\t" - "\n\t" - "@ Left-shift the mantissa by 9 bits to put it in the\n\t" - "@ 2^61..2^63-1 range (unless it is exactly 0).\n\t" - "lsls r1, r1, #9\n\t" - "orrs r1, r1, r0, lsr #23\n\t" - "lsls r0, r0, #9\n\t" - "\n\t" - "@ Compute the square root bit-by-bit.\n\t" - "@ There are 54 iterations; first 30 can work on top word only.\n\t" - "@ q = r3 (bit-reversed)\n\t" - "@ s = r5\n\t" - "eors r3, r3\n\t" - "eors r5, r5\n\t" - -#define SQRT_STEP_HI(bit) \ - "orrs r6, r5, #(1 << (" #bit "))\n\t" \ - "subs r7, r1, r6\n\t" \ - "rrx r3, r3\n\t" \ - "ands r6, r6, r3, asr #31\n\t" \ - "subs r1, r1, r6\n\t" \ - "lsrs r6, r3, #31\n\t" \ - "orrs r5, r5, r6, lsl #((" #bit ") + 1)\n\t" \ - "adds r0, r0\n\t" \ - "adcs r1, r1\n\t" - -#define SQRT_STEP_HIx5(b) \ - SQRT_STEP_HI((b)+4) \ - SQRT_STEP_HI((b)+3) \ - SQRT_STEP_HI((b)+2) \ - SQRT_STEP_HI((b)+1) \ - SQRT_STEP_HI(b) - - SQRT_STEP_HIx5(25) - SQRT_STEP_HIx5(20) - SQRT_STEP_HIx5(15) - SQRT_STEP_HIx5(10) - SQRT_STEP_HIx5(5) - SQRT_STEP_HIx5(0) - -#undef SQRT_STEP_HI -#undef SQRT_STEP_HIx5 - - "@ Top 30 bits of the result must be reversed: they were\n\t" - "@ accumulated with rrx (hence from the top bit).\n\t" - "rbit r3, r3\n\t" - "\n\t" - "@ For the next 24 iterations, we must use two-word operations.\n\t" - "@ bits of q now accumulate in r4\n\t" - "@ s is in r6:r5\n\t" - "eors r4, r4\n\t" - "eors r6, r6\n\t" - "\n\t" - "@ First iteration is special because the potential bit goes into\n\t" - "@ r5, not r6.\n\t" - "orrs r7, r6, #(1 << 
31)\n\t" - "subs r8, r0, r7\n\t" - "sbcs r10, r1, r5\n\t" - "rrx r4, r4\n\t" - "ands r7, r7, r4, asr #31\n\t" - "ands r8, r5, r4, asr #31\n\t" - "subs r0, r0, r7\n\t" - "sbcs r1, r1, r8\n\t" - "lsrs r7, r4, #31\n\t" - "orrs r5, r5, r4, lsr #31\n\t" - "adds r0, r0\n\t" - "adcs r1, r1\n\t" - -#define SQRT_STEP_LO(bit) \ - "orrs r7, r6, #(1 << (" #bit "))\n\t" \ - "subs r8, r0, r7\n\t" \ - "sbcs r10, r1, r5\n\t" \ - "rrx r4, r4\n\t" \ - "ands r7, r7, r4, asr #31\n\t" \ - "ands r8, r5, r4, asr #31\n\t" \ - "subs r0, r0, r7\n\t" \ - "sbcs r1, r1, r8\n\t" \ - "lsrs r7, r4, #31\n\t" \ - "orrs r6, r6, r7, lsl #((" #bit ") + 1)\n\t" \ - "adds r0, r0\n\t" \ - "adcs r1, r1\n\t" - -#define SQRT_STEP_LOx4(b) \ - SQRT_STEP_LO((b)+3) \ - SQRT_STEP_LO((b)+2) \ - SQRT_STEP_LO((b)+1) \ - SQRT_STEP_LO(b) - - SQRT_STEP_LO(30) - SQRT_STEP_LO(29) - SQRT_STEP_LO(28) - SQRT_STEP_LOx4(24) - SQRT_STEP_LOx4(20) - SQRT_STEP_LOx4(16) - SQRT_STEP_LOx4(12) - SQRT_STEP_LOx4(8) - -#undef SQRT_STEP_LO -#undef SQRT_STEP_LOx4 - - "@ Put low 24 bits in the right order.\n\t" - "rbit r4, r4\n\t" - "\n\t" - "@ We have a 54-bit result; compute the 55-th bit as the 'sticky'\n\t" - "@ bit: it is non-zero if and only if r0:r1 is non-zero. We put the\n\t" - "@ three low bits (including the sticky bit) in r5.\n\t" - "orrs r0, r1\n\t" - "rsbs r1, r0, #0\n\t" - "orrs r0, r1\n\t" - "lsls r5, r4, #1\n\t" - "orrs r5, r5, r0, lsr #31\n\t" - "ands r5, #0x7\n\t" - "\n\t" - "@ Compute the rounding: r6 is set to 0 or 1, and will be added\n\t" - "@ to the mantissa.\n\t" - "movs r6, #0xC8\n\t" - "lsrs r6, r5\n\t" - "ands r6, #1\n\t" - "\n\t" - "@ Put the mantissa (53 bits, in the 2^52..2^53-1 range) in r0:r1\n\t" - "@ (rounding not applied yet).\n\t" - "lsrs r0, r4, #1\n\t" - "orrs r0, r0, r3, lsl #23\n\t" - "lsrs r1, r3, #9\n\t" - "\n\t" - "@ Compute new exponent. This is half the old one (then reencoded\n\t" - "@ by adding 1023). Exception: if the mantissa is zero, then the\n\t" - "@ encoded exponent is set to 0. 
At that point, if the mantissa\n\t" - "@ is non-zero, then its high bit (bit 52, i.e. bit 20 of r1) is\n\t" - "@ non-zero. Note that the exponent cannot go out of range.\n\t" - "lsrs r2, r2, #1\n\t" - "addw r2, r2, #1023\n\t" - "lsrs r5, r1, #20\n\t" - "rsbs r5, r5, #0\n\t" - "ands r2, r5\n\t" - "\n\t" - "@ Place exponent. This overwrites the high bit of the mantissa.\n\t" - "bfi r1, r2, #20, #11\n\t" - "\n\t" - "@ Apply rounding. This may create a carry that will spill into\n\t" - "@ the exponent, which is exactly what should be done in that case\n\t" - "@ (i.e. increment the exponent).\n\t" - "adds r0, r0, r6\n\t" - "adcs r1, r1, #0\n\t" - "\n\t" - "pop { r4, r5, r6, r7, r8, r10, r11, pc }\n\t" - ); -} - -#else // yyyASM_CORTEXM4+0 - -fpr -fpr_sqrt(fpr x) -{ - uint64_t xu, q, s, r; - int ex, e; - - /* - * Extract the mantissa and the exponent. We don't care about - * the sign: by assumption, the operand is nonnegative. - * We want the "true" exponent corresponding to a mantissa - * in the 1..2 range. - */ - xu = (x & (((uint64_t)1 << 52) - 1)) | ((uint64_t)1 << 52); - ex = (int)((x >> 52) & 0x7FF); - e = ex - 1023; - - /* - * If the exponent is odd, double the mantissa and decrement - * the exponent. The exponent is then halved to account for - * the square root. - */ - xu += xu & -(uint64_t)(e & 1); - e >>= 1; - - /* - * Double the mantissa. - */ - xu <<= 1; - - /* - * We now have a mantissa in the 2^53..2^55-1 range. It - * represents a value between 1 (inclusive) and 4 (exclusive) - * in fixed point notation (with 53 fractional bits). We - * compute the square root bit by bit. - */ - q = 0; - s = 0; - r = (uint64_t)1 << 53; - for (int i = 0; i < 54; i ++) { - uint64_t t, b; - - t = s + r; - b = ((xu - t) >> 63) - 1; - s += (r << 1) & b; - xu -= t & b; - q += r & b; - xu <<= 1; - r >>= 1; - } - - /* - * Now, q is a rounded-low 54-bit value, with a leading 1, - * 52 fractional digits, and an additional guard bit. 
We add - * an extra sticky bit to account for what remains of the operand. - */ - q <<= 1; - q |= (xu | -xu) >> 63; - - /* - * Result q is in the 2^54..2^55-1 range; we bias the exponent - * by 54 bits (the value e at that point contains the "true" - * exponent, but q is now considered an integer, i.e. scaled - * up. - */ - e -= 54; - - /* - * Corrective action for an operand of value zero. - */ - q &= -(uint64_t)((ex + 0x7FF) >> 11); - - /* - * Apply rounding and back result. - */ - return FPR(0, e, q); -} - -#endif // yyyASM_CORTEXM4- - -uint64_t -fpr_expm_p63(fpr x, fpr ccs) -{ - /* - * Polynomial approximation of exp(-x) is taken from FACCT: - * https://eprint.iacr.org/2018/1234 - * Specifically, values are extracted from the implementation - * referenced from the FACCT article, and available at: - * https://github.com/raykzhao/gaussian - * Here, the coefficients have been scaled up by 2^63 and - * converted to integers. - * - * Tests over more than 24 billions of random inputs in the - * 0..log(2) range have never shown a deviation larger than - * 2^(-50) from the true mathematical value. - */ - static const uint64_t C[] = { - 0x00000004741183A3u, - 0x00000036548CFC06u, - 0x0000024FDCBF140Au, - 0x0000171D939DE045u, - 0x0000D00CF58F6F84u, - 0x000680681CF796E3u, - 0x002D82D8305B0FEAu, - 0x011111110E066FD0u, - 0x0555555555070F00u, - 0x155555555581FF00u, - 0x400000000002B400u, - 0x7FFFFFFFFFFF4800u, - 0x8000000000000000u - }; - - uint64_t z, y; - unsigned u; - uint32_t z0, z1, y0, y1; - uint64_t a, b; - - y = C[0]; - z = (uint64_t)fpr_trunc(fpr_mul(x, fpr_ptwo63)) << 1; - for (u = 1; u < (sizeof C) / sizeof(C[0]); u ++) { - /* - * Compute product z * y over 128 bits, but keep only - * the top 64 bits. 
- * - * TODO: On some architectures/compilers we could use - * some intrinsics (__umulh() on MSVC) or other compiler - * extensions (unsigned __int128 on GCC / Clang) for - * improved speed; however, most 64-bit architectures - * also have appropriate IEEE754 floating-point support, - * which is better. - */ - uint64_t c; - - z0 = (uint32_t)z; - z1 = (uint32_t)(z >> 32); - y0 = (uint32_t)y; - y1 = (uint32_t)(y >> 32); - a = ((uint64_t)z0 * (uint64_t)y1) - + (((uint64_t)z0 * (uint64_t)y0) >> 32); - b = ((uint64_t)z1 * (uint64_t)y0); - c = (a >> 32) + (b >> 32); - c += (((uint64_t)(uint32_t)a + (uint64_t)(uint32_t)b) >> 32); - c += (uint64_t)z1 * (uint64_t)y1; - y = C[u] - c; - } - - /* - * The scaling factor must be applied at the end. Since y is now - * in fixed-point notation, we have to convert the factor to the - * same format, and do an extra integer multiplication. - */ - z = (uint64_t)fpr_trunc(fpr_mul(ccs, fpr_ptwo63)) << 1; - z0 = (uint32_t)z; - z1 = (uint32_t)(z >> 32); - y0 = (uint32_t)y; - y1 = (uint32_t)(y >> 32); - a = ((uint64_t)z0 * (uint64_t)y1) - + (((uint64_t)z0 * (uint64_t)y0) >> 32); - b = ((uint64_t)z1 * (uint64_t)y0); - y = (a >> 32) + (b >> 32); - y += (((uint64_t)(uint32_t)a + (uint64_t)(uint32_t)b) >> 32); - y += (uint64_t)z1 * (uint64_t)y1; - - return y; -} - -const fpr fpr_gm_tab[] = { - 0, 0, - 9223372036854775808U, 4607182418800017408U, - 4604544271217802189U, 4604544271217802189U, - 13827916308072577997U, 4604544271217802189U, - 4606496786581982534U, 4600565431771507043U, - 13823937468626282851U, 4606496786581982534U, - 4600565431771507043U, 4606496786581982534U, - 13829868823436758342U, 4600565431771507043U, - 4607009347991985328U, 4596196889902818827U, - 13819568926757594635U, 4607009347991985328U, - 4603179351334086856U, 4605664432017547683U, - 13829036468872323491U, 4603179351334086856U, - 4605664432017547683U, 4603179351334086856U, - 13826551388188862664U, 4605664432017547683U, - 4596196889902818827U, 4607009347991985328U, - 
13830381384846761136U, 4596196889902818827U, - 4607139046673687846U, 4591727299969791020U, - 13815099336824566828U, 4607139046673687846U, - 4603889326261607894U, 4605137878724712257U, - 13828509915579488065U, 4603889326261607894U, - 4606118860100255153U, 4602163548591158843U, - 13825535585445934651U, 4606118860100255153U, - 4598900923775164166U, 4606794571824115162U, - 13830166608678890970U, 4598900923775164166U, - 4606794571824115162U, 4598900923775164166U, - 13822272960629939974U, 4606794571824115162U, - 4602163548591158843U, 4606118860100255153U, - 13829490896955030961U, 4602163548591158843U, - 4605137878724712257U, 4603889326261607894U, - 13827261363116383702U, 4605137878724712257U, - 4591727299969791020U, 4607139046673687846U, - 13830511083528463654U, 4591727299969791020U, - 4607171569234046334U, 4587232218149935124U, - 13810604255004710932U, 4607171569234046334U, - 4604224084862889120U, 4604849113969373103U, - 13828221150824148911U, 4604224084862889120U, - 4606317631232591731U, 4601373767755717824U, - 13824745804610493632U, 4606317631232591731U, - 4599740487990714333U, 4606655894547498725U, - 13830027931402274533U, 4599740487990714333U, - 4606912484326125783U, 4597922303871901467U, - 13821294340726677275U, 4606912484326125783U, - 4602805845399633902U, 4605900952042040894U, - 13829272988896816702U, 4602805845399633902U, - 4605409869824231233U, 4603540801876750389U, - 13826912838731526197U, 4605409869824231233U, - 4594454542771183930U, 4607084929468638487U, - 13830456966323414295U, 4594454542771183930U, - 4607084929468638487U, 4594454542771183930U, - 13817826579625959738U, 4607084929468638487U, - 4603540801876750389U, 4605409869824231233U, - 13828781906679007041U, 4603540801876750389U, - 4605900952042040894U, 4602805845399633902U, - 13826177882254409710U, 4605900952042040894U, - 4597922303871901467U, 4606912484326125783U, - 13830284521180901591U, 4597922303871901467U, - 4606655894547498725U, 4599740487990714333U, - 13823112524845490141U, 4606655894547498725U, - 
4601373767755717824U, 4606317631232591731U, - 13829689668087367539U, 4601373767755717824U, - 4604849113969373103U, 4604224084862889120U, - 13827596121717664928U, 4604849113969373103U, - 4587232218149935124U, 4607171569234046334U, - 13830543606088822142U, 4587232218149935124U, - 4607179706000002317U, 4582730748936808062U, - 13806102785791583870U, 4607179706000002317U, - 4604386048625945823U, 4604698657331085206U, - 13828070694185861014U, 4604386048625945823U, - 4606409688975526202U, 4600971798440897930U, - 13824343835295673738U, 4606409688975526202U, - 4600154912527631775U, 4606578871587619388U, - 13829950908442395196U, 4600154912527631775U, - 4606963563043808649U, 4597061974398750563U, - 13820434011253526371U, 4606963563043808649U, - 4602994049708411683U, 4605784983948558848U, - 13829157020803334656U, 4602994049708411683U, - 4605539368864982914U, 4603361638657888991U, - 13826733675512664799U, 4605539368864982914U, - 4595327571478659014U, 4607049811591515049U, - 13830421848446290857U, 4595327571478659014U, - 4607114680469659603U, 4593485039402578702U, - 13816857076257354510U, 4607114680469659603U, - 4603716733069447353U, 4605276012900672507U, - 13828648049755448315U, 4603716733069447353U, - 4606012266443150634U, 4602550884377336506U, - 13825922921232112314U, 4606012266443150634U, - 4598476289818621559U, 4606856142606846307U, - 13830228179461622115U, 4598476289818621559U, - 4606727809065869586U, 4599322407794599425U, - 13822694444649375233U, 4606727809065869586U, - 4601771097584682078U, 4606220668805321205U, - 13829592705660097013U, 4601771097584682078U, - 4604995550503212910U, 4604058477489546729U, - 13827430514344322537U, 4604995550503212910U, - 4589965306122607094U, 4607158013403433018U, - 13830530050258208826U, 4589965306122607094U, - 4607158013403433018U, 4589965306122607094U, - 13813337342977382902U, 4607158013403433018U, - 4604058477489546729U, 4604995550503212910U, - 13828367587357988718U, 4604058477489546729U, - 4606220668805321205U, 4601771097584682078U, - 
13825143134439457886U, 4606220668805321205U, - 4599322407794599425U, 4606727809065869586U, - 13830099845920645394U, 4599322407794599425U, - 4606856142606846307U, 4598476289818621559U, - 13821848326673397367U, 4606856142606846307U, - 4602550884377336506U, 4606012266443150634U, - 13829384303297926442U, 4602550884377336506U, - 4605276012900672507U, 4603716733069447353U, - 13827088769924223161U, 4605276012900672507U, - 4593485039402578702U, 4607114680469659603U, - 13830486717324435411U, 4593485039402578702U, - 4607049811591515049U, 4595327571478659014U, - 13818699608333434822U, 4607049811591515049U, - 4603361638657888991U, 4605539368864982914U, - 13828911405719758722U, 4603361638657888991U, - 4605784983948558848U, 4602994049708411683U, - 13826366086563187491U, 4605784983948558848U, - 4597061974398750563U, 4606963563043808649U, - 13830335599898584457U, 4597061974398750563U, - 4606578871587619388U, 4600154912527631775U, - 13823526949382407583U, 4606578871587619388U, - 4600971798440897930U, 4606409688975526202U, - 13829781725830302010U, 4600971798440897930U, - 4604698657331085206U, 4604386048625945823U, - 13827758085480721631U, 4604698657331085206U, - 4582730748936808062U, 4607179706000002317U, - 13830551742854778125U, 4582730748936808062U, - 4607181740574479067U, 4578227681973159812U, - 13801599718827935620U, 4607181740574479067U, - 4604465633578481725U, 4604621949701367983U, - 13827993986556143791U, 4604465633578481725U, - 4606453861145241227U, 4600769149537129431U, - 13824141186391905239U, 4606453861145241227U, - 4600360675823176935U, 4606538458821337243U, - 13829910495676113051U, 4600360675823176935U, - 4606987119037722413U, 4596629994023683153U, - 13820002030878458961U, 4606987119037722413U, - 4603087070374583113U, 4605725276488455441U, - 13829097313343231249U, 4603087070374583113U, - 4605602459698789090U, 4603270878689749849U, - 13826642915544525657U, 4605602459698789090U, - 4595762727260045105U, 4607030246558998647U, - 13830402283413774455U, 4595762727260045105U, - 
4607127537664763515U, 4592606767730311893U, - 13815978804585087701U, 4607127537664763515U, - 4603803453461190356U, 4605207475328619533U, - 13828579512183395341U, 4603803453461190356U, - 4606066157444814153U, 4602357870542944470U, - 13825729907397720278U, 4606066157444814153U, - 4598688984595225406U, 4606826008603986804U, - 13830198045458762612U, 4598688984595225406U, - 4606761837001494797U, 4599112075441176914U, - 13822484112295952722U, 4606761837001494797U, - 4601967947786150793U, 4606170366472647579U, - 13829542403327423387U, 4601967947786150793U, - 4605067233569943231U, 4603974338538572089U, - 13827346375393347897U, 4605067233569943231U, - 4590846768565625881U, 4607149205763218185U, - 13830521242617993993U, 4590846768565625881U, - 4607165468267934125U, 4588998070480937184U, - 13812370107335712992U, 4607165468267934125U, - 4604141730443515286U, 4604922840319727473U, - 13828294877174503281U, 4604141730443515286U, - 4606269759522929756U, 4601573027631668967U, - 13824945064486444775U, 4606269759522929756U, - 4599531889160152938U, 4606692493141721470U, - 13830064529996497278U, 4599531889160152938U, - 4606884969294623682U, 4598262871476403630U, - 13821634908331179438U, 4606884969294623682U, - 4602710690099904183U, 4605957195211051218U, - 13829329232065827026U, 4602710690099904183U, - 4605343481119364930U, 4603629178146150899U, - 13827001215000926707U, 4605343481119364930U, - 4594016801320007031U, 4607100477024622401U, - 13830472513879398209U, 4594016801320007031U, - 4607068040143112603U, 4594891488091520602U, - 13818263524946296410U, 4607068040143112603U, - 4603451617570386922U, 4605475169017376660U, - 13828847205872152468U, 4603451617570386922U, - 4605843545406134034U, 4602900303344142735U, - 13826272340198918543U, 4605843545406134034U, - 4597492765973365521U, 4606938683557690074U, - 13830310720412465882U, 4597492765973365521U, - 4606618018794815019U, 4599948172872067014U, - 13823320209726842822U, 4606618018794815019U, - 4601173347964633034U, 4606364276725003740U, - 
13829736313579779548U, 4601173347964633034U, - 4604774382555066977U, 4604305528345395596U, - 13827677565200171404U, 4604774382555066977U, - 4585465300892538317U, 4607176315382986589U, - 13830548352237762397U, 4585465300892538317U, - 4607176315382986589U, 4585465300892538317U, - 13808837337747314125U, 4607176315382986589U, - 4604305528345395596U, 4604774382555066977U, - 13828146419409842785U, 4604305528345395596U, - 4606364276725003740U, 4601173347964633034U, - 13824545384819408842U, 4606364276725003740U, - 4599948172872067014U, 4606618018794815019U, - 13829990055649590827U, 4599948172872067014U, - 4606938683557690074U, 4597492765973365521U, - 13820864802828141329U, 4606938683557690074U, - 4602900303344142735U, 4605843545406134034U, - 13829215582260909842U, 4602900303344142735U, - 4605475169017376660U, 4603451617570386922U, - 13826823654425162730U, 4605475169017376660U, - 4594891488091520602U, 4607068040143112603U, - 13830440076997888411U, 4594891488091520602U, - 4607100477024622401U, 4594016801320007031U, - 13817388838174782839U, 4607100477024622401U, - 4603629178146150899U, 4605343481119364930U, - 13828715517974140738U, 4603629178146150899U, - 4605957195211051218U, 4602710690099904183U, - 13826082726954679991U, 4605957195211051218U, - 4598262871476403630U, 4606884969294623682U, - 13830257006149399490U, 4598262871476403630U, - 4606692493141721470U, 4599531889160152938U, - 13822903926014928746U, 4606692493141721470U, - 4601573027631668967U, 4606269759522929756U, - 13829641796377705564U, 4601573027631668967U, - 4604922840319727473U, 4604141730443515286U, - 13827513767298291094U, 4604922840319727473U, - 4588998070480937184U, 4607165468267934125U, - 13830537505122709933U, 4588998070480937184U, - 4607149205763218185U, 4590846768565625881U, - 13814218805420401689U, 4607149205763218185U, - 4603974338538572089U, 4605067233569943231U, - 13828439270424719039U, 4603974338538572089U, - 4606170366472647579U, 4601967947786150793U, - 13825339984640926601U, 4606170366472647579U, - 
4599112075441176914U, 4606761837001494797U, - 13830133873856270605U, 4599112075441176914U, - 4606826008603986804U, 4598688984595225406U, - 13822061021450001214U, 4606826008603986804U, - 4602357870542944470U, 4606066157444814153U, - 13829438194299589961U, 4602357870542944470U, - 4605207475328619533U, 4603803453461190356U, - 13827175490315966164U, 4605207475328619533U, - 4592606767730311893U, 4607127537664763515U, - 13830499574519539323U, 4592606767730311893U, - 4607030246558998647U, 4595762727260045105U, - 13819134764114820913U, 4607030246558998647U, - 4603270878689749849U, 4605602459698789090U, - 13828974496553564898U, 4603270878689749849U, - 4605725276488455441U, 4603087070374583113U, - 13826459107229358921U, 4605725276488455441U, - 4596629994023683153U, 4606987119037722413U, - 13830359155892498221U, 4596629994023683153U, - 4606538458821337243U, 4600360675823176935U, - 13823732712677952743U, 4606538458821337243U, - 4600769149537129431U, 4606453861145241227U, - 13829825898000017035U, 4600769149537129431U, - 4604621949701367983U, 4604465633578481725U, - 13827837670433257533U, 4604621949701367983U, - 4578227681973159812U, 4607181740574479067U, - 13830553777429254875U, 4578227681973159812U, - 4607182249242036882U, 4573724215515480177U, - 13797096252370255985U, 4607182249242036882U, - 4604505071555817232U, 4604583231088591477U, - 13827955267943367285U, 4604505071555817232U, - 4606475480113671417U, 4600667422348321968U, - 13824039459203097776U, 4606475480113671417U, - 4600463181646572228U, 4606517779747998088U, - 13829889816602773896U, 4600463181646572228U, - 4606998399608725124U, 4596413578358834022U, - 13819785615213609830U, 4606998399608725124U, - 4603133304188877240U, 4605694995810664660U, - 13829067032665440468U, 4603133304188877240U, - 4605633586259814045U, 4603225210076562971U, - 13826597246931338779U, 4605633586259814045U, - 4595979936813835462U, 4607019963775302583U, - 13830392000630078391U, 4595979936813835462U, - 4607133460805585796U, 4592167175087283203U, - 
13815539211942059011U, 4607133460805585796U, - 4603846496621587377U, 4605172808754305228U, - 13828544845609081036U, 4603846496621587377U, - 4606092657816072624U, 4602260871257280788U, - 13825632908112056596U, 4606092657816072624U, - 4598795050632330097U, 4606810452769876110U, - 13830182489624651918U, 4598795050632330097U, - 4606778366364612594U, 4599006600037663623U, - 13822378636892439431U, 4606778366364612594U, - 4602065906208722008U, 4606144763310860551U, - 13829516800165636359U, 4602065906208722008U, - 4605102686554936490U, 4603931940768740167U, - 13827303977623515975U, 4605102686554936490U, - 4591287158938884897U, 4607144295058764886U, - 13830516331913540694U, 4591287158938884897U, - 4607168688050493276U, 4588115294056142819U, - 13811487330910918627U, 4607168688050493276U, - 4604183020748362039U, 4604886103475043762U, - 13828258140329819570U, 4604183020748362039U, - 4606293848208650998U, 4601473544562720001U, - 13824845581417495809U, 4606293848208650998U, - 4599636300858866724U, 4606674353838411301U, - 13830046390693187109U, 4599636300858866724U, - 4606898891031025132U, 4598136582470364665U, - 13821508619325140473U, 4606898891031025132U, - 4602758354025980442U, 4605929219593405673U, - 13829301256448181481U, 4602758354025980442U, - 4605376811039722786U, 4603585091850767959U, - 13826957128705543767U, 4605376811039722786U, - 4594235767444503503U, 4607092871118901179U, - 13830464907973676987U, 4594235767444503503U, - 4607076652372832968U, 4594673119063280916U, - 13818045155918056724U, 4607076652372832968U, - 4603496309891590679U, 4605442656228245717U, - 13828814693083021525U, 4603496309891590679U, - 4605872393621214213U, 4602853162432841185U, - 13826225199287616993U, 4605872393621214213U, - 4597707695679609371U, 4606925748668145757U, - 13830297785522921565U, 4597707695679609371U, - 4606637115963965612U, 4599844446633109139U, - 13823216483487884947U, 4606637115963965612U, - 4601273700967202825U, 4606341107699334546U, - 13829713144554110354U, 4601273700967202825U, - 
4604811873195349477U, 4604264921241055824U, - 13827636958095831632U, 4604811873195349477U, - 4586348876009622851U, 4607174111710118367U, - 13830546148564894175U, 4586348876009622851U, - 4607178180169683960U, 4584498631466405633U, - 13807870668321181441U, 4607178180169683960U, - 4604345904647073908U, 4604736643460027021U, - 13828108680314802829U, 4604345904647073908U, - 4606387137437298591U, 4601072712526242277U, - 13824444749381018085U, 4606387137437298591U, - 4600051662802353687U, 4606598603759044570U, - 13829970640613820378U, 4600051662802353687U, - 4606951288507767453U, 4597277522845151878U, - 13820649559699927686U, 4606951288507767453U, - 4602947266358709886U, 4605814408482919348U, - 13829186445337695156U, 4602947266358709886U, - 4605507406967535927U, 4603406726595779752U, - 13826778763450555560U, 4605507406967535927U, - 4595109641634432498U, 4607059093103722971U, - 13830431129958498779U, 4595109641634432498U, - 4607107746899444102U, 4593797652641645341U, - 13817169689496421149U, 4607107746899444102U, - 4603673059103075106U, 4605309881318010327U, - 13828681918172786135U, 4603673059103075106U, - 4605984877841711338U, 4602646891659203088U, - 13826018928513978896U, 4605984877841711338U, - 4598369669086960528U, 4606870719641066940U, - 13830242756495842748U, 4598369669086960528U, - 4606710311774494716U, 4599427256825614420U, - 13822799293680390228U, 4606710311774494716U, - 4601672213217083403U, 4606245366082353408U, - 13829617402937129216U, 4601672213217083403U, - 4604959323120302796U, 4604100215502905499U, - 13827472252357681307U, 4604959323120302796U, - 4589524267239410099U, 4607161910007591876U, - 13830533946862367684U, 4589524267239410099U, - 4607153778602162496U, 4590406145430462614U, - 13813778182285238422U, 4607153778602162496U, - 4604016517974851588U, 4605031521104517324U, - 13828403557959293132U, 4604016517974851588U, - 4606195668621671667U, 4601869677011524443U, - 13825241713866300251U, 4606195668621671667U, - 4599217346014614711U, 4606744984357082948U, - 
13830117021211858756U, 4599217346014614711U, - 4606841238740778884U, 4598582729657176439U, - 13821954766511952247U, 4606841238740778884U, - 4602454542796181607U, 4606039359984203741U, - 13829411396838979549U, 4602454542796181607U, - 4605241877142478242U, 4603760198400967492U, - 13827132235255743300U, 4605241877142478242U, - 4593046061348462537U, 4607121277474223905U, - 13830493314328999713U, 4593046061348462537U, - 4607040195955932526U, 4595545269419264690U, - 13818917306274040498U, 4607040195955932526U, - 4603316355454250015U, 4605571053506370248U, - 13828943090361146056U, 4603316355454250015U, - 4605755272910869620U, 4603040651631881451U, - 13826412688486657259U, 4605755272910869620U, - 4596846128749438754U, 4606975506703684317U, - 13830347543558460125U, 4596846128749438754U, - 4606558823023444576U, 4600257918160607478U, - 13823629955015383286U, 4606558823023444576U, - 4600870609507958271U, 4606431930490633905U, - 13829803967345409713U, 4600870609507958271U, - 4604660425598397818U, 4604425958770613225U, - 13827797995625389033U, 4604660425598397818U, - 4580962600092897021U, 4607180892816495009U, - 13830552929671270817U, 4580962600092897021U, - 4607180892816495009U, 4580962600092897021U, - 13804334636947672829U, 4607180892816495009U, - 4604425958770613225U, 4604660425598397818U, - 13828032462453173626U, 4604425958770613225U, - 4606431930490633905U, 4600870609507958271U, - 13824242646362734079U, 4606431930490633905U, - 4600257918160607478U, 4606558823023444576U, - 13829930859878220384U, 4600257918160607478U, - 4606975506703684317U, 4596846128749438754U, - 13820218165604214562U, 4606975506703684317U, - 4603040651631881451U, 4605755272910869620U, - 13829127309765645428U, 4603040651631881451U, - 4605571053506370248U, 4603316355454250015U, - 13826688392309025823U, 4605571053506370248U, - 4595545269419264690U, 4607040195955932526U, - 13830412232810708334U, 4595545269419264690U, - 4607121277474223905U, 4593046061348462537U, - 13816418098203238345U, 4607121277474223905U, - 
4603760198400967492U, 4605241877142478242U, - 13828613913997254050U, 4603760198400967492U, - 4606039359984203741U, 4602454542796181607U, - 13825826579650957415U, 4606039359984203741U, - 4598582729657176439U, 4606841238740778884U, - 13830213275595554692U, 4598582729657176439U, - 4606744984357082948U, 4599217346014614711U, - 13822589382869390519U, 4606744984357082948U, - 4601869677011524443U, 4606195668621671667U, - 13829567705476447475U, 4601869677011524443U, - 4605031521104517324U, 4604016517974851588U, - 13827388554829627396U, 4605031521104517324U, - 4590406145430462614U, 4607153778602162496U, - 13830525815456938304U, 4590406145430462614U, - 4607161910007591876U, 4589524267239410099U, - 13812896304094185907U, 4607161910007591876U, - 4604100215502905499U, 4604959323120302796U, - 13828331359975078604U, 4604100215502905499U, - 4606245366082353408U, 4601672213217083403U, - 13825044250071859211U, 4606245366082353408U, - 4599427256825614420U, 4606710311774494716U, - 13830082348629270524U, 4599427256825614420U, - 4606870719641066940U, 4598369669086960528U, - 13821741705941736336U, 4606870719641066940U, - 4602646891659203088U, 4605984877841711338U, - 13829356914696487146U, 4602646891659203088U, - 4605309881318010327U, 4603673059103075106U, - 13827045095957850914U, 4605309881318010327U, - 4593797652641645341U, 4607107746899444102U, - 13830479783754219910U, 4593797652641645341U, - 4607059093103722971U, 4595109641634432498U, - 13818481678489208306U, 4607059093103722971U, - 4603406726595779752U, 4605507406967535927U, - 13828879443822311735U, 4603406726595779752U, - 4605814408482919348U, 4602947266358709886U, - 13826319303213485694U, 4605814408482919348U, - 4597277522845151878U, 4606951288507767453U, - 13830323325362543261U, 4597277522845151878U, - 4606598603759044570U, 4600051662802353687U, - 13823423699657129495U, 4606598603759044570U, - 4601072712526242277U, 4606387137437298591U, - 13829759174292074399U, 4601072712526242277U, - 4604736643460027021U, 4604345904647073908U, - 
13827717941501849716U, 4604736643460027021U, - 4584498631466405633U, 4607178180169683960U, - 13830550217024459768U, 4584498631466405633U, - 4607174111710118367U, 4586348876009622851U, - 13809720912864398659U, 4607174111710118367U, - 4604264921241055824U, 4604811873195349477U, - 13828183910050125285U, 4604264921241055824U, - 4606341107699334546U, 4601273700967202825U, - 13824645737821978633U, 4606341107699334546U, - 4599844446633109139U, 4606637115963965612U, - 13830009152818741420U, 4599844446633109139U, - 4606925748668145757U, 4597707695679609371U, - 13821079732534385179U, 4606925748668145757U, - 4602853162432841185U, 4605872393621214213U, - 13829244430475990021U, 4602853162432841185U, - 4605442656228245717U, 4603496309891590679U, - 13826868346746366487U, 4605442656228245717U, - 4594673119063280916U, 4607076652372832968U, - 13830448689227608776U, 4594673119063280916U, - 4607092871118901179U, 4594235767444503503U, - 13817607804299279311U, 4607092871118901179U, - 4603585091850767959U, 4605376811039722786U, - 13828748847894498594U, 4603585091850767959U, - 4605929219593405673U, 4602758354025980442U, - 13826130390880756250U, 4605929219593405673U, - 4598136582470364665U, 4606898891031025132U, - 13830270927885800940U, 4598136582470364665U, - 4606674353838411301U, 4599636300858866724U, - 13823008337713642532U, 4606674353838411301U, - 4601473544562720001U, 4606293848208650998U, - 13829665885063426806U, 4601473544562720001U, - 4604886103475043762U, 4604183020748362039U, - 13827555057603137847U, 4604886103475043762U, - 4588115294056142819U, 4607168688050493276U, - 13830540724905269084U, 4588115294056142819U, - 4607144295058764886U, 4591287158938884897U, - 13814659195793660705U, 4607144295058764886U, - 4603931940768740167U, 4605102686554936490U, - 13828474723409712298U, 4603931940768740167U, - 4606144763310860551U, 4602065906208722008U, - 13825437943063497816U, 4606144763310860551U, - 4599006600037663623U, 4606778366364612594U, - 13830150403219388402U, 4599006600037663623U, - 
4606810452769876110U, 4598795050632330097U, - 13822167087487105905U, 4606810452769876110U, - 4602260871257280788U, 4606092657816072624U, - 13829464694670848432U, 4602260871257280788U, - 4605172808754305228U, 4603846496621587377U, - 13827218533476363185U, 4605172808754305228U, - 4592167175087283203U, 4607133460805585796U, - 13830505497660361604U, 4592167175087283203U, - 4607019963775302583U, 4595979936813835462U, - 13819351973668611270U, 4607019963775302583U, - 4603225210076562971U, 4605633586259814045U, - 13829005623114589853U, 4603225210076562971U, - 4605694995810664660U, 4603133304188877240U, - 13826505341043653048U, 4605694995810664660U, - 4596413578358834022U, 4606998399608725124U, - 13830370436463500932U, 4596413578358834022U, - 4606517779747998088U, 4600463181646572228U, - 13823835218501348036U, 4606517779747998088U, - 4600667422348321968U, 4606475480113671417U, - 13829847516968447225U, 4600667422348321968U, - 4604583231088591477U, 4604505071555817232U, - 13827877108410593040U, 4604583231088591477U, - 4573724215515480177U, 4607182249242036882U, - 13830554286096812690U, 4573724215515480177U, - 4607182376410422530U, 4569220649180767418U, - 13792592686035543226U, 4607182376410422530U, - 4604524701268679793U, 4604563781218984604U, - 13827935818073760412U, 4604524701268679793U, - 4606486172460753999U, 4600616459743653188U, - 13823988496598428996U, 4606486172460753999U, - 4600514338912178239U, 4606507322377452870U, - 13829879359232228678U, 4600514338912178239U, - 4607003915349878877U, 4596305267720071930U, - 13819677304574847738U, 4607003915349878877U, - 4603156351203636159U, 4605679749231851918U, - 13829051786086627726U, 4603156351203636159U, - 4605649044311923410U, 4603202304363743346U, - 13826574341218519154U, 4605649044311923410U, - 4596088445927168004U, 4607014697483910382U, - 13830386734338686190U, 4596088445927168004U, - 4607136295912168606U, 4591947271803021404U, - 13815319308657797212U, 4607136295912168606U, - 4603867938232615808U, 4605155376589456981U, - 
13828527413444232789U, 4603867938232615808U, - 4606105796280968177U, 4602212250118051877U, - 13825584286972827685U, 4606105796280968177U, - 4598848011564831930U, 4606802552898869248U, - 13830174589753645056U, 4598848011564831930U, - 4606786509620734768U, 4598953786765296928U, - 13822325823620072736U, 4606786509620734768U, - 4602114767134999006U, 4606131849150971908U, - 13829503886005747716U, 4602114767134999006U, - 4605120315324767624U, 4603910660507251362U, - 13827282697362027170U, 4605120315324767624U, - 4591507261658050721U, 4607141713064252300U, - 13830513749919028108U, 4591507261658050721U, - 4607170170974224083U, 4587673791460508439U, - 13811045828315284247U, 4607170170974224083U, - 4604203581176243359U, 4604867640218014515U, - 13828239677072790323U, 4604203581176243359U, - 4606305777984577632U, 4601423692641949331U, - 13824795729496725139U, 4606305777984577632U, - 4599688422741010356U, 4606665164148251002U, - 13830037201003026810U, 4599688422741010356U, - 4606905728766014348U, 4598029484874872834U, - 13821401521729648642U, 4606905728766014348U, - 4602782121393764535U, 4605915122243179241U, - 13829287159097955049U, 4602782121393764535U, - 4605393374401988274U, 4603562972219549215U, - 13826935009074325023U, 4605393374401988274U, - 4594345179472540681U, 4607088942243446236U, - 13830460979098222044U, 4594345179472540681U, - 4607080832832247697U, 4594563856311064231U, - 13817935893165840039U, 4607080832832247697U, - 4603518581031047189U, 4605426297151190466U, - 13828798334005966274U, 4603518581031047189U, - 4605886709123365959U, 4602829525820289164U, - 13826201562675064972U, 4605886709123365959U, - 4597815040470278984U, 4606919157647773535U, - 13830291194502549343U, 4597815040470278984U, - 4606646545123403481U, 4599792496117920694U, - 13823164532972696502U, 4606646545123403481U, - 4601323770373937522U, 4606329407841126011U, - 13829701444695901819U, 4601323770373937522U, - 4604830524903495634U, 4604244531615310815U, - 13827616568470086623U, 4604830524903495634U, - 
4586790578280679046U, 4607172882816799076U, - 13830544919671574884U, 4586790578280679046U, - 4607178985458280057U, 4583614727651146525U, - 13806986764505922333U, 4607178985458280057U, - 4604366005771528720U, 4604717681185626434U, - 13828089718040402242U, 4604366005771528720U, - 4606398451906509788U, 4601022290077223616U, - 13824394326931999424U, 4606398451906509788U, - 4600103317933788342U, 4606588777269136769U, - 13829960814123912577U, 4600103317933788342U, - 4606957467106717424U, 4597169786279785693U, - 13820541823134561501U, 4606957467106717424U, - 4602970680601913687U, 4605799732098147061U, - 13829171768952922869U, 4602970680601913687U, - 4605523422498301790U, 4603384207141321914U, - 13826756243996097722U, 4605523422498301790U, - 4595218635031890910U, 4607054494135176056U, - 13830426530989951864U, 4595218635031890910U, - 4607111255739239816U, 4593688012422887515U, - 13817060049277663323U, 4607111255739239816U, - 4603694922063032361U, 4605292980606880364U, - 13828665017461656172U, 4603694922063032361U, - 4605998608960791335U, 4602598930031891166U, - 13825970966886666974U, 4605998608960791335U, - 4598423001813699022U, 4606863472012527185U, - 13830235508867302993U, 4598423001813699022U, - 4606719100629313491U, 4599374859150636784U, - 13822746896005412592U, 4606719100629313491U, - 4601721693286060937U, 4606233055365547081U, - 13829605092220322889U, 4601721693286060937U, - 4604977468824438271U, 4604079374282302598U, - 13827451411137078406U, 4604977468824438271U, - 4589744810590291021U, 4607160003989618959U, - 13830532040844394767U, 4589744810590291021U, - 4607155938267770208U, 4590185751760970393U, - 13813557788615746201U, 4607155938267770208U, - 4604037525321326463U, 4605013567986435066U, - 13828385604841210874U, 4604037525321326463U, - 4606208206518262803U, 4601820425647934753U, - 13825192462502710561U, 4606208206518262803U, - 4599269903251194481U, 4606736437002195879U, - 13830108473856971687U, 4599269903251194481U, - 4606848731493011465U, 4598529532600161144U, - 
13821901569454936952U, 4606848731493011465U, - 4602502755147763107U, 4606025850160239809U, - 13829397887015015617U, 4602502755147763107U, - 4605258978359093269U, 4603738491917026584U, - 13827110528771802392U, 4605258978359093269U, - 4593265590854265407U, 4607118021058468598U, - 13830490057913244406U, 4593265590854265407U, - 4607045045516813836U, 4595436449949385485U, - 13818808486804161293U, 4607045045516813836U, - 4603339021357904144U, 4605555245917486022U, - 13828927282772261830U, 4603339021357904144U, - 4605770164172969910U, 4603017373458244943U, - 13826389410313020751U, 4605770164172969910U, - 4596954088216812973U, 4606969576261663845U, - 13830341613116439653U, 4596954088216812973U, - 4606568886807728474U, 4600206446098256018U, - 13823578482953031826U, 4606568886807728474U, - 4600921238092511730U, 4606420848538580260U, - 13829792885393356068U, 4600921238092511730U, - 4604679572075463103U, 4604406033021674239U, - 13827778069876450047U, 4604679572075463103U, - 4581846703643734566U, 4607180341788068727U, - 13830552378642844535U, 4581846703643734566U, - 4607181359080094673U, 4579996072175835083U, - 13803368109030610891U, 4607181359080094673U, - 4604445825685214043U, 4604641218080103285U, - 13828013254934879093U, 4604445825685214043U, - 4606442934727379583U, 4600819913163773071U, - 13824191950018548879U, 4606442934727379583U, - 4600309328230211502U, 4606548680329491866U, - 13829920717184267674U, 4600309328230211502U, - 4606981354314050484U, 4596738097012783531U, - 13820110133867559339U, 4606981354314050484U, - 4603063884010218172U, 4605740310302420207U, - 13829112347157196015U, 4603063884010218172U, - 4605586791482848547U, 4603293641160266722U, - 13826665678015042530U, 4605586791482848547U, - 4595654028864046335U, 4607035262954517034U, - 13830407299809292842U, 4595654028864046335U, - 4607124449686274900U, 4592826452951465409U, - 13816198489806241217U, 4607124449686274900U, - 4603781852316960384U, 4605224709411790590U, - 13828596746266566398U, 4603781852316960384U, - 
4606052795787882823U, 4602406247776385022U, - 13825778284631160830U, 4606052795787882823U, - 4598635880488956483U, 4606833664420673202U, - 13830205701275449010U, 4598635880488956483U, - 4606753451050079834U, 4599164736579548843U, - 13822536773434324651U, 4606753451050079834U, - 4601918851211878557U, 4606183055233559255U, - 13829555092088335063U, 4601918851211878557U, - 4605049409688478101U, 4603995455647851249U, - 13827367492502627057U, 4605049409688478101U, - 4590626485056654602U, 4607151534426937478U, - 13830523571281713286U, 4590626485056654602U, - 4607163731439411601U, 4589303678145802340U, - 13812675715000578148U, 4607163731439411601U, - 4604121000955189926U, 4604941113561600762U, - 13828313150416376570U, 4604121000955189926U, - 4606257600839867033U, 4601622657843474729U, - 13824994694698250537U, 4606257600839867033U, - 4599479600326345459U, 4606701442584137310U, - 13830073479438913118U, 4599479600326345459U, - 4606877885424248132U, 4598316292140394014U, - 13821688328995169822U, 4606877885424248132U, - 4602686793990243041U, 4605971073215153165U, - 13829343110069928973U, 4602686793990243041U, - 4605326714874986465U, 4603651144395358093U, - 13827023181250133901U, 4605326714874986465U, - 4593907249284540294U, 4607104153983298999U, - 13830476190838074807U, 4593907249284540294U, - 4607063608453868552U, 4595000592312171144U, - 13818372629166946952U, 4607063608453868552U, - 4603429196809300824U, 4605491322423429598U, - 13828863359278205406U, 4603429196809300824U, - 4605829012964735987U, 4602923807199184054U, - 13826295844053959862U, 4605829012964735987U, - 4597385183080791534U, 4606945027305114062U, - 13830317064159889870U, 4597385183080791534U, - 4606608350964852124U, 4599999947619525579U, - 13823371984474301387U, 4606608350964852124U, - 4601123065313358619U, 4606375745674388705U, - 13829747782529164513U, 4601123065313358619U, - 4604755543975806820U, 4604325745441780828U, - 13827697782296556636U, 4604755543975806820U, - 4585023436363055487U, 4607177290141793710U, - 
13830549326996569518U, 4585023436363055487U, - 4607175255902437396U, 4585907115494236537U, - 13809279152349012345U, 4607175255902437396U, - 4604285253548209224U, 4604793159020491611U, - 13828165195875267419U, 4604285253548209224U, - 4606352730697093817U, 4601223560006786057U, - 13824595596861561865U, 4606352730697093817U, - 4599896339047301634U, 4606627607157935956U, - 13829999644012711764U, 4599896339047301634U, - 4606932257325205256U, 4597600270510262682U, - 13820972307365038490U, 4606932257325205256U, - 4602876755014813164U, 4605858005670328613U, - 13829230042525104421U, 4602876755014813164U, - 4605458946901419122U, 4603473988668005304U, - 13826846025522781112U, 4605458946901419122U, - 4594782329999411347U, 4607072388129742377U, - 13830444424984518185U, 4594782329999411347U, - 4607096716058023245U, 4594126307716900071U, - 13817498344571675879U, 4607096716058023245U, - 4603607160562208225U, 4605360179893335444U, - 13828732216748111252U, 4603607160562208225U, - 4605943243960030558U, 4602734543519989142U, - 13826106580374764950U, 4605943243960030558U, - 4598209407597805010U, 4606891971185517504U, - 13830264008040293312U, 4598209407597805010U, - 4606683463531482757U, 4599584122834874440U, - 13822956159689650248U, 4606683463531482757U, - 4601523323048804569U, 4606281842017099424U, - 13829653878871875232U, 4601523323048804569U, - 4604904503566677638U, 4604162403772767740U, - 13827534440627543548U, 4604904503566677638U, - 4588556721781247689U, 4607167120476811757U, - 13830539157331587565U, 4588556721781247689U, - 4607146792632922887U, 4591066993883984169U, - 13814439030738759977U, 4607146792632922887U, - 4603953166845776383U, 4605084992581147553U, - 13828457029435923361U, 4603953166845776383U, - 4606157602458368090U, 4602016966272225497U, - 13825389003127001305U, 4606157602458368090U, - 4599059363095165615U, 4606770142132396069U, - 13830142178987171877U, 4599059363095165615U, - 4606818271362779153U, 4598742041476147134U, - 13822114078330922942U, 4606818271362779153U, - 
4602309411551204896U, 4606079444829232727U, - 13829451481684008535U, 4602309411551204896U, - 4605190175055178825U, 4603825001630339212U, - 13827197038485115020U, 4605190175055178825U, - 4592387007752762956U, 4607130541380624519U, - 13830502578235400327U, 4592387007752762956U, - 4607025146816593591U, 4595871363584150300U, - 13819243400438926108U, 4607025146816593591U, - 4603248068256948438U, 4605618058006716661U, - 13828990094861492469U, 4603248068256948438U, - 4605710171610479304U, 4603110210506737381U, - 13826482247361513189U, 4605710171610479304U, - 4596521820799644122U, 4606992800820440327U, - 13830364837675216135U, 4596521820799644122U, - 4606528158595189433U, 4600411960456200676U, - 13823783997310976484U, 4606528158595189433U, - 4600718319105833937U, 4606464709641375231U, - 13829836746496151039U, 4600718319105833937U, - 4604602620643553229U, 4604485382263976838U, - 13827857419118752646U, 4604602620643553229U, - 4576459225186735875U, 4607182037296057423U, - 13830554074150833231U, 4576459225186735875U, - 4607182037296057423U, 4576459225186735875U, - 13799831262041511683U, 4607182037296057423U, - 4604485382263976838U, 4604602620643553229U, - 13827974657498329037U, 4604485382263976838U, - 4606464709641375231U, 4600718319105833937U, - 13824090355960609745U, 4606464709641375231U, - 4600411960456200676U, 4606528158595189433U, - 13829900195449965241U, 4600411960456200676U, - 4606992800820440327U, 4596521820799644122U, - 13819893857654419930U, 4606992800820440327U, - 4603110210506737381U, 4605710171610479304U, - 13829082208465255112U, 4603110210506737381U, - 4605618058006716661U, 4603248068256948438U, - 13826620105111724246U, 4605618058006716661U, - 4595871363584150300U, 4607025146816593591U, - 13830397183671369399U, 4595871363584150300U, - 4607130541380624519U, 4592387007752762956U, - 13815759044607538764U, 4607130541380624519U, - 4603825001630339212U, 4605190175055178825U, - 13828562211909954633U, 4603825001630339212U, - 4606079444829232727U, 4602309411551204896U, - 
13825681448405980704U, 4606079444829232727U, - 4598742041476147134U, 4606818271362779153U, - 13830190308217554961U, 4598742041476147134U, - 4606770142132396069U, 4599059363095165615U, - 13822431399949941423U, 4606770142132396069U, - 4602016966272225497U, 4606157602458368090U, - 13829529639313143898U, 4602016966272225497U, - 4605084992581147553U, 4603953166845776383U, - 13827325203700552191U, 4605084992581147553U, - 4591066993883984169U, 4607146792632922887U, - 13830518829487698695U, 4591066993883984169U, - 4607167120476811757U, 4588556721781247689U, - 13811928758636023497U, 4607167120476811757U, - 4604162403772767740U, 4604904503566677638U, - 13828276540421453446U, 4604162403772767740U, - 4606281842017099424U, 4601523323048804569U, - 13824895359903580377U, 4606281842017099424U, - 4599584122834874440U, 4606683463531482757U, - 13830055500386258565U, 4599584122834874440U, - 4606891971185517504U, 4598209407597805010U, - 13821581444452580818U, 4606891971185517504U, - 4602734543519989142U, 4605943243960030558U, - 13829315280814806366U, 4602734543519989142U, - 4605360179893335444U, 4603607160562208225U, - 13826979197416984033U, 4605360179893335444U, - 4594126307716900071U, 4607096716058023245U, - 13830468752912799053U, 4594126307716900071U, - 4607072388129742377U, 4594782329999411347U, - 13818154366854187155U, 4607072388129742377U, - 4603473988668005304U, 4605458946901419122U, - 13828830983756194930U, 4603473988668005304U, - 4605858005670328613U, 4602876755014813164U, - 13826248791869588972U, 4605858005670328613U, - 4597600270510262682U, 4606932257325205256U, - 13830304294179981064U, 4597600270510262682U, - 4606627607157935956U, 4599896339047301634U, - 13823268375902077442U, 4606627607157935956U, - 4601223560006786057U, 4606352730697093817U, - 13829724767551869625U, 4601223560006786057U, - 4604793159020491611U, 4604285253548209224U, - 13827657290402985032U, 4604793159020491611U, - 4585907115494236537U, 4607175255902437396U, - 13830547292757213204U, 4585907115494236537U, - 
4607177290141793710U, 4585023436363055487U, - 13808395473217831295U, 4607177290141793710U, - 4604325745441780828U, 4604755543975806820U, - 13828127580830582628U, 4604325745441780828U, - 4606375745674388705U, 4601123065313358619U, - 13824495102168134427U, 4606375745674388705U, - 4599999947619525579U, 4606608350964852124U, - 13829980387819627932U, 4599999947619525579U, - 4606945027305114062U, 4597385183080791534U, - 13820757219935567342U, 4606945027305114062U, - 4602923807199184054U, 4605829012964735987U, - 13829201049819511795U, 4602923807199184054U, - 4605491322423429598U, 4603429196809300824U, - 13826801233664076632U, 4605491322423429598U, - 4595000592312171144U, 4607063608453868552U, - 13830435645308644360U, 4595000592312171144U, - 4607104153983298999U, 4593907249284540294U, - 13817279286139316102U, 4607104153983298999U, - 4603651144395358093U, 4605326714874986465U, - 13828698751729762273U, 4603651144395358093U, - 4605971073215153165U, 4602686793990243041U, - 13826058830845018849U, 4605971073215153165U, - 4598316292140394014U, 4606877885424248132U, - 13830249922279023940U, 4598316292140394014U, - 4606701442584137310U, 4599479600326345459U, - 13822851637181121267U, 4606701442584137310U, - 4601622657843474729U, 4606257600839867033U, - 13829629637694642841U, 4601622657843474729U, - 4604941113561600762U, 4604121000955189926U, - 13827493037809965734U, 4604941113561600762U, - 4589303678145802340U, 4607163731439411601U, - 13830535768294187409U, 4589303678145802340U, - 4607151534426937478U, 4590626485056654602U, - 13813998521911430410U, 4607151534426937478U, - 4603995455647851249U, 4605049409688478101U, - 13828421446543253909U, 4603995455647851249U, - 4606183055233559255U, 4601918851211878557U, - 13825290888066654365U, 4606183055233559255U, - 4599164736579548843U, 4606753451050079834U, - 13830125487904855642U, 4599164736579548843U, - 4606833664420673202U, 4598635880488956483U, - 13822007917343732291U, 4606833664420673202U, - 4602406247776385022U, 4606052795787882823U, - 
13829424832642658631U, 4602406247776385022U, - 4605224709411790590U, 4603781852316960384U, - 13827153889171736192U, 4605224709411790590U, - 4592826452951465409U, 4607124449686274900U, - 13830496486541050708U, 4592826452951465409U, - 4607035262954517034U, 4595654028864046335U, - 13819026065718822143U, 4607035262954517034U, - 4603293641160266722U, 4605586791482848547U, - 13828958828337624355U, 4603293641160266722U, - 4605740310302420207U, 4603063884010218172U, - 13826435920864993980U, 4605740310302420207U, - 4596738097012783531U, 4606981354314050484U, - 13830353391168826292U, 4596738097012783531U, - 4606548680329491866U, 4600309328230211502U, - 13823681365084987310U, 4606548680329491866U, - 4600819913163773071U, 4606442934727379583U, - 13829814971582155391U, 4600819913163773071U, - 4604641218080103285U, 4604445825685214043U, - 13827817862539989851U, 4604641218080103285U, - 4579996072175835083U, 4607181359080094673U, - 13830553395934870481U, 4579996072175835083U, - 4607180341788068727U, 4581846703643734566U, - 13805218740498510374U, 4607180341788068727U, - 4604406033021674239U, 4604679572075463103U, - 13828051608930238911U, 4604406033021674239U, - 4606420848538580260U, 4600921238092511730U, - 13824293274947287538U, 4606420848538580260U, - 4600206446098256018U, 4606568886807728474U, - 13829940923662504282U, 4600206446098256018U, - 4606969576261663845U, 4596954088216812973U, - 13820326125071588781U, 4606969576261663845U, - 4603017373458244943U, 4605770164172969910U, - 13829142201027745718U, 4603017373458244943U, - 4605555245917486022U, 4603339021357904144U, - 13826711058212679952U, 4605555245917486022U, - 4595436449949385485U, 4607045045516813836U, - 13830417082371589644U, 4595436449949385485U, - 4607118021058468598U, 4593265590854265407U, - 13816637627709041215U, 4607118021058468598U, - 4603738491917026584U, 4605258978359093269U, - 13828631015213869077U, 4603738491917026584U, - 4606025850160239809U, 4602502755147763107U, - 13825874792002538915U, 4606025850160239809U, - 
4598529532600161144U, 4606848731493011465U, - 13830220768347787273U, 4598529532600161144U, - 4606736437002195879U, 4599269903251194481U, - 13822641940105970289U, 4606736437002195879U, - 4601820425647934753U, 4606208206518262803U, - 13829580243373038611U, 4601820425647934753U, - 4605013567986435066U, 4604037525321326463U, - 13827409562176102271U, 4605013567986435066U, - 4590185751760970393U, 4607155938267770208U, - 13830527975122546016U, 4590185751760970393U, - 4607160003989618959U, 4589744810590291021U, - 13813116847445066829U, 4607160003989618959U, - 4604079374282302598U, 4604977468824438271U, - 13828349505679214079U, 4604079374282302598U, - 4606233055365547081U, 4601721693286060937U, - 13825093730140836745U, 4606233055365547081U, - 4599374859150636784U, 4606719100629313491U, - 13830091137484089299U, 4599374859150636784U, - 4606863472012527185U, 4598423001813699022U, - 13821795038668474830U, 4606863472012527185U, - 4602598930031891166U, 4605998608960791335U, - 13829370645815567143U, 4602598930031891166U, - 4605292980606880364U, 4603694922063032361U, - 13827066958917808169U, 4605292980606880364U, - 4593688012422887515U, 4607111255739239816U, - 13830483292594015624U, 4593688012422887515U, - 4607054494135176056U, 4595218635031890910U, - 13818590671886666718U, 4607054494135176056U, - 4603384207141321914U, 4605523422498301790U, - 13828895459353077598U, 4603384207141321914U, - 4605799732098147061U, 4602970680601913687U, - 13826342717456689495U, 4605799732098147061U, - 4597169786279785693U, 4606957467106717424U, - 13830329503961493232U, 4597169786279785693U, - 4606588777269136769U, 4600103317933788342U, - 13823475354788564150U, 4606588777269136769U, - 4601022290077223616U, 4606398451906509788U, - 13829770488761285596U, 4601022290077223616U, - 4604717681185626434U, 4604366005771528720U, - 13827738042626304528U, 4604717681185626434U, - 4583614727651146525U, 4607178985458280057U, - 13830551022313055865U, 4583614727651146525U, - 4607172882816799076U, 4586790578280679046U, - 
13810162615135454854U, 4607172882816799076U, - 4604244531615310815U, 4604830524903495634U, - 13828202561758271442U, 4604244531615310815U, - 4606329407841126011U, 4601323770373937522U, - 13824695807228713330U, 4606329407841126011U, - 4599792496117920694U, 4606646545123403481U, - 13830018581978179289U, 4599792496117920694U, - 4606919157647773535U, 4597815040470278984U, - 13821187077325054792U, 4606919157647773535U, - 4602829525820289164U, 4605886709123365959U, - 13829258745978141767U, 4602829525820289164U, - 4605426297151190466U, 4603518581031047189U, - 13826890617885822997U, 4605426297151190466U, - 4594563856311064231U, 4607080832832247697U, - 13830452869687023505U, 4594563856311064231U, - 4607088942243446236U, 4594345179472540681U, - 13817717216327316489U, 4607088942243446236U, - 4603562972219549215U, 4605393374401988274U, - 13828765411256764082U, 4603562972219549215U, - 4605915122243179241U, 4602782121393764535U, - 13826154158248540343U, 4605915122243179241U, - 4598029484874872834U, 4606905728766014348U, - 13830277765620790156U, 4598029484874872834U, - 4606665164148251002U, 4599688422741010356U, - 13823060459595786164U, 4606665164148251002U, - 4601423692641949331U, 4606305777984577632U, - 13829677814839353440U, 4601423692641949331U, - 4604867640218014515U, 4604203581176243359U, - 13827575618031019167U, 4604867640218014515U, - 4587673791460508439U, 4607170170974224083U, - 13830542207828999891U, 4587673791460508439U, - 4607141713064252300U, 4591507261658050721U, - 13814879298512826529U, 4607141713064252300U, - 4603910660507251362U, 4605120315324767624U, - 13828492352179543432U, 4603910660507251362U, - 4606131849150971908U, 4602114767134999006U, - 13825486803989774814U, 4606131849150971908U, - 4598953786765296928U, 4606786509620734768U, - 13830158546475510576U, 4598953786765296928U, - 4606802552898869248U, 4598848011564831930U, - 13822220048419607738U, 4606802552898869248U, - 4602212250118051877U, 4606105796280968177U, - 13829477833135743985U, 4602212250118051877U, - 
4605155376589456981U, 4603867938232615808U, - 13827239975087391616U, 4605155376589456981U, - 4591947271803021404U, 4607136295912168606U, - 13830508332766944414U, 4591947271803021404U, - 4607014697483910382U, 4596088445927168004U, - 13819460482781943812U, 4607014697483910382U, - 4603202304363743346U, 4605649044311923410U, - 13829021081166699218U, 4603202304363743346U, - 4605679749231851918U, 4603156351203636159U, - 13826528388058411967U, 4605679749231851918U, - 4596305267720071930U, 4607003915349878877U, - 13830375952204654685U, 4596305267720071930U, - 4606507322377452870U, 4600514338912178239U, - 13823886375766954047U, 4606507322377452870U, - 4600616459743653188U, 4606486172460753999U, - 13829858209315529807U, 4600616459743653188U, - 4604563781218984604U, 4604524701268679793U, - 13827896738123455601U, 4604563781218984604U, - 4569220649180767418U, 4607182376410422530U, - 13830554413265198338U, 4569220649180767418U -}; - -const fpr fpr_p2_tab[] = { - 4611686018427387904U, - 4607182418800017408U, - 4602678819172646912U, - 4598175219545276416U, - 4593671619917905920U, - 4589168020290535424U, - 4584664420663164928U, - 4580160821035794432U, - 4575657221408423936U, - 4571153621781053440U, - 4566650022153682944U -}; - -#elif FALCON_FPNATIVE // yyyFPEMU+0 yyyFPNATIVE+1 - -const fpr fpr_gm_tab[] = { - {0}, {0}, /* unused */ - {-0.000000000000000000000000000}, { 1.000000000000000000000000000}, - { 0.707106781186547524400844362}, { 0.707106781186547524400844362}, - {-0.707106781186547524400844362}, { 0.707106781186547524400844362}, - { 0.923879532511286756128183189}, { 0.382683432365089771728459984}, - {-0.382683432365089771728459984}, { 0.923879532511286756128183189}, - { 0.382683432365089771728459984}, { 0.923879532511286756128183189}, - {-0.923879532511286756128183189}, { 0.382683432365089771728459984}, - { 0.980785280403230449126182236}, { 0.195090322016128267848284868}, - {-0.195090322016128267848284868}, { 0.980785280403230449126182236}, - { 
0.555570233019602224742830814}, { 0.831469612302545237078788378}, - {-0.831469612302545237078788378}, { 0.555570233019602224742830814}, - { 0.831469612302545237078788378}, { 0.555570233019602224742830814}, - {-0.555570233019602224742830814}, { 0.831469612302545237078788378}, - { 0.195090322016128267848284868}, { 0.980785280403230449126182236}, - {-0.980785280403230449126182236}, { 0.195090322016128267848284868}, - { 0.995184726672196886244836953}, { 0.098017140329560601994195564}, - {-0.098017140329560601994195564}, { 0.995184726672196886244836953}, - { 0.634393284163645498215171613}, { 0.773010453362736960810906610}, - {-0.773010453362736960810906610}, { 0.634393284163645498215171613}, - { 0.881921264348355029712756864}, { 0.471396736825997648556387626}, - {-0.471396736825997648556387626}, { 0.881921264348355029712756864}, - { 0.290284677254462367636192376}, { 0.956940335732208864935797887}, - {-0.956940335732208864935797887}, { 0.290284677254462367636192376}, - { 0.956940335732208864935797887}, { 0.290284677254462367636192376}, - {-0.290284677254462367636192376}, { 0.956940335732208864935797887}, - { 0.471396736825997648556387626}, { 0.881921264348355029712756864}, - {-0.881921264348355029712756864}, { 0.471396736825997648556387626}, - { 0.773010453362736960810906610}, { 0.634393284163645498215171613}, - {-0.634393284163645498215171613}, { 0.773010453362736960810906610}, - { 0.098017140329560601994195564}, { 0.995184726672196886244836953}, - {-0.995184726672196886244836953}, { 0.098017140329560601994195564}, - { 0.998795456205172392714771605}, { 0.049067674327418014254954977}, - {-0.049067674327418014254954977}, { 0.998795456205172392714771605}, - { 0.671558954847018400625376850}, { 0.740951125354959091175616897}, - {-0.740951125354959091175616897}, { 0.671558954847018400625376850}, - { 0.903989293123443331586200297}, { 0.427555093430282094320966857}, - {-0.427555093430282094320966857}, { 0.903989293123443331586200297}, - { 0.336889853392220050689253213}, { 
0.941544065183020778412509403}, - {-0.941544065183020778412509403}, { 0.336889853392220050689253213}, - { 0.970031253194543992603984207}, { 0.242980179903263889948274162}, - {-0.242980179903263889948274162}, { 0.970031253194543992603984207}, - { 0.514102744193221726593693839}, { 0.857728610000272069902269984}, - {-0.857728610000272069902269984}, { 0.514102744193221726593693839}, - { 0.803207531480644909806676513}, { 0.595699304492433343467036529}, - {-0.595699304492433343467036529}, { 0.803207531480644909806676513}, - { 0.146730474455361751658850130}, { 0.989176509964780973451673738}, - {-0.989176509964780973451673738}, { 0.146730474455361751658850130}, - { 0.989176509964780973451673738}, { 0.146730474455361751658850130}, - {-0.146730474455361751658850130}, { 0.989176509964780973451673738}, - { 0.595699304492433343467036529}, { 0.803207531480644909806676513}, - {-0.803207531480644909806676513}, { 0.595699304492433343467036529}, - { 0.857728610000272069902269984}, { 0.514102744193221726593693839}, - {-0.514102744193221726593693839}, { 0.857728610000272069902269984}, - { 0.242980179903263889948274162}, { 0.970031253194543992603984207}, - {-0.970031253194543992603984207}, { 0.242980179903263889948274162}, - { 0.941544065183020778412509403}, { 0.336889853392220050689253213}, - {-0.336889853392220050689253213}, { 0.941544065183020778412509403}, - { 0.427555093430282094320966857}, { 0.903989293123443331586200297}, - {-0.903989293123443331586200297}, { 0.427555093430282094320966857}, - { 0.740951125354959091175616897}, { 0.671558954847018400625376850}, - {-0.671558954847018400625376850}, { 0.740951125354959091175616897}, - { 0.049067674327418014254954977}, { 0.998795456205172392714771605}, - {-0.998795456205172392714771605}, { 0.049067674327418014254954977}, - { 0.999698818696204220115765650}, { 0.024541228522912288031734529}, - {-0.024541228522912288031734529}, { 0.999698818696204220115765650}, - { 0.689540544737066924616730630}, { 0.724247082951466920941069243}, - 
{-0.724247082951466920941069243}, { 0.689540544737066924616730630}, - { 0.914209755703530654635014829}, { 0.405241314004989870908481306}, - {-0.405241314004989870908481306}, { 0.914209755703530654635014829}, - { 0.359895036534988148775104572}, { 0.932992798834738887711660256}, - {-0.932992798834738887711660256}, { 0.359895036534988148775104572}, - { 0.975702130038528544460395766}, { 0.219101240156869797227737547}, - {-0.219101240156869797227737547}, { 0.975702130038528544460395766}, - { 0.534997619887097210663076905}, { 0.844853565249707073259571205}, - {-0.844853565249707073259571205}, { 0.534997619887097210663076905}, - { 0.817584813151583696504920884}, { 0.575808191417845300745972454}, - {-0.575808191417845300745972454}, { 0.817584813151583696504920884}, - { 0.170961888760301226363642357}, { 0.985277642388941244774018433}, - {-0.985277642388941244774018433}, { 0.170961888760301226363642357}, - { 0.992479534598709998156767252}, { 0.122410675199216198498704474}, - {-0.122410675199216198498704474}, { 0.992479534598709998156767252}, - { 0.615231590580626845484913563}, { 0.788346427626606262009164705}, - {-0.788346427626606262009164705}, { 0.615231590580626845484913563}, - { 0.870086991108711418652292404}, { 0.492898192229784036873026689}, - {-0.492898192229784036873026689}, { 0.870086991108711418652292404}, - { 0.266712757474898386325286515}, { 0.963776065795439866686464356}, - {-0.963776065795439866686464356}, { 0.266712757474898386325286515}, - { 0.949528180593036667195936074}, { 0.313681740398891476656478846}, - {-0.313681740398891476656478846}, { 0.949528180593036667195936074}, - { 0.449611329654606600046294579}, { 0.893224301195515320342416447}, - {-0.893224301195515320342416447}, { 0.449611329654606600046294579}, - { 0.757208846506484547575464054}, { 0.653172842953776764084203014}, - {-0.653172842953776764084203014}, { 0.757208846506484547575464054}, - { 0.073564563599667423529465622}, { 0.997290456678690216135597140}, - {-0.997290456678690216135597140}, { 
0.073564563599667423529465622}, - { 0.997290456678690216135597140}, { 0.073564563599667423529465622}, - {-0.073564563599667423529465622}, { 0.997290456678690216135597140}, - { 0.653172842953776764084203014}, { 0.757208846506484547575464054}, - {-0.757208846506484547575464054}, { 0.653172842953776764084203014}, - { 0.893224301195515320342416447}, { 0.449611329654606600046294579}, - {-0.449611329654606600046294579}, { 0.893224301195515320342416447}, - { 0.313681740398891476656478846}, { 0.949528180593036667195936074}, - {-0.949528180593036667195936074}, { 0.313681740398891476656478846}, - { 0.963776065795439866686464356}, { 0.266712757474898386325286515}, - {-0.266712757474898386325286515}, { 0.963776065795439866686464356}, - { 0.492898192229784036873026689}, { 0.870086991108711418652292404}, - {-0.870086991108711418652292404}, { 0.492898192229784036873026689}, - { 0.788346427626606262009164705}, { 0.615231590580626845484913563}, - {-0.615231590580626845484913563}, { 0.788346427626606262009164705}, - { 0.122410675199216198498704474}, { 0.992479534598709998156767252}, - {-0.992479534598709998156767252}, { 0.122410675199216198498704474}, - { 0.985277642388941244774018433}, { 0.170961888760301226363642357}, - {-0.170961888760301226363642357}, { 0.985277642388941244774018433}, - { 0.575808191417845300745972454}, { 0.817584813151583696504920884}, - {-0.817584813151583696504920884}, { 0.575808191417845300745972454}, - { 0.844853565249707073259571205}, { 0.534997619887097210663076905}, - {-0.534997619887097210663076905}, { 0.844853565249707073259571205}, - { 0.219101240156869797227737547}, { 0.975702130038528544460395766}, - {-0.975702130038528544460395766}, { 0.219101240156869797227737547}, - { 0.932992798834738887711660256}, { 0.359895036534988148775104572}, - {-0.359895036534988148775104572}, { 0.932992798834738887711660256}, - { 0.405241314004989870908481306}, { 0.914209755703530654635014829}, - {-0.914209755703530654635014829}, { 0.405241314004989870908481306}, - { 
0.724247082951466920941069243}, { 0.689540544737066924616730630}, - {-0.689540544737066924616730630}, { 0.724247082951466920941069243}, - { 0.024541228522912288031734529}, { 0.999698818696204220115765650}, - {-0.999698818696204220115765650}, { 0.024541228522912288031734529}, - { 0.999924701839144540921646491}, { 0.012271538285719926079408262}, - {-0.012271538285719926079408262}, { 0.999924701839144540921646491}, - { 0.698376249408972853554813503}, { 0.715730825283818654125532623}, - {-0.715730825283818654125532623}, { 0.698376249408972853554813503}, - { 0.919113851690057743908477789}, { 0.393992040061048108596188661}, - {-0.393992040061048108596188661}, { 0.919113851690057743908477789}, - { 0.371317193951837543411934967}, { 0.928506080473215565937167396}, - {-0.928506080473215565937167396}, { 0.371317193951837543411934967}, - { 0.978317370719627633106240097}, { 0.207111376192218549708116020}, - {-0.207111376192218549708116020}, { 0.978317370719627633106240097}, - { 0.545324988422046422313987347}, { 0.838224705554838043186996856}, - {-0.838224705554838043186996856}, { 0.545324988422046422313987347}, - { 0.824589302785025264474803737}, { 0.565731810783613197389765011}, - {-0.565731810783613197389765011}, { 0.824589302785025264474803737}, - { 0.183039887955140958516532578}, { 0.983105487431216327180301155}, - {-0.983105487431216327180301155}, { 0.183039887955140958516532578}, - { 0.993906970002356041546922813}, { 0.110222207293883058807899140}, - {-0.110222207293883058807899140}, { 0.993906970002356041546922813}, - { 0.624859488142386377084072816}, { 0.780737228572094478301588484}, - {-0.780737228572094478301588484}, { 0.624859488142386377084072816}, - { 0.876070094195406607095844268}, { 0.482183772079122748517344481}, - {-0.482183772079122748517344481}, { 0.876070094195406607095844268}, - { 0.278519689385053105207848526}, { 0.960430519415565811199035138}, - {-0.960430519415565811199035138}, { 0.278519689385053105207848526}, - { 0.953306040354193836916740383}, { 
0.302005949319228067003463232}, - {-0.302005949319228067003463232}, { 0.953306040354193836916740383}, - { 0.460538710958240023633181487}, { 0.887639620402853947760181617}, - {-0.887639620402853947760181617}, { 0.460538710958240023633181487}, - { 0.765167265622458925888815999}, { 0.643831542889791465068086063}, - {-0.643831542889791465068086063}, { 0.765167265622458925888815999}, - { 0.085797312344439890461556332}, { 0.996312612182778012627226190}, - {-0.996312612182778012627226190}, { 0.085797312344439890461556332}, - { 0.998118112900149207125155861}, { 0.061320736302208577782614593}, - {-0.061320736302208577782614593}, { 0.998118112900149207125155861}, - { 0.662415777590171761113069817}, { 0.749136394523459325469203257}, - {-0.749136394523459325469203257}, { 0.662415777590171761113069817}, - { 0.898674465693953843041976744}, { 0.438616238538527637647025738}, - {-0.438616238538527637647025738}, { 0.898674465693953843041976744}, - { 0.325310292162262934135954708}, { 0.945607325380521325730945387}, - {-0.945607325380521325730945387}, { 0.325310292162262934135954708}, - { 0.966976471044852109087220226}, { 0.254865659604514571553980779}, - {-0.254865659604514571553980779}, { 0.966976471044852109087220226}, - { 0.503538383725717558691867071}, { 0.863972856121586737918147054}, - {-0.863972856121586737918147054}, { 0.503538383725717558691867071}, - { 0.795836904608883536262791915}, { 0.605511041404325513920626941}, - {-0.605511041404325513920626941}, { 0.795836904608883536262791915}, - { 0.134580708507126186316358409}, { 0.990902635427780025108237011}, - {-0.990902635427780025108237011}, { 0.134580708507126186316358409}, - { 0.987301418157858382399815802}, { 0.158858143333861441684385360}, - {-0.158858143333861441684385360}, { 0.987301418157858382399815802}, - { 0.585797857456438860328080838}, { 0.810457198252594791726703434}, - {-0.810457198252594791726703434}, { 0.585797857456438860328080838}, - { 0.851355193105265142261290312}, { 0.524589682678468906215098464}, - 
{-0.524589682678468906215098464}, { 0.851355193105265142261290312}, - { 0.231058108280671119643236018}, { 0.972939952205560145467720114}, - {-0.972939952205560145467720114}, { 0.231058108280671119643236018}, - { 0.937339011912574923201899593}, { 0.348418680249434568419308588}, - {-0.348418680249434568419308588}, { 0.937339011912574923201899593}, - { 0.416429560097637182562598911}, { 0.909167983090522376563884788}, - {-0.909167983090522376563884788}, { 0.416429560097637182562598911}, - { 0.732654271672412834615546649}, { 0.680600997795453050594430464}, - {-0.680600997795453050594430464}, { 0.732654271672412834615546649}, - { 0.036807222941358832324332691}, { 0.999322384588349500896221011}, - {-0.999322384588349500896221011}, { 0.036807222941358832324332691}, - { 0.999322384588349500896221011}, { 0.036807222941358832324332691}, - {-0.036807222941358832324332691}, { 0.999322384588349500896221011}, - { 0.680600997795453050594430464}, { 0.732654271672412834615546649}, - {-0.732654271672412834615546649}, { 0.680600997795453050594430464}, - { 0.909167983090522376563884788}, { 0.416429560097637182562598911}, - {-0.416429560097637182562598911}, { 0.909167983090522376563884788}, - { 0.348418680249434568419308588}, { 0.937339011912574923201899593}, - {-0.937339011912574923201899593}, { 0.348418680249434568419308588}, - { 0.972939952205560145467720114}, { 0.231058108280671119643236018}, - {-0.231058108280671119643236018}, { 0.972939952205560145467720114}, - { 0.524589682678468906215098464}, { 0.851355193105265142261290312}, - {-0.851355193105265142261290312}, { 0.524589682678468906215098464}, - { 0.810457198252594791726703434}, { 0.585797857456438860328080838}, - {-0.585797857456438860328080838}, { 0.810457198252594791726703434}, - { 0.158858143333861441684385360}, { 0.987301418157858382399815802}, - {-0.987301418157858382399815802}, { 0.158858143333861441684385360}, - { 0.990902635427780025108237011}, { 0.134580708507126186316358409}, - {-0.134580708507126186316358409}, { 
0.990902635427780025108237011}, - { 0.605511041404325513920626941}, { 0.795836904608883536262791915}, - {-0.795836904608883536262791915}, { 0.605511041404325513920626941}, - { 0.863972856121586737918147054}, { 0.503538383725717558691867071}, - {-0.503538383725717558691867071}, { 0.863972856121586737918147054}, - { 0.254865659604514571553980779}, { 0.966976471044852109087220226}, - {-0.966976471044852109087220226}, { 0.254865659604514571553980779}, - { 0.945607325380521325730945387}, { 0.325310292162262934135954708}, - {-0.325310292162262934135954708}, { 0.945607325380521325730945387}, - { 0.438616238538527637647025738}, { 0.898674465693953843041976744}, - {-0.898674465693953843041976744}, { 0.438616238538527637647025738}, - { 0.749136394523459325469203257}, { 0.662415777590171761113069817}, - {-0.662415777590171761113069817}, { 0.749136394523459325469203257}, - { 0.061320736302208577782614593}, { 0.998118112900149207125155861}, - {-0.998118112900149207125155861}, { 0.061320736302208577782614593}, - { 0.996312612182778012627226190}, { 0.085797312344439890461556332}, - {-0.085797312344439890461556332}, { 0.996312612182778012627226190}, - { 0.643831542889791465068086063}, { 0.765167265622458925888815999}, - {-0.765167265622458925888815999}, { 0.643831542889791465068086063}, - { 0.887639620402853947760181617}, { 0.460538710958240023633181487}, - {-0.460538710958240023633181487}, { 0.887639620402853947760181617}, - { 0.302005949319228067003463232}, { 0.953306040354193836916740383}, - {-0.953306040354193836916740383}, { 0.302005949319228067003463232}, - { 0.960430519415565811199035138}, { 0.278519689385053105207848526}, - {-0.278519689385053105207848526}, { 0.960430519415565811199035138}, - { 0.482183772079122748517344481}, { 0.876070094195406607095844268}, - {-0.876070094195406607095844268}, { 0.482183772079122748517344481}, - { 0.780737228572094478301588484}, { 0.624859488142386377084072816}, - {-0.624859488142386377084072816}, { 0.780737228572094478301588484}, - { 
0.110222207293883058807899140}, { 0.993906970002356041546922813}, - {-0.993906970002356041546922813}, { 0.110222207293883058807899140}, - { 0.983105487431216327180301155}, { 0.183039887955140958516532578}, - {-0.183039887955140958516532578}, { 0.983105487431216327180301155}, - { 0.565731810783613197389765011}, { 0.824589302785025264474803737}, - {-0.824589302785025264474803737}, { 0.565731810783613197389765011}, - { 0.838224705554838043186996856}, { 0.545324988422046422313987347}, - {-0.545324988422046422313987347}, { 0.838224705554838043186996856}, - { 0.207111376192218549708116020}, { 0.978317370719627633106240097}, - {-0.978317370719627633106240097}, { 0.207111376192218549708116020}, - { 0.928506080473215565937167396}, { 0.371317193951837543411934967}, - {-0.371317193951837543411934967}, { 0.928506080473215565937167396}, - { 0.393992040061048108596188661}, { 0.919113851690057743908477789}, - {-0.919113851690057743908477789}, { 0.393992040061048108596188661}, - { 0.715730825283818654125532623}, { 0.698376249408972853554813503}, - {-0.698376249408972853554813503}, { 0.715730825283818654125532623}, - { 0.012271538285719926079408262}, { 0.999924701839144540921646491}, - {-0.999924701839144540921646491}, { 0.012271538285719926079408262}, - { 0.999981175282601142656990438}, { 0.006135884649154475359640235}, - {-0.006135884649154475359640235}, { 0.999981175282601142656990438}, - { 0.702754744457225302452914421}, { 0.711432195745216441522130290}, - {-0.711432195745216441522130290}, { 0.702754744457225302452914421}, - { 0.921514039342041943465396332}, { 0.388345046698826291624993541}, - {-0.388345046698826291624993541}, { 0.921514039342041943465396332}, - { 0.377007410216418256726567823}, { 0.926210242138311341974793388}, - {-0.926210242138311341974793388}, { 0.377007410216418256726567823}, - { 0.979569765685440534439326110}, { 0.201104634842091911558443546}, - {-0.201104634842091911558443546}, { 0.979569765685440534439326110}, - { 0.550457972936604802977289893}, { 
0.834862874986380056304401383}, - {-0.834862874986380056304401383}, { 0.550457972936604802977289893}, - { 0.828045045257755752067527592}, { 0.560661576197336023839710223}, - {-0.560661576197336023839710223}, { 0.828045045257755752067527592}, - { 0.189068664149806212754997837}, { 0.981963869109555264072848154}, - {-0.981963869109555264072848154}, { 0.189068664149806212754997837}, - { 0.994564570734255452119106243}, { 0.104121633872054579120943880}, - {-0.104121633872054579120943880}, { 0.994564570734255452119106243}, - { 0.629638238914927025372981341}, { 0.776888465673232450040827983}, - {-0.776888465673232450040827983}, { 0.629638238914927025372981341}, - { 0.879012226428633477831323711}, { 0.476799230063322133342158117}, - {-0.476799230063322133342158117}, { 0.879012226428633477831323711}, - { 0.284407537211271843618310615}, { 0.958703474895871555374645792}, - {-0.958703474895871555374645792}, { 0.284407537211271843618310615}, - { 0.955141168305770721498157712}, { 0.296150888243623824121786128}, - {-0.296150888243623824121786128}, { 0.955141168305770721498157712}, - { 0.465976495767966177902756065}, { 0.884797098430937780104007041}, - {-0.884797098430937780104007041}, { 0.465976495767966177902756065}, - { 0.769103337645579639346626069}, { 0.639124444863775743801488193}, - {-0.639124444863775743801488193}, { 0.769103337645579639346626069}, - { 0.091908956497132728624990979}, { 0.995767414467659793982495643}, - {-0.995767414467659793982495643}, { 0.091908956497132728624990979}, - { 0.998475580573294752208559038}, { 0.055195244349689939809447526}, - {-0.055195244349689939809447526}, { 0.998475580573294752208559038}, - { 0.666999922303637506650154222}, { 0.745057785441465962407907310}, - {-0.745057785441465962407907310}, { 0.666999922303637506650154222}, - { 0.901348847046022014570746093}, { 0.433093818853151968484222638}, - {-0.433093818853151968484222638}, { 0.901348847046022014570746093}, - { 0.331106305759876401737190737}, { 0.943593458161960361495301445}, - 
{-0.943593458161960361495301445}, { 0.331106305759876401737190737}, - { 0.968522094274417316221088329}, { 0.248927605745720168110682816}, - {-0.248927605745720168110682816}, { 0.968522094274417316221088329}, - { 0.508830142543107036931749324}, { 0.860866938637767279344583877}, - {-0.860866938637767279344583877}, { 0.508830142543107036931749324}, - { 0.799537269107905033500246232}, { 0.600616479383868926653875896}, - {-0.600616479383868926653875896}, { 0.799537269107905033500246232}, - { 0.140658239332849230714788846}, { 0.990058210262297105505906464}, - {-0.990058210262297105505906464}, { 0.140658239332849230714788846}, - { 0.988257567730749491404792538}, { 0.152797185258443427720336613}, - {-0.152797185258443427720336613}, { 0.988257567730749491404792538}, - { 0.590759701858874228423887908}, { 0.806847553543799272206514313}, - {-0.806847553543799272206514313}, { 0.590759701858874228423887908}, - { 0.854557988365400520767862276}, { 0.519355990165589587361829932}, - {-0.519355990165589587361829932}, { 0.854557988365400520767862276}, - { 0.237023605994367206867735915}, { 0.971503890986251775537099622}, - {-0.971503890986251775537099622}, { 0.237023605994367206867735915}, - { 0.939459223602189911962669246}, { 0.342660717311994397592781983}, - {-0.342660717311994397592781983}, { 0.939459223602189911962669246}, - { 0.422000270799799685941287941}, { 0.906595704514915365332960588}, - {-0.906595704514915365332960588}, { 0.422000270799799685941287941}, - { 0.736816568877369875090132520}, { 0.676092703575315960360419228}, - {-0.676092703575315960360419228}, { 0.736816568877369875090132520}, - { 0.042938256934940823077124540}, { 0.999077727752645382888781997}, - {-0.999077727752645382888781997}, { 0.042938256934940823077124540}, - { 0.999529417501093163079703322}, { 0.030674803176636625934021028}, - {-0.030674803176636625934021028}, { 0.999529417501093163079703322}, - { 0.685083667772700381362052545}, { 0.728464390448225196492035438}, - {-0.728464390448225196492035438}, { 
0.685083667772700381362052545}, - { 0.911706032005429851404397325}, { 0.410843171057903942183466675}, - {-0.410843171057903942183466675}, { 0.911706032005429851404397325}, - { 0.354163525420490382357395796}, { 0.935183509938947577642207480}, - {-0.935183509938947577642207480}, { 0.354163525420490382357395796}, - { 0.974339382785575860518721668}, { 0.225083911359792835991642120}, - {-0.225083911359792835991642120}, { 0.974339382785575860518721668}, - { 0.529803624686294668216054671}, { 0.848120344803297251279133563}, - {-0.848120344803297251279133563}, { 0.529803624686294668216054671}, - { 0.814036329705948361654516690}, { 0.580813958095764545075595272}, - {-0.580813958095764545075595272}, { 0.814036329705948361654516690}, - { 0.164913120489969921418189113}, { 0.986308097244598647863297524}, - {-0.986308097244598647863297524}, { 0.164913120489969921418189113}, - { 0.991709753669099522860049931}, { 0.128498110793793172624415589}, - {-0.128498110793793172624415589}, { 0.991709753669099522860049931}, - { 0.610382806276309452716352152}, { 0.792106577300212351782342879}, - {-0.792106577300212351782342879}, { 0.610382806276309452716352152}, - { 0.867046245515692651480195629}, { 0.498227666972781852410983869}, - {-0.498227666972781852410983869}, { 0.867046245515692651480195629}, - { 0.260794117915275518280186509}, { 0.965394441697689374550843858}, - {-0.965394441697689374550843858}, { 0.260794117915275518280186509}, - { 0.947585591017741134653387321}, { 0.319502030816015677901518272}, - {-0.319502030816015677901518272}, { 0.947585591017741134653387321}, - { 0.444122144570429231642069418}, { 0.895966249756185155914560282}, - {-0.895966249756185155914560282}, { 0.444122144570429231642069418}, - { 0.753186799043612482483430486}, { 0.657806693297078656931182264}, - {-0.657806693297078656931182264}, { 0.753186799043612482483430486}, - { 0.067443919563664057897972422}, { 0.997723066644191609848546728}, - {-0.997723066644191609848546728}, { 0.067443919563664057897972422}, - { 
0.996820299291165714972629398}, { 0.079682437971430121147120656}, - {-0.079682437971430121147120656}, { 0.996820299291165714972629398}, - { 0.648514401022112445084560551}, { 0.761202385484261814029709836}, - {-0.761202385484261814029709836}, { 0.648514401022112445084560551}, - { 0.890448723244757889952150560}, { 0.455083587126343823535869268}, - {-0.455083587126343823535869268}, { 0.890448723244757889952150560}, - { 0.307849640041534893682063646}, { 0.951435020969008369549175569}, - {-0.951435020969008369549175569}, { 0.307849640041534893682063646}, - { 0.962121404269041595429604316}, { 0.272621355449948984493347477}, - {-0.272621355449948984493347477}, { 0.962121404269041595429604316}, - { 0.487550160148435954641485027}, { 0.873094978418290098636085973}, - {-0.873094978418290098636085973}, { 0.487550160148435954641485027}, - { 0.784556597155575233023892575}, { 0.620057211763289178646268191}, - {-0.620057211763289178646268191}, { 0.784556597155575233023892575}, - { 0.116318630911904767252544319}, { 0.993211949234794533104601012}, - {-0.993211949234794533104601012}, { 0.116318630911904767252544319}, - { 0.984210092386929073193874387}, { 0.177004220412148756196839844}, - {-0.177004220412148756196839844}, { 0.984210092386929073193874387}, - { 0.570780745886967280232652864}, { 0.821102514991104679060430820}, - {-0.821102514991104679060430820}, { 0.570780745886967280232652864}, - { 0.841554977436898409603499520}, { 0.540171472729892881297845480}, - {-0.540171472729892881297845480}, { 0.841554977436898409603499520}, - { 0.213110319916091373967757518}, { 0.977028142657754351485866211}, - {-0.977028142657754351485866211}, { 0.213110319916091373967757518}, - { 0.930766961078983731944872340}, { 0.365612997804773870011745909}, - {-0.365612997804773870011745909}, { 0.930766961078983731944872340}, - { 0.399624199845646828544117031}, { 0.916679059921042663116457013}, - {-0.916679059921042663116457013}, { 0.399624199845646828544117031}, - { 0.720002507961381629076682999}, { 
0.693971460889654009003734389}, - {-0.693971460889654009003734389}, { 0.720002507961381629076682999}, - { 0.018406729905804820927366313}, { 0.999830581795823422015722275}, - {-0.999830581795823422015722275}, { 0.018406729905804820927366313}, - { 0.999830581795823422015722275}, { 0.018406729905804820927366313}, - {-0.018406729905804820927366313}, { 0.999830581795823422015722275}, - { 0.693971460889654009003734389}, { 0.720002507961381629076682999}, - {-0.720002507961381629076682999}, { 0.693971460889654009003734389}, - { 0.916679059921042663116457013}, { 0.399624199845646828544117031}, - {-0.399624199845646828544117031}, { 0.916679059921042663116457013}, - { 0.365612997804773870011745909}, { 0.930766961078983731944872340}, - {-0.930766961078983731944872340}, { 0.365612997804773870011745909}, - { 0.977028142657754351485866211}, { 0.213110319916091373967757518}, - {-0.213110319916091373967757518}, { 0.977028142657754351485866211}, - { 0.540171472729892881297845480}, { 0.841554977436898409603499520}, - {-0.841554977436898409603499520}, { 0.540171472729892881297845480}, - { 0.821102514991104679060430820}, { 0.570780745886967280232652864}, - {-0.570780745886967280232652864}, { 0.821102514991104679060430820}, - { 0.177004220412148756196839844}, { 0.984210092386929073193874387}, - {-0.984210092386929073193874387}, { 0.177004220412148756196839844}, - { 0.993211949234794533104601012}, { 0.116318630911904767252544319}, - {-0.116318630911904767252544319}, { 0.993211949234794533104601012}, - { 0.620057211763289178646268191}, { 0.784556597155575233023892575}, - {-0.784556597155575233023892575}, { 0.620057211763289178646268191}, - { 0.873094978418290098636085973}, { 0.487550160148435954641485027}, - {-0.487550160148435954641485027}, { 0.873094978418290098636085973}, - { 0.272621355449948984493347477}, { 0.962121404269041595429604316}, - {-0.962121404269041595429604316}, { 0.272621355449948984493347477}, - { 0.951435020969008369549175569}, { 0.307849640041534893682063646}, - 
{-0.307849640041534893682063646}, { 0.951435020969008369549175569}, - { 0.455083587126343823535869268}, { 0.890448723244757889952150560}, - {-0.890448723244757889952150560}, { 0.455083587126343823535869268}, - { 0.761202385484261814029709836}, { 0.648514401022112445084560551}, - {-0.648514401022112445084560551}, { 0.761202385484261814029709836}, - { 0.079682437971430121147120656}, { 0.996820299291165714972629398}, - {-0.996820299291165714972629398}, { 0.079682437971430121147120656}, - { 0.997723066644191609848546728}, { 0.067443919563664057897972422}, - {-0.067443919563664057897972422}, { 0.997723066644191609848546728}, - { 0.657806693297078656931182264}, { 0.753186799043612482483430486}, - {-0.753186799043612482483430486}, { 0.657806693297078656931182264}, - { 0.895966249756185155914560282}, { 0.444122144570429231642069418}, - {-0.444122144570429231642069418}, { 0.895966249756185155914560282}, - { 0.319502030816015677901518272}, { 0.947585591017741134653387321}, - {-0.947585591017741134653387321}, { 0.319502030816015677901518272}, - { 0.965394441697689374550843858}, { 0.260794117915275518280186509}, - {-0.260794117915275518280186509}, { 0.965394441697689374550843858}, - { 0.498227666972781852410983869}, { 0.867046245515692651480195629}, - {-0.867046245515692651480195629}, { 0.498227666972781852410983869}, - { 0.792106577300212351782342879}, { 0.610382806276309452716352152}, - {-0.610382806276309452716352152}, { 0.792106577300212351782342879}, - { 0.128498110793793172624415589}, { 0.991709753669099522860049931}, - {-0.991709753669099522860049931}, { 0.128498110793793172624415589}, - { 0.986308097244598647863297524}, { 0.164913120489969921418189113}, - {-0.164913120489969921418189113}, { 0.986308097244598647863297524}, - { 0.580813958095764545075595272}, { 0.814036329705948361654516690}, - {-0.814036329705948361654516690}, { 0.580813958095764545075595272}, - { 0.848120344803297251279133563}, { 0.529803624686294668216054671}, - {-0.529803624686294668216054671}, { 
0.848120344803297251279133563}, - { 0.225083911359792835991642120}, { 0.974339382785575860518721668}, - {-0.974339382785575860518721668}, { 0.225083911359792835991642120}, - { 0.935183509938947577642207480}, { 0.354163525420490382357395796}, - {-0.354163525420490382357395796}, { 0.935183509938947577642207480}, - { 0.410843171057903942183466675}, { 0.911706032005429851404397325}, - {-0.911706032005429851404397325}, { 0.410843171057903942183466675}, - { 0.728464390448225196492035438}, { 0.685083667772700381362052545}, - {-0.685083667772700381362052545}, { 0.728464390448225196492035438}, - { 0.030674803176636625934021028}, { 0.999529417501093163079703322}, - {-0.999529417501093163079703322}, { 0.030674803176636625934021028}, - { 0.999077727752645382888781997}, { 0.042938256934940823077124540}, - {-0.042938256934940823077124540}, { 0.999077727752645382888781997}, - { 0.676092703575315960360419228}, { 0.736816568877369875090132520}, - {-0.736816568877369875090132520}, { 0.676092703575315960360419228}, - { 0.906595704514915365332960588}, { 0.422000270799799685941287941}, - {-0.422000270799799685941287941}, { 0.906595704514915365332960588}, - { 0.342660717311994397592781983}, { 0.939459223602189911962669246}, - {-0.939459223602189911962669246}, { 0.342660717311994397592781983}, - { 0.971503890986251775537099622}, { 0.237023605994367206867735915}, - {-0.237023605994367206867735915}, { 0.971503890986251775537099622}, - { 0.519355990165589587361829932}, { 0.854557988365400520767862276}, - {-0.854557988365400520767862276}, { 0.519355990165589587361829932}, - { 0.806847553543799272206514313}, { 0.590759701858874228423887908}, - {-0.590759701858874228423887908}, { 0.806847553543799272206514313}, - { 0.152797185258443427720336613}, { 0.988257567730749491404792538}, - {-0.988257567730749491404792538}, { 0.152797185258443427720336613}, - { 0.990058210262297105505906464}, { 0.140658239332849230714788846}, - {-0.140658239332849230714788846}, { 0.990058210262297105505906464}, - { 
0.600616479383868926653875896}, { 0.799537269107905033500246232}, - {-0.799537269107905033500246232}, { 0.600616479383868926653875896}, - { 0.860866938637767279344583877}, { 0.508830142543107036931749324}, - {-0.508830142543107036931749324}, { 0.860866938637767279344583877}, - { 0.248927605745720168110682816}, { 0.968522094274417316221088329}, - {-0.968522094274417316221088329}, { 0.248927605745720168110682816}, - { 0.943593458161960361495301445}, { 0.331106305759876401737190737}, - {-0.331106305759876401737190737}, { 0.943593458161960361495301445}, - { 0.433093818853151968484222638}, { 0.901348847046022014570746093}, - {-0.901348847046022014570746093}, { 0.433093818853151968484222638}, - { 0.745057785441465962407907310}, { 0.666999922303637506650154222}, - {-0.666999922303637506650154222}, { 0.745057785441465962407907310}, - { 0.055195244349689939809447526}, { 0.998475580573294752208559038}, - {-0.998475580573294752208559038}, { 0.055195244349689939809447526}, - { 0.995767414467659793982495643}, { 0.091908956497132728624990979}, - {-0.091908956497132728624990979}, { 0.995767414467659793982495643}, - { 0.639124444863775743801488193}, { 0.769103337645579639346626069}, - {-0.769103337645579639346626069}, { 0.639124444863775743801488193}, - { 0.884797098430937780104007041}, { 0.465976495767966177902756065}, - {-0.465976495767966177902756065}, { 0.884797098430937780104007041}, - { 0.296150888243623824121786128}, { 0.955141168305770721498157712}, - {-0.955141168305770721498157712}, { 0.296150888243623824121786128}, - { 0.958703474895871555374645792}, { 0.284407537211271843618310615}, - {-0.284407537211271843618310615}, { 0.958703474895871555374645792}, - { 0.476799230063322133342158117}, { 0.879012226428633477831323711}, - {-0.879012226428633477831323711}, { 0.476799230063322133342158117}, - { 0.776888465673232450040827983}, { 0.629638238914927025372981341}, - {-0.629638238914927025372981341}, { 0.776888465673232450040827983}, - { 0.104121633872054579120943880}, { 
0.994564570734255452119106243}, - {-0.994564570734255452119106243}, { 0.104121633872054579120943880}, - { 0.981963869109555264072848154}, { 0.189068664149806212754997837}, - {-0.189068664149806212754997837}, { 0.981963869109555264072848154}, - { 0.560661576197336023839710223}, { 0.828045045257755752067527592}, - {-0.828045045257755752067527592}, { 0.560661576197336023839710223}, - { 0.834862874986380056304401383}, { 0.550457972936604802977289893}, - {-0.550457972936604802977289893}, { 0.834862874986380056304401383}, - { 0.201104634842091911558443546}, { 0.979569765685440534439326110}, - {-0.979569765685440534439326110}, { 0.201104634842091911558443546}, - { 0.926210242138311341974793388}, { 0.377007410216418256726567823}, - {-0.377007410216418256726567823}, { 0.926210242138311341974793388}, - { 0.388345046698826291624993541}, { 0.921514039342041943465396332}, - {-0.921514039342041943465396332}, { 0.388345046698826291624993541}, - { 0.711432195745216441522130290}, { 0.702754744457225302452914421}, - {-0.702754744457225302452914421}, { 0.711432195745216441522130290}, - { 0.006135884649154475359640235}, { 0.999981175282601142656990438}, - {-0.999981175282601142656990438}, { 0.006135884649154475359640235}, - { 0.999995293809576171511580126}, { 0.003067956762965976270145365}, - {-0.003067956762965976270145365}, { 0.999995293809576171511580126}, - { 0.704934080375904908852523758}, { 0.709272826438865651316533772}, - {-0.709272826438865651316533772}, { 0.704934080375904908852523758}, - { 0.922701128333878570437264227}, { 0.385516053843918864075607949}, - {-0.385516053843918864075607949}, { 0.922701128333878570437264227}, - { 0.379847208924051170576281147}, { 0.925049240782677590302371869}, - {-0.925049240782677590302371869}, { 0.379847208924051170576281147}, - { 0.980182135968117392690210009}, { 0.198098410717953586179324918}, - {-0.198098410717953586179324918}, { 0.980182135968117392690210009}, - { 0.553016705580027531764226988}, { 0.833170164701913186439915922}, - 
{-0.833170164701913186439915922}, { 0.553016705580027531764226988}, - { 0.829761233794523042469023765}, { 0.558118531220556115693702964}, - {-0.558118531220556115693702964}, { 0.829761233794523042469023765}, - { 0.192080397049892441679288205}, { 0.981379193313754574318224190}, - {-0.981379193313754574318224190}, { 0.192080397049892441679288205}, - { 0.994879330794805620591166107}, { 0.101069862754827824987887585}, - {-0.101069862754827824987887585}, { 0.994879330794805620591166107}, - { 0.632018735939809021909403706}, { 0.774953106594873878359129282}, - {-0.774953106594873878359129282}, { 0.632018735939809021909403706}, - { 0.880470889052160770806542929}, { 0.474100214650550014398580015}, - {-0.474100214650550014398580015}, { 0.880470889052160770806542929}, - { 0.287347459544729526477331841}, { 0.957826413027532890321037029}, - {-0.957826413027532890321037029}, { 0.287347459544729526477331841}, - { 0.956045251349996443270479823}, { 0.293219162694258650606608599}, - {-0.293219162694258650606608599}, { 0.956045251349996443270479823}, - { 0.468688822035827933697617870}, { 0.883363338665731594736308015}, - {-0.883363338665731594736308015}, { 0.468688822035827933697617870}, - { 0.771060524261813773200605759}, { 0.636761861236284230413943435}, - {-0.636761861236284230413943435}, { 0.771060524261813773200605759}, - { 0.094963495329638998938034312}, { 0.995480755491926941769171600}, - {-0.995480755491926941769171600}, { 0.094963495329638998938034312}, - { 0.998640218180265222418199049}, { 0.052131704680283321236358216}, - {-0.052131704680283321236358216}, { 0.998640218180265222418199049}, - { 0.669282588346636065720696366}, { 0.743007952135121693517362293}, - {-0.743007952135121693517362293}, { 0.669282588346636065720696366}, - { 0.902673318237258806751502391}, { 0.430326481340082633908199031}, - {-0.430326481340082633908199031}, { 0.902673318237258806751502391}, - { 0.333999651442009404650865481}, { 0.942573197601446879280758735}, - {-0.942573197601446879280758735}, { 
0.333999651442009404650865481}, - { 0.969281235356548486048290738}, { 0.245955050335794611599924709}, - {-0.245955050335794611599924709}, { 0.969281235356548486048290738}, - { 0.511468850437970399504391001}, { 0.859301818357008404783582139}, - {-0.859301818357008404783582139}, { 0.511468850437970399504391001}, - { 0.801376171723140219430247777}, { 0.598160706996342311724958652}, - {-0.598160706996342311724958652}, { 0.801376171723140219430247777}, - { 0.143695033150294454819773349}, { 0.989622017463200834623694454}, - {-0.989622017463200834623694454}, { 0.143695033150294454819773349}, - { 0.988721691960323767604516485}, { 0.149764534677321517229695737}, - {-0.149764534677321517229695737}, { 0.988721691960323767604516485}, - { 0.593232295039799808047809426}, { 0.805031331142963597922659282}, - {-0.805031331142963597922659282}, { 0.593232295039799808047809426}, - { 0.856147328375194481019630732}, { 0.516731799017649881508753876}, - {-0.516731799017649881508753876}, { 0.856147328375194481019630732}, - { 0.240003022448741486568922365}, { 0.970772140728950302138169611}, - {-0.970772140728950302138169611}, { 0.240003022448741486568922365}, - { 0.940506070593268323787291309}, { 0.339776884406826857828825803}, - {-0.339776884406826857828825803}, { 0.940506070593268323787291309}, - { 0.424779681209108833357226189}, { 0.905296759318118774354048329}, - {-0.905296759318118774354048329}, { 0.424779681209108833357226189}, - { 0.738887324460615147933116508}, { 0.673829000378756060917568372}, - {-0.673829000378756060917568372}, { 0.738887324460615147933116508}, - { 0.046003182130914628814301788}, { 0.998941293186856850633930266}, - {-0.998941293186856850633930266}, { 0.046003182130914628814301788}, - { 0.999618822495178597116830637}, { 0.027608145778965741612354872}, - {-0.027608145778965741612354872}, { 0.999618822495178597116830637}, - { 0.687315340891759108199186948}, { 0.726359155084345976817494315}, - {-0.726359155084345976817494315}, { 0.687315340891759108199186948}, - { 
0.912962190428398164628018233}, { 0.408044162864978680820747499}, - {-0.408044162864978680820747499}, { 0.912962190428398164628018233}, - { 0.357030961233430032614954036}, { 0.934092550404258914729877883}, - {-0.934092550404258914729877883}, { 0.357030961233430032614954036}, - { 0.975025345066994146844913468}, { 0.222093620973203534094094721}, - {-0.222093620973203534094094721}, { 0.975025345066994146844913468}, - { 0.532403127877197971442805218}, { 0.846490938774052078300544488}, - {-0.846490938774052078300544488}, { 0.532403127877197971442805218}, - { 0.815814410806733789010772660}, { 0.578313796411655563342245019}, - {-0.578313796411655563342245019}, { 0.815814410806733789010772660}, - { 0.167938294974731178054745536}, { 0.985797509167567424700995000}, - {-0.985797509167567424700995000}, { 0.167938294974731178054745536}, - { 0.992099313142191757112085445}, { 0.125454983411546238542336453}, - {-0.125454983411546238542336453}, { 0.992099313142191757112085445}, - { 0.612810082429409703935211936}, { 0.790230221437310055030217152}, - {-0.790230221437310055030217152}, { 0.612810082429409703935211936}, - { 0.868570705971340895340449876}, { 0.495565261825772531150266670}, - {-0.495565261825772531150266670}, { 0.868570705971340895340449876}, - { 0.263754678974831383611349322}, { 0.964589793289812723836432159}, - {-0.964589793289812723836432159}, { 0.263754678974831383611349322}, - { 0.948561349915730288158494826}, { 0.316593375556165867243047035}, - {-0.316593375556165867243047035}, { 0.948561349915730288158494826}, - { 0.446868840162374195353044389}, { 0.894599485631382678433072126}, - {-0.894599485631382678433072126}, { 0.446868840162374195353044389}, - { 0.755201376896536527598710756}, { 0.655492852999615385312679701}, - {-0.655492852999615385312679701}, { 0.755201376896536527598710756}, - { 0.070504573389613863027351471}, { 0.997511456140303459699448390}, - {-0.997511456140303459699448390}, { 0.070504573389613863027351471}, - { 0.997060070339482978987989949}, { 
0.076623861392031492278332463}, - {-0.076623861392031492278332463}, { 0.997060070339482978987989949}, - { 0.650846684996380915068975573}, { 0.759209188978388033485525443}, - {-0.759209188978388033485525443}, { 0.650846684996380915068975573}, - { 0.891840709392342727796478697}, { 0.452349587233770874133026703}, - {-0.452349587233770874133026703}, { 0.891840709392342727796478697}, - { 0.310767152749611495835997250}, { 0.950486073949481721759926101}, - {-0.950486073949481721759926101}, { 0.310767152749611495835997250}, - { 0.962953266873683886347921481}, { 0.269668325572915106525464462}, - {-0.269668325572915106525464462}, { 0.962953266873683886347921481}, - { 0.490226483288291154229598449}, { 0.871595086655951034842481435}, - {-0.871595086655951034842481435}, { 0.490226483288291154229598449}, - { 0.786455213599085757522319464}, { 0.617647307937803932403979402}, - {-0.617647307937803932403979402}, { 0.786455213599085757522319464}, - { 0.119365214810991364593637790}, { 0.992850414459865090793563344}, - {-0.992850414459865090793563344}, { 0.119365214810991364593637790}, - { 0.984748501801904218556553176}, { 0.173983873387463827950700807}, - {-0.173983873387463827950700807}, { 0.984748501801904218556553176}, - { 0.573297166698042212820171239}, { 0.819347520076796960824689637}, - {-0.819347520076796960824689637}, { 0.573297166698042212820171239}, - { 0.843208239641845437161743865}, { 0.537587076295645482502214932}, - {-0.537587076295645482502214932}, { 0.843208239641845437161743865}, - { 0.216106797076219509948385131}, { 0.976369731330021149312732194}, - {-0.976369731330021149312732194}, { 0.216106797076219509948385131}, - { 0.931884265581668106718557199}, { 0.362755724367397216204854462}, - {-0.362755724367397216204854462}, { 0.931884265581668106718557199}, - { 0.402434650859418441082533934}, { 0.915448716088267819566431292}, - {-0.915448716088267819566431292}, { 0.402434650859418441082533934}, - { 0.722128193929215321243607198}, { 0.691759258364157774906734132}, - 
{-0.691759258364157774906734132}, { 0.722128193929215321243607198}, - { 0.021474080275469507418374898}, { 0.999769405351215321657617036}, - {-0.999769405351215321657617036}, { 0.021474080275469507418374898}, - { 0.999882347454212525633049627}, { 0.015339206284988101044151868}, - {-0.015339206284988101044151868}, { 0.999882347454212525633049627}, - { 0.696177131491462944788582591}, { 0.717870045055731736211325329}, - {-0.717870045055731736211325329}, { 0.696177131491462944788582591}, - { 0.917900775621390457642276297}, { 0.396809987416710328595290911}, - {-0.396809987416710328595290911}, { 0.917900775621390457642276297}, - { 0.368466829953372331712746222}, { 0.929640895843181265457918066}, - {-0.929640895843181265457918066}, { 0.368466829953372331712746222}, - { 0.977677357824509979943404762}, { 0.210111836880469621717489972}, - {-0.210111836880469621717489972}, { 0.977677357824509979943404762}, - { 0.542750784864515906586768661}, { 0.839893794195999504583383987}, - {-0.839893794195999504583383987}, { 0.542750784864515906586768661}, - { 0.822849781375826332046780034}, { 0.568258952670131549790548489}, - {-0.568258952670131549790548489}, { 0.822849781375826332046780034}, - { 0.180022901405699522679906590}, { 0.983662419211730274396237776}, - {-0.983662419211730274396237776}, { 0.180022901405699522679906590}, - { 0.993564135520595333782021697}, { 0.113270952177564349018228733}, - {-0.113270952177564349018228733}, { 0.993564135520595333782021697}, - { 0.622461279374149972519166721}, { 0.782650596166575738458949301}, - {-0.782650596166575738458949301}, { 0.622461279374149972519166721}, - { 0.874586652278176112634431897}, { 0.484869248000791101822951699}, - {-0.484869248000791101822951699}, { 0.874586652278176112634431897}, - { 0.275571819310958163076425168}, { 0.961280485811320641748659653}, - {-0.961280485811320641748659653}, { 0.275571819310958163076425168}, - { 0.952375012719765858529893608}, { 0.304929229735402406490728633}, - {-0.304929229735402406490728633}, { 
0.952375012719765858529893608}, - { 0.457813303598877221904961155}, { 0.889048355854664562540777729}, - {-0.889048355854664562540777729}, { 0.457813303598877221904961155}, - { 0.763188417263381271704838297}, { 0.646176012983316364832802220}, - {-0.646176012983316364832802220}, { 0.763188417263381271704838297}, - { 0.082740264549375693111987083}, { 0.996571145790554847093566910}, - {-0.996571145790554847093566910}, { 0.082740264549375693111987083}, - { 0.997925286198596012623025462}, { 0.064382630929857460819324537}, - {-0.064382630929857460819324537}, { 0.997925286198596012623025462}, - { 0.660114342067420478559490747}, { 0.751165131909686411205819422}, - {-0.751165131909686411205819422}, { 0.660114342067420478559490747}, - { 0.897324580705418281231391836}, { 0.441371268731716692879988968}, - {-0.441371268731716692879988968}, { 0.897324580705418281231391836}, - { 0.322407678801069848384807478}, { 0.946600913083283570044599823}, - {-0.946600913083283570044599823}, { 0.322407678801069848384807478}, - { 0.966190003445412555433832961}, { 0.257831102162159005614471295}, - {-0.257831102162159005614471295}, { 0.966190003445412555433832961}, - { 0.500885382611240786241285004}, { 0.865513624090569082825488358}, - {-0.865513624090569082825488358}, { 0.500885382611240786241285004}, - { 0.793975477554337164895083757}, { 0.607949784967773667243642671}, - {-0.607949784967773667243642671}, { 0.793975477554337164895083757}, - { 0.131540028702883111103387493}, { 0.991310859846115418957349799}, - {-0.991310859846115418957349799}, { 0.131540028702883111103387493}, - { 0.986809401814185476970235952}, { 0.161886393780111837641387995}, - {-0.161886393780111837641387995}, { 0.986809401814185476970235952}, - { 0.583308652937698294392830961}, { 0.812250586585203913049744181}, - {-0.812250586585203913049744181}, { 0.583308652937698294392830961}, - { 0.849741768000852489471268395}, { 0.527199134781901348464274575}, - {-0.527199134781901348464274575}, { 0.849741768000852489471268395}, - { 
0.228072083170885739254457379}, { 0.973644249650811925318383912}, - {-0.973644249650811925318383912}, { 0.228072083170885739254457379}, - { 0.936265667170278246576310996}, { 0.351292756085567125601307623}, - {-0.351292756085567125601307623}, { 0.936265667170278246576310996}, - { 0.413638312238434547471944324}, { 0.910441292258067196934095369}, - {-0.910441292258067196934095369}, { 0.413638312238434547471944324}, - { 0.730562769227827561177758850}, { 0.682845546385248068164596123}, - {-0.682845546385248068164596123}, { 0.730562769227827561177758850}, - { 0.033741171851377584833716112}, { 0.999430604555461772019008327}, - {-0.999430604555461772019008327}, { 0.033741171851377584833716112}, - { 0.999204758618363895492950001}, { 0.039872927587739811128578738}, - {-0.039872927587739811128578738}, { 0.999204758618363895492950001}, - { 0.678350043129861486873655042}, { 0.734738878095963464563223604}, - {-0.734738878095963464563223604}, { 0.678350043129861486873655042}, - { 0.907886116487666212038681480}, { 0.419216888363223956433010020}, - {-0.419216888363223956433010020}, { 0.907886116487666212038681480}, - { 0.345541324963989065539191723}, { 0.938403534063108112192420774}, - {-0.938403534063108112192420774}, { 0.345541324963989065539191723}, - { 0.972226497078936305708321144}, { 0.234041958583543423191242045}, - {-0.234041958583543423191242045}, { 0.972226497078936305708321144}, - { 0.521975292937154342694258318}, { 0.852960604930363657746588082}, - {-0.852960604930363657746588082}, { 0.521975292937154342694258318}, - { 0.808656181588174991946968128}, { 0.588281548222645304786439813}, - {-0.588281548222645304786439813}, { 0.808656181588174991946968128}, - { 0.155828397654265235743101486}, { 0.987784141644572154230969032}, - {-0.987784141644572154230969032}, { 0.155828397654265235743101486}, - { 0.990485084256457037998682243}, { 0.137620121586486044948441663}, - {-0.137620121586486044948441663}, { 0.990485084256457037998682243}, - { 0.603066598540348201693430617}, { 
0.797690840943391108362662755}, - {-0.797690840943391108362662755}, { 0.603066598540348201693430617}, - { 0.862423956111040538690933878}, { 0.506186645345155291048942344}, - {-0.506186645345155291048942344}, { 0.862423956111040538690933878}, - { 0.251897818154216950498106628}, { 0.967753837093475465243391912}, - {-0.967753837093475465243391912}, { 0.251897818154216950498106628}, - { 0.944604837261480265659265493}, { 0.328209843579092526107916817}, - {-0.328209843579092526107916817}, { 0.944604837261480265659265493}, - { 0.435857079922255491032544080}, { 0.900015892016160228714535267}, - {-0.900015892016160228714535267}, { 0.435857079922255491032544080}, - { 0.747100605980180144323078847}, { 0.664710978203344868130324985}, - {-0.664710978203344868130324985}, { 0.747100605980180144323078847}, - { 0.058258264500435759613979782}, { 0.998301544933892840738782163}, - {-0.998301544933892840738782163}, { 0.058258264500435759613979782}, - { 0.996044700901251989887944810}, { 0.088853552582524596561586535}, - {-0.088853552582524596561586535}, { 0.996044700901251989887944810}, - { 0.641481012808583151988739898}, { 0.767138911935820381181694573}, - {-0.767138911935820381181694573}, { 0.641481012808583151988739898}, - { 0.886222530148880631647990821}, { 0.463259783551860197390719637}, - {-0.463259783551860197390719637}, { 0.886222530148880631647990821}, - { 0.299079826308040476750336973}, { 0.954228095109105629780430732}, - {-0.954228095109105629780430732}, { 0.299079826308040476750336973}, - { 0.959571513081984528335528181}, { 0.281464937925757984095231007}, - {-0.281464937925757984095231007}, { 0.959571513081984528335528181}, - { 0.479493757660153026679839798}, { 0.877545290207261291668470750}, - {-0.877545290207261291668470750}, { 0.479493757660153026679839798}, - { 0.778816512381475953374724325}, { 0.627251815495144113509622565}, - {-0.627251815495144113509622565}, { 0.778816512381475953374724325}, - { 0.107172424956808849175529148}, { 0.994240449453187946358413442}, - 
{-0.994240449453187946358413442}, { 0.107172424956808849175529148}, - { 0.982539302287441255907040396}, { 0.186055151663446648105438304}, - {-0.186055151663446648105438304}, { 0.982539302287441255907040396}, - { 0.563199344013834115007363772}, { 0.826321062845663480311195452}, - {-0.826321062845663480311195452}, { 0.563199344013834115007363772}, - { 0.836547727223511984524285790}, { 0.547894059173100165608820571}, - {-0.547894059173100165608820571}, { 0.836547727223511984524285790}, - { 0.204108966092816874181696950}, { 0.978948175319062194715480124}, - {-0.978948175319062194715480124}, { 0.204108966092816874181696950}, - { 0.927362525650401087274536959}, { 0.374164062971457997104393020}, - {-0.374164062971457997104393020}, { 0.927362525650401087274536959}, - { 0.391170384302253888687512949}, { 0.920318276709110566440076541}, - {-0.920318276709110566440076541}, { 0.391170384302253888687512949}, - { 0.713584868780793592903125099}, { 0.700568793943248366792866380}, - {-0.700568793943248366792866380}, { 0.713584868780793592903125099}, - { 0.009203754782059819315102378}, { 0.999957644551963866333120920}, - {-0.999957644551963866333120920}, { 0.009203754782059819315102378}, - { 0.999957644551963866333120920}, { 0.009203754782059819315102378}, - {-0.009203754782059819315102378}, { 0.999957644551963866333120920}, - { 0.700568793943248366792866380}, { 0.713584868780793592903125099}, - {-0.713584868780793592903125099}, { 0.700568793943248366792866380}, - { 0.920318276709110566440076541}, { 0.391170384302253888687512949}, - {-0.391170384302253888687512949}, { 0.920318276709110566440076541}, - { 0.374164062971457997104393020}, { 0.927362525650401087274536959}, - {-0.927362525650401087274536959}, { 0.374164062971457997104393020}, - { 0.978948175319062194715480124}, { 0.204108966092816874181696950}, - {-0.204108966092816874181696950}, { 0.978948175319062194715480124}, - { 0.547894059173100165608820571}, { 0.836547727223511984524285790}, - {-0.836547727223511984524285790}, { 
0.547894059173100165608820571}, - { 0.826321062845663480311195452}, { 0.563199344013834115007363772}, - {-0.563199344013834115007363772}, { 0.826321062845663480311195452}, - { 0.186055151663446648105438304}, { 0.982539302287441255907040396}, - {-0.982539302287441255907040396}, { 0.186055151663446648105438304}, - { 0.994240449453187946358413442}, { 0.107172424956808849175529148}, - {-0.107172424956808849175529148}, { 0.994240449453187946358413442}, - { 0.627251815495144113509622565}, { 0.778816512381475953374724325}, - {-0.778816512381475953374724325}, { 0.627251815495144113509622565}, - { 0.877545290207261291668470750}, { 0.479493757660153026679839798}, - {-0.479493757660153026679839798}, { 0.877545290207261291668470750}, - { 0.281464937925757984095231007}, { 0.959571513081984528335528181}, - {-0.959571513081984528335528181}, { 0.281464937925757984095231007}, - { 0.954228095109105629780430732}, { 0.299079826308040476750336973}, - {-0.299079826308040476750336973}, { 0.954228095109105629780430732}, - { 0.463259783551860197390719637}, { 0.886222530148880631647990821}, - {-0.886222530148880631647990821}, { 0.463259783551860197390719637}, - { 0.767138911935820381181694573}, { 0.641481012808583151988739898}, - {-0.641481012808583151988739898}, { 0.767138911935820381181694573}, - { 0.088853552582524596561586535}, { 0.996044700901251989887944810}, - {-0.996044700901251989887944810}, { 0.088853552582524596561586535}, - { 0.998301544933892840738782163}, { 0.058258264500435759613979782}, - {-0.058258264500435759613979782}, { 0.998301544933892840738782163}, - { 0.664710978203344868130324985}, { 0.747100605980180144323078847}, - {-0.747100605980180144323078847}, { 0.664710978203344868130324985}, - { 0.900015892016160228714535267}, { 0.435857079922255491032544080}, - {-0.435857079922255491032544080}, { 0.900015892016160228714535267}, - { 0.328209843579092526107916817}, { 0.944604837261480265659265493}, - {-0.944604837261480265659265493}, { 0.328209843579092526107916817}, - { 
0.967753837093475465243391912}, { 0.251897818154216950498106628}, - {-0.251897818154216950498106628}, { 0.967753837093475465243391912}, - { 0.506186645345155291048942344}, { 0.862423956111040538690933878}, - {-0.862423956111040538690933878}, { 0.506186645345155291048942344}, - { 0.797690840943391108362662755}, { 0.603066598540348201693430617}, - {-0.603066598540348201693430617}, { 0.797690840943391108362662755}, - { 0.137620121586486044948441663}, { 0.990485084256457037998682243}, - {-0.990485084256457037998682243}, { 0.137620121586486044948441663}, - { 0.987784141644572154230969032}, { 0.155828397654265235743101486}, - {-0.155828397654265235743101486}, { 0.987784141644572154230969032}, - { 0.588281548222645304786439813}, { 0.808656181588174991946968128}, - {-0.808656181588174991946968128}, { 0.588281548222645304786439813}, - { 0.852960604930363657746588082}, { 0.521975292937154342694258318}, - {-0.521975292937154342694258318}, { 0.852960604930363657746588082}, - { 0.234041958583543423191242045}, { 0.972226497078936305708321144}, - {-0.972226497078936305708321144}, { 0.234041958583543423191242045}, - { 0.938403534063108112192420774}, { 0.345541324963989065539191723}, - {-0.345541324963989065539191723}, { 0.938403534063108112192420774}, - { 0.419216888363223956433010020}, { 0.907886116487666212038681480}, - {-0.907886116487666212038681480}, { 0.419216888363223956433010020}, - { 0.734738878095963464563223604}, { 0.678350043129861486873655042}, - {-0.678350043129861486873655042}, { 0.734738878095963464563223604}, - { 0.039872927587739811128578738}, { 0.999204758618363895492950001}, - {-0.999204758618363895492950001}, { 0.039872927587739811128578738}, - { 0.999430604555461772019008327}, { 0.033741171851377584833716112}, - {-0.033741171851377584833716112}, { 0.999430604555461772019008327}, - { 0.682845546385248068164596123}, { 0.730562769227827561177758850}, - {-0.730562769227827561177758850}, { 0.682845546385248068164596123}, - { 0.910441292258067196934095369}, { 
0.413638312238434547471944324}, - {-0.413638312238434547471944324}, { 0.910441292258067196934095369}, - { 0.351292756085567125601307623}, { 0.936265667170278246576310996}, - {-0.936265667170278246576310996}, { 0.351292756085567125601307623}, - { 0.973644249650811925318383912}, { 0.228072083170885739254457379}, - {-0.228072083170885739254457379}, { 0.973644249650811925318383912}, - { 0.527199134781901348464274575}, { 0.849741768000852489471268395}, - {-0.849741768000852489471268395}, { 0.527199134781901348464274575}, - { 0.812250586585203913049744181}, { 0.583308652937698294392830961}, - {-0.583308652937698294392830961}, { 0.812250586585203913049744181}, - { 0.161886393780111837641387995}, { 0.986809401814185476970235952}, - {-0.986809401814185476970235952}, { 0.161886393780111837641387995}, - { 0.991310859846115418957349799}, { 0.131540028702883111103387493}, - {-0.131540028702883111103387493}, { 0.991310859846115418957349799}, - { 0.607949784967773667243642671}, { 0.793975477554337164895083757}, - {-0.793975477554337164895083757}, { 0.607949784967773667243642671}, - { 0.865513624090569082825488358}, { 0.500885382611240786241285004}, - {-0.500885382611240786241285004}, { 0.865513624090569082825488358}, - { 0.257831102162159005614471295}, { 0.966190003445412555433832961}, - {-0.966190003445412555433832961}, { 0.257831102162159005614471295}, - { 0.946600913083283570044599823}, { 0.322407678801069848384807478}, - {-0.322407678801069848384807478}, { 0.946600913083283570044599823}, - { 0.441371268731716692879988968}, { 0.897324580705418281231391836}, - {-0.897324580705418281231391836}, { 0.441371268731716692879988968}, - { 0.751165131909686411205819422}, { 0.660114342067420478559490747}, - {-0.660114342067420478559490747}, { 0.751165131909686411205819422}, - { 0.064382630929857460819324537}, { 0.997925286198596012623025462}, - {-0.997925286198596012623025462}, { 0.064382630929857460819324537}, - { 0.996571145790554847093566910}, { 0.082740264549375693111987083}, - 
{-0.082740264549375693111987083}, { 0.996571145790554847093566910}, - { 0.646176012983316364832802220}, { 0.763188417263381271704838297}, - {-0.763188417263381271704838297}, { 0.646176012983316364832802220}, - { 0.889048355854664562540777729}, { 0.457813303598877221904961155}, - {-0.457813303598877221904961155}, { 0.889048355854664562540777729}, - { 0.304929229735402406490728633}, { 0.952375012719765858529893608}, - {-0.952375012719765858529893608}, { 0.304929229735402406490728633}, - { 0.961280485811320641748659653}, { 0.275571819310958163076425168}, - {-0.275571819310958163076425168}, { 0.961280485811320641748659653}, - { 0.484869248000791101822951699}, { 0.874586652278176112634431897}, - {-0.874586652278176112634431897}, { 0.484869248000791101822951699}, - { 0.782650596166575738458949301}, { 0.622461279374149972519166721}, - {-0.622461279374149972519166721}, { 0.782650596166575738458949301}, - { 0.113270952177564349018228733}, { 0.993564135520595333782021697}, - {-0.993564135520595333782021697}, { 0.113270952177564349018228733}, - { 0.983662419211730274396237776}, { 0.180022901405699522679906590}, - {-0.180022901405699522679906590}, { 0.983662419211730274396237776}, - { 0.568258952670131549790548489}, { 0.822849781375826332046780034}, - {-0.822849781375826332046780034}, { 0.568258952670131549790548489}, - { 0.839893794195999504583383987}, { 0.542750784864515906586768661}, - {-0.542750784864515906586768661}, { 0.839893794195999504583383987}, - { 0.210111836880469621717489972}, { 0.977677357824509979943404762}, - {-0.977677357824509979943404762}, { 0.210111836880469621717489972}, - { 0.929640895843181265457918066}, { 0.368466829953372331712746222}, - {-0.368466829953372331712746222}, { 0.929640895843181265457918066}, - { 0.396809987416710328595290911}, { 0.917900775621390457642276297}, - {-0.917900775621390457642276297}, { 0.396809987416710328595290911}, - { 0.717870045055731736211325329}, { 0.696177131491462944788582591}, - {-0.696177131491462944788582591}, { 
0.717870045055731736211325329}, - { 0.015339206284988101044151868}, { 0.999882347454212525633049627}, - {-0.999882347454212525633049627}, { 0.015339206284988101044151868}, - { 0.999769405351215321657617036}, { 0.021474080275469507418374898}, - {-0.021474080275469507418374898}, { 0.999769405351215321657617036}, - { 0.691759258364157774906734132}, { 0.722128193929215321243607198}, - {-0.722128193929215321243607198}, { 0.691759258364157774906734132}, - { 0.915448716088267819566431292}, { 0.402434650859418441082533934}, - {-0.402434650859418441082533934}, { 0.915448716088267819566431292}, - { 0.362755724367397216204854462}, { 0.931884265581668106718557199}, - {-0.931884265581668106718557199}, { 0.362755724367397216204854462}, - { 0.976369731330021149312732194}, { 0.216106797076219509948385131}, - {-0.216106797076219509948385131}, { 0.976369731330021149312732194}, - { 0.537587076295645482502214932}, { 0.843208239641845437161743865}, - {-0.843208239641845437161743865}, { 0.537587076295645482502214932}, - { 0.819347520076796960824689637}, { 0.573297166698042212820171239}, - {-0.573297166698042212820171239}, { 0.819347520076796960824689637}, - { 0.173983873387463827950700807}, { 0.984748501801904218556553176}, - {-0.984748501801904218556553176}, { 0.173983873387463827950700807}, - { 0.992850414459865090793563344}, { 0.119365214810991364593637790}, - {-0.119365214810991364593637790}, { 0.992850414459865090793563344}, - { 0.617647307937803932403979402}, { 0.786455213599085757522319464}, - {-0.786455213599085757522319464}, { 0.617647307937803932403979402}, - { 0.871595086655951034842481435}, { 0.490226483288291154229598449}, - {-0.490226483288291154229598449}, { 0.871595086655951034842481435}, - { 0.269668325572915106525464462}, { 0.962953266873683886347921481}, - {-0.962953266873683886347921481}, { 0.269668325572915106525464462}, - { 0.950486073949481721759926101}, { 0.310767152749611495835997250}, - {-0.310767152749611495835997250}, { 0.950486073949481721759926101}, - { 
0.452349587233770874133026703}, { 0.891840709392342727796478697}, - {-0.891840709392342727796478697}, { 0.452349587233770874133026703}, - { 0.759209188978388033485525443}, { 0.650846684996380915068975573}, - {-0.650846684996380915068975573}, { 0.759209188978388033485525443}, - { 0.076623861392031492278332463}, { 0.997060070339482978987989949}, - {-0.997060070339482978987989949}, { 0.076623861392031492278332463}, - { 0.997511456140303459699448390}, { 0.070504573389613863027351471}, - {-0.070504573389613863027351471}, { 0.997511456140303459699448390}, - { 0.655492852999615385312679701}, { 0.755201376896536527598710756}, - {-0.755201376896536527598710756}, { 0.655492852999615385312679701}, - { 0.894599485631382678433072126}, { 0.446868840162374195353044389}, - {-0.446868840162374195353044389}, { 0.894599485631382678433072126}, - { 0.316593375556165867243047035}, { 0.948561349915730288158494826}, - {-0.948561349915730288158494826}, { 0.316593375556165867243047035}, - { 0.964589793289812723836432159}, { 0.263754678974831383611349322}, - {-0.263754678974831383611349322}, { 0.964589793289812723836432159}, - { 0.495565261825772531150266670}, { 0.868570705971340895340449876}, - {-0.868570705971340895340449876}, { 0.495565261825772531150266670}, - { 0.790230221437310055030217152}, { 0.612810082429409703935211936}, - {-0.612810082429409703935211936}, { 0.790230221437310055030217152}, - { 0.125454983411546238542336453}, { 0.992099313142191757112085445}, - {-0.992099313142191757112085445}, { 0.125454983411546238542336453}, - { 0.985797509167567424700995000}, { 0.167938294974731178054745536}, - {-0.167938294974731178054745536}, { 0.985797509167567424700995000}, - { 0.578313796411655563342245019}, { 0.815814410806733789010772660}, - {-0.815814410806733789010772660}, { 0.578313796411655563342245019}, - { 0.846490938774052078300544488}, { 0.532403127877197971442805218}, - {-0.532403127877197971442805218}, { 0.846490938774052078300544488}, - { 0.222093620973203534094094721}, { 
0.975025345066994146844913468}, - {-0.975025345066994146844913468}, { 0.222093620973203534094094721}, - { 0.934092550404258914729877883}, { 0.357030961233430032614954036}, - {-0.357030961233430032614954036}, { 0.934092550404258914729877883}, - { 0.408044162864978680820747499}, { 0.912962190428398164628018233}, - {-0.912962190428398164628018233}, { 0.408044162864978680820747499}, - { 0.726359155084345976817494315}, { 0.687315340891759108199186948}, - {-0.687315340891759108199186948}, { 0.726359155084345976817494315}, - { 0.027608145778965741612354872}, { 0.999618822495178597116830637}, - {-0.999618822495178597116830637}, { 0.027608145778965741612354872}, - { 0.998941293186856850633930266}, { 0.046003182130914628814301788}, - {-0.046003182130914628814301788}, { 0.998941293186856850633930266}, - { 0.673829000378756060917568372}, { 0.738887324460615147933116508}, - {-0.738887324460615147933116508}, { 0.673829000378756060917568372}, - { 0.905296759318118774354048329}, { 0.424779681209108833357226189}, - {-0.424779681209108833357226189}, { 0.905296759318118774354048329}, - { 0.339776884406826857828825803}, { 0.940506070593268323787291309}, - {-0.940506070593268323787291309}, { 0.339776884406826857828825803}, - { 0.970772140728950302138169611}, { 0.240003022448741486568922365}, - {-0.240003022448741486568922365}, { 0.970772140728950302138169611}, - { 0.516731799017649881508753876}, { 0.856147328375194481019630732}, - {-0.856147328375194481019630732}, { 0.516731799017649881508753876}, - { 0.805031331142963597922659282}, { 0.593232295039799808047809426}, - {-0.593232295039799808047809426}, { 0.805031331142963597922659282}, - { 0.149764534677321517229695737}, { 0.988721691960323767604516485}, - {-0.988721691960323767604516485}, { 0.149764534677321517229695737}, - { 0.989622017463200834623694454}, { 0.143695033150294454819773349}, - {-0.143695033150294454819773349}, { 0.989622017463200834623694454}, - { 0.598160706996342311724958652}, { 0.801376171723140219430247777}, - 
{-0.801376171723140219430247777}, { 0.598160706996342311724958652}, - { 0.859301818357008404783582139}, { 0.511468850437970399504391001}, - {-0.511468850437970399504391001}, { 0.859301818357008404783582139}, - { 0.245955050335794611599924709}, { 0.969281235356548486048290738}, - {-0.969281235356548486048290738}, { 0.245955050335794611599924709}, - { 0.942573197601446879280758735}, { 0.333999651442009404650865481}, - {-0.333999651442009404650865481}, { 0.942573197601446879280758735}, - { 0.430326481340082633908199031}, { 0.902673318237258806751502391}, - {-0.902673318237258806751502391}, { 0.430326481340082633908199031}, - { 0.743007952135121693517362293}, { 0.669282588346636065720696366}, - {-0.669282588346636065720696366}, { 0.743007952135121693517362293}, - { 0.052131704680283321236358216}, { 0.998640218180265222418199049}, - {-0.998640218180265222418199049}, { 0.052131704680283321236358216}, - { 0.995480755491926941769171600}, { 0.094963495329638998938034312}, - {-0.094963495329638998938034312}, { 0.995480755491926941769171600}, - { 0.636761861236284230413943435}, { 0.771060524261813773200605759}, - {-0.771060524261813773200605759}, { 0.636761861236284230413943435}, - { 0.883363338665731594736308015}, { 0.468688822035827933697617870}, - {-0.468688822035827933697617870}, { 0.883363338665731594736308015}, - { 0.293219162694258650606608599}, { 0.956045251349996443270479823}, - {-0.956045251349996443270479823}, { 0.293219162694258650606608599}, - { 0.957826413027532890321037029}, { 0.287347459544729526477331841}, - {-0.287347459544729526477331841}, { 0.957826413027532890321037029}, - { 0.474100214650550014398580015}, { 0.880470889052160770806542929}, - {-0.880470889052160770806542929}, { 0.474100214650550014398580015}, - { 0.774953106594873878359129282}, { 0.632018735939809021909403706}, - {-0.632018735939809021909403706}, { 0.774953106594873878359129282}, - { 0.101069862754827824987887585}, { 0.994879330794805620591166107}, - {-0.994879330794805620591166107}, { 
0.101069862754827824987887585}, - { 0.981379193313754574318224190}, { 0.192080397049892441679288205}, - {-0.192080397049892441679288205}, { 0.981379193313754574318224190}, - { 0.558118531220556115693702964}, { 0.829761233794523042469023765}, - {-0.829761233794523042469023765}, { 0.558118531220556115693702964}, - { 0.833170164701913186439915922}, { 0.553016705580027531764226988}, - {-0.553016705580027531764226988}, { 0.833170164701913186439915922}, - { 0.198098410717953586179324918}, { 0.980182135968117392690210009}, - {-0.980182135968117392690210009}, { 0.198098410717953586179324918}, - { 0.925049240782677590302371869}, { 0.379847208924051170576281147}, - {-0.379847208924051170576281147}, { 0.925049240782677590302371869}, - { 0.385516053843918864075607949}, { 0.922701128333878570437264227}, - {-0.922701128333878570437264227}, { 0.385516053843918864075607949}, - { 0.709272826438865651316533772}, { 0.704934080375904908852523758}, - {-0.704934080375904908852523758}, { 0.709272826438865651316533772}, - { 0.003067956762965976270145365}, { 0.999995293809576171511580126}, - {-0.999995293809576171511580126}, { 0.003067956762965976270145365} -}; - -const fpr fpr_p2_tab[] = { - { 2.00000000000 }, - { 1.00000000000 }, - { 0.50000000000 }, - { 0.25000000000 }, - { 0.12500000000 }, - { 0.06250000000 }, - { 0.03125000000 }, - { 0.01562500000 }, - { 0.00781250000 }, - { 0.00390625000 }, - { 0.00195312500 } -}; - -#else // yyyFPNATIVE+0 yyyFPEMU+0 - -#error No FP implementation selected - -#endif // yyyFPNATIVE- yyyFPEMU- diff --git a/crypto_sign/falcon-512/m4-ct/fpr.h b/crypto_sign/falcon-512/m4-ct/fpr.h deleted file mode 100644 index 8176212d..00000000 --- a/crypto_sign/falcon-512/m4-ct/fpr.h +++ /dev/null @@ -1,893 +0,0 @@ -/* - * Floating-point operations. 
- * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#if FALCON_FPEMU // yyyFPEMU+1 yyyFPNATIVE+0 - -/* ====================================================================== */ -/* - * Custom floating-point implementation with integer arithmetics. We - * use IEEE-754 "binary64" format, with some simplifications: - * - * - Top bit is s = 1 for negative, 0 for positive. - * - * - Exponent e uses the next 11 bits (bits 52 to 62, inclusive). - * - * - Mantissa m uses the 52 low bits. - * - * Encoded value is, in general: (-1)^s * 2^(e-1023) * (1 + m*2^(-52)) - * i.e. 
the mantissa really is a 53-bit number (less than 2.0, but not - * less than 1.0), but the top bit (equal to 1 by definition) is omitted - * in the encoding. - * - * In IEEE-754, there are some special values: - * - * - If e = 2047, then the value is either an infinite (m = 0) or - * a NaN (m != 0). - * - * - If e = 0, then the value is either a zero (m = 0) or a subnormal, - * aka "denormalized number" (m != 0). - * - * Of these, we only need the zeros. The caller is responsible for not - * providing operands that would lead to infinites, NaNs or subnormals. - * If inputs are such that values go out of range, then indeterminate - * values are returned (it would still be deterministic, but no specific - * value may be relied upon). - * - * At the C level, the three parts are stored in a 64-bit unsigned - * word. - * - * One may note that a property of the IEEE-754 format is that order - * is preserved for positive values: if two positive floating-point - * values x and y are such that x < y, then their respective encodings - * as _signed_ 64-bit integers i64(x) and i64(y) will be such that - * i64(x) < i64(y). For negative values, order is reversed: if x < 0, - * y < 0, and x < y, then ia64(x) > ia64(y). - * - * IMPORTANT ASSUMPTIONS: - * ====================== - * - * For proper computations, and constant-time behaviour, we assume the - * following: - * - * - 32x32->64 multiplication (unsigned) has an execution time that - * is independent of its operands. This is true of most modern - * x86 and ARM cores. Notable exceptions are the ARM Cortex M0, M0+ - * and M3 (in the M0 and M0+, this is done in software, so it depends - * on that routine), and the PowerPC cores from the G3/G4 lines. - * For more info, see: https://www.bearssl.org/ctmul.html - * - * - Left-shifts and right-shifts of 32-bit values have an execution - * time which does not depend on the shifted value nor on the - * shift count. 
An historical exception is the Pentium IV, but most - * modern CPU have barrel shifters. Some small microcontrollers - * might have varying-time shifts (not the ARM Cortex M*, though). - * - * - Right-shift of a signed negative value performs a sign extension. - * As per the C standard, this operation returns an - * implementation-defined result (this is NOT an "undefined - * behaviour"). On most/all systems, an arithmetic shift is - * performed, because this is what makes most sense. - */ - -/* - * Normally we should declare the 'fpr' type to be a struct or union - * around the internal 64-bit value; however, we want to use the - * direct 64-bit integer type to enable a lighter call convention on - * ARM platforms. This means that direct (invalid) use of operators - * such as '*' or '+' will not be caught by the compiler. We rely on - * the "normal" (non-emulated) code to detect such instances. - */ -typedef uint64_t fpr; - -/* - * For computations, we split values into an integral mantissa in the - * 2^54..2^55 range, and an (adjusted) exponent. The lowest bit is - * "sticky" (it is set to 1 if any of the bits below it is 1); when - * re-encoding, the low two bits are dropped, but may induce an - * increment in the value for proper rounding. - */ - -/* - * Right-shift a 64-bit unsigned value by a possibly secret shift count. - * We assumed that the underlying architecture had a barrel shifter for - * 32-bit shifts, but for 64-bit shifts on a 32-bit system, this will - * typically invoke a software routine that is not necessarily - * constant-time; hence the function below. - * - * Shift count n MUST be in the 0..63 range. - */ -static inline uint64_t -fpr_ursh(uint64_t x, int n) -{ - x ^= (x ^ (x >> 32)) & -(uint64_t)(n >> 5); - return x >> (n & 31); -} - -/* - * Right-shift a 64-bit signed value by a possibly secret shift count - * (see fpr_ursh() for the rationale). - * - * Shift count n MUST be in the 0..63 range. 
- */ -static inline int64_t -fpr_irsh(int64_t x, int n) -{ - x ^= (x ^ (x >> 32)) & -(int64_t)(n >> 5); - return x >> (n & 31); -} - -/* - * Left-shift a 64-bit unsigned value by a possibly secret shift count - * (see fpr_ursh() for the rationale). - * - * Shift count n MUST be in the 0..63 range. - */ -static inline uint64_t -fpr_ulsh(uint64_t x, int n) -{ - x ^= (x ^ (x << 32)) & -(uint64_t)(n >> 5); - return x << (n & 31); -} - -/* - * Expectations: - * s = 0 or 1 - * exponent e is "arbitrary" and unbiased - * 2^54 <= m < 2^55 - * Numerical value is (-1)^2 * m * 2^e - * - * Exponents which are too low lead to value zero. If the exponent is - * too large, the returned value is indeterminate. - * - * If m = 0, then a zero is returned (using the provided sign). - * If e < -1076, then a zero is returned (regardless of the value of m). - * If e >= -1076 and e != 0, m must be within the expected range - * (2^54 to 2^55-1). - */ -static inline fpr -FPR(int s, int e, uint64_t m) -{ - fpr x; - uint32_t t; - unsigned f; - - /* - * If e >= -1076, then the value is "normal"; otherwise, it - * should be a subnormal, which we clamp down to zero. - */ - e += 1076; - t = (uint32_t)e >> 31; - m &= (uint64_t)t - 1; - - /* - * If m = 0 then we want a zero; make e = 0 too, but conserve - * the sign. - */ - t = (uint32_t)(m >> 54); - e &= -(int)t; - - /* - * The 52 mantissa bits come from m. Value m has its top bit set - * (unless it is a zero); we leave it "as is": the top bit will - * increment the exponent by 1, except when m = 0, which is - * exactly what we want. - */ - x = (((uint64_t)s << 63) | (m >> 2)) + ((uint64_t)(uint32_t)e << 52); - - /* - * Rounding: if the low three bits of m are 011, 110 or 111, - * then the value should be incremented to get the next - * representable value. This implements the usual - * round-to-nearest rule (with preference to even values in case - * of a tie). 
Note that the increment may make a carry spill - * into the exponent field, which is again exactly what we want - * in that case. - */ - f = (unsigned)m & 7U; - x += (0xC8U >> f) & 1; - return x; -} - -#define fpr_scaled Zf(fpr_scaled) -fpr fpr_scaled(int64_t i, int sc); - -static inline fpr -fpr_of(int64_t i) -{ - return fpr_scaled(i, 0); -} - -static const fpr fpr_q = 4667981563525332992; -static const fpr fpr_inverse_of_q = 4545632735260551042; -static const fpr fpr_inv_2sqrsigma0 = 4594603506513722306; -static const fpr fpr_inv_sigma = 4573359825155195350; -static const fpr fpr_sigma_min_9 = 4608495221497168882; -static const fpr fpr_sigma_min_10 = 4608586345619182117; -static const fpr fpr_log2 = 4604418534313441775; -static const fpr fpr_inv_log2 = 4609176140021203710; -static const fpr fpr_bnorm_max = 4670353323383631276; -static const fpr fpr_zero = 0; -static const fpr fpr_one = 4607182418800017408; -static const fpr fpr_two = 4611686018427387904; -static const fpr fpr_onehalf = 4602678819172646912; -static const fpr fpr_invsqrt2 = 4604544271217802189; -static const fpr fpr_invsqrt8 = 4600040671590431693; -static const fpr fpr_ptwo31 = 4746794007248502784; -static const fpr fpr_ptwo31m1 = 4746794007244308480; -static const fpr fpr_mtwo31m1 = 13970166044099084288U; -static const fpr fpr_ptwo63m1 = 4890909195324358656; -static const fpr fpr_mtwo63m1 = 14114281232179134464U; -static const fpr fpr_ptwo63 = 4890909195324358656; - -static inline int64_t -fpr_rint(fpr x) -{ - uint64_t m, d; - int e; - uint32_t s, dd, f; - - /* - * We assume that the value fits in -(2^63-1)..+(2^63-1). We can - * thus extract the mantissa as a 63-bit integer, then right-shift - * it as needed. - */ - m = ((x << 10) | ((uint64_t)1 << 62)) & (((uint64_t)1 << 63) - 1); - e = 1085 - ((int)(x >> 52) & 0x7FF); - - /* - * If a shift of more than 63 bits is needed, then simply set m - * to zero. This also covers the case of an input operand equal - * to zero. 
- */ - m &= -(uint64_t)((uint32_t)(e - 64) >> 31); - e &= 63; - - /* - * Right-shift m as needed. Shift count is e. Proper rounding - * mandates that: - * - If the highest dropped bit is zero, then round low. - * - If the highest dropped bit is one, and at least one of the - * other dropped bits is one, then round up. - * - If the highest dropped bit is one, and all other dropped - * bits are zero, then round up if the lowest kept bit is 1, - * or low otherwise (i.e. ties are broken by "rounding to even"). - * - * We thus first extract a word consisting of all the dropped bit - * AND the lowest kept bit; then we shrink it down to three bits, - * the lowest being "sticky". - */ - d = fpr_ulsh(m, 63 - e); - dd = (uint32_t)d | ((uint32_t)(d >> 32) & 0x1FFFFFFF); - f = (uint32_t)(d >> 61) | ((dd | -dd) >> 31); - m = fpr_ursh(m, e) + (uint64_t)((0xC8U >> f) & 1U); - - /* - * Apply the sign bit. - */ - s = (uint32_t)(x >> 63); - return ((int64_t)m ^ -(int64_t)s) + (int64_t)s; -} - -static inline int64_t -fpr_floor(fpr x) -{ - uint64_t t; - int64_t xi; - int e, cc; - - /* - * We extract the integer as a _signed_ 64-bit integer with - * a scaling factor. Since we assume that the value fits - * in the -(2^63-1)..+(2^63-1) range, we can left-shift the - * absolute value to make it in the 2^62..2^63-1 range: we - * will only need a right-shift afterwards. - */ - e = (int)(x >> 52) & 0x7FF; - t = x >> 63; - xi = (int64_t)(((x << 10) | ((uint64_t)1 << 62)) - & (((uint64_t)1 << 63) - 1)); - xi = (xi ^ -(int64_t)t) + (int64_t)t; - cc = 1085 - e; - - /* - * We perform an arithmetic right-shift on the value. This - * applies floor() semantics on both positive and negative values - * (rounding toward minus infinity). - */ - xi = fpr_irsh(xi, cc & 63); - - /* - * If the true shift count was 64 or more, then we should instead - * replace xi with 0 (if nonnegative) or -1 (if negative). 
Edge - * case: -0 will be floored to -1, not 0 (whether this is correct - * is debatable; in any case, the other functions normalize zero - * to +0). - * - * For an input of zero, the non-shifted xi was incorrect (we used - * a top implicit bit of value 1, not 0), but this does not matter - * since this operation will clamp it down. - */ - xi ^= (xi ^ -(int64_t)t) & -(int64_t)((uint32_t)(63 - cc) >> 31); - return xi; -} - -static inline int64_t -fpr_trunc(fpr x) -{ - uint64_t t, xu; - int e, cc; - - /* - * Extract the absolute value. Since we assume that the value - * fits in the -(2^63-1)..+(2^63-1) range, we can left-shift - * the absolute value into the 2^62..2^63-1 range, and then - * do a right shift afterwards. - */ - e = (int)(x >> 52) & 0x7FF; - xu = ((x << 10) | ((uint64_t)1 << 62)) & (((uint64_t)1 << 63) - 1); - cc = 1085 - e; - xu = fpr_ursh(xu, cc & 63); - - /* - * If the exponent is too low (cc > 63), then the shift was wrong - * and we must clamp the value to 0. This also covers the case - * of an input equal to zero. - */ - xu &= -(uint64_t)((uint32_t)(cc - 64) >> 31); - - /* - * Apply back the sign, if the source value is negative. - */ - t = x >> 63; - xu = (xu ^ -t) + t; - return *(int64_t *)&xu; -} - -#define fpr_add Zf(fpr_add) -fpr fpr_add(fpr x, fpr y); - -static inline fpr -fpr_sub(fpr x, fpr y) -{ - y ^= (uint64_t)1 << 63; - return fpr_add(x, y); -} - -static inline fpr -fpr_neg(fpr x) -{ - x ^= (uint64_t)1 << 63; - return x; -} - -static inline fpr -fpr_half(fpr x) -{ - /* - * To divide a value by 2, we just have to subtract 1 from its - * exponent, but we have to take care of zero. - */ - uint32_t t; - - x -= (uint64_t)1 << 52; - t = (((uint32_t)(x >> 52) & 0x7FF) + 1) >> 11; - x &= (uint64_t)t - 1; - return x; -} - -static inline fpr -fpr_double(fpr x) -{ - /* - * To double a value, we just increment by one the exponent. We - * don't care about infinites or NaNs; however, 0 is a - * special case. 
- */ - x += (uint64_t)((((unsigned)(x >> 52) & 0x7FFU) + 0x7FFU) >> 11) << 52; - return x; -} - -#define fpr_mul Zf(fpr_mul) -fpr fpr_mul(fpr x, fpr y); - -static inline fpr -fpr_sqr(fpr x) -{ - return fpr_mul(x, x); -} - -#define fpr_div Zf(fpr_div) -fpr fpr_div(fpr x, fpr y); - -static inline fpr -fpr_inv(fpr x) -{ - return fpr_div(4607182418800017408u, x); -} - -#define fpr_sqrt Zf(fpr_sqrt) -fpr fpr_sqrt(fpr x); - -static inline int -fpr_lt(fpr x, fpr y) -{ - /* - * If x >= 0 or y >= 0, a signed comparison yields the proper - * result: - * - For positive values, the order is preserved. - * - The sign bit is at the same place as in integers, so - * sign is preserved. - * - * If both x and y are negative, then the order is reversed. - * We cannot simply invert the comparison result in that case - * because it would not handle the edge case x = y properly. - */ - int cc0, cc1; - - cc0 = *(int64_t *)&x < *(int64_t *)&y; - cc1 = *(int64_t *)&x > *(int64_t *)&y; - return cc0 ^ ((cc0 ^ cc1) & (int)((x & y) >> 63)); -} - -/* - * Compute exp(x) for x such that |x| <= ln 2. We want a precision of 50 - * bits or so. - */ -#define fpr_expm_p63 Zf(fpr_expm_p63) -uint64_t fpr_expm_p63(fpr x, fpr ccs); - -#define fpr_gm_tab Zf(fpr_gm_tab) -extern const fpr fpr_gm_tab[]; - -#define fpr_p2_tab Zf(fpr_p2_tab) -extern const fpr fpr_p2_tab[]; - -/* ====================================================================== */ - -#elif FALCON_FPNATIVE // yyyFPEMU+0 yyyFPNATIVE+1 - -/* ====================================================================== */ - -#include - -/* - * We wrap the native 'double' type into a structure so that the C compiler - * complains if we inadvertently use raw arithmetic operators on the 'fpr' - * type instead of using the inline functions below. This should have no - * extra runtime cost, since all the functions below are 'inline'. 
- */ -typedef struct { double v; } fpr; - -static inline fpr -FPR(double v) -{ - fpr x; - - x.v = v; - return x; -} - -static inline fpr -fpr_of(int64_t i) -{ - return FPR((double)i); -} - -static const fpr fpr_q = { 12289.0 }; -static const fpr fpr_inverse_of_q = { 1.0 / 12289.0 }; -static const fpr fpr_inv_2sqrsigma0 = { .150865048875372721532312163019 }; -static const fpr fpr_inv_sigma = { .005819826392951607426919370871 }; -static const fpr fpr_sigma_min_9 = { 1.291500756233514568549480827642 }; -static const fpr fpr_sigma_min_10 = { 1.311734375905083682667395805765 }; -static const fpr fpr_log2 = { 0.69314718055994530941723212146 }; -static const fpr fpr_inv_log2 = { 1.4426950408889634073599246810 }; -static const fpr fpr_bnorm_max = { 16822.4121 }; -static const fpr fpr_zero = { 0.0 }; -static const fpr fpr_one = { 1.0 }; -static const fpr fpr_two = { 2.0 }; -static const fpr fpr_onehalf = { 0.5 }; -static const fpr fpr_invsqrt2 = { 0.707106781186547524400844362105 }; -static const fpr fpr_invsqrt8 = { 0.353553390593273762200422181052 }; -static const fpr fpr_ptwo31 = { 2147483648.0 }; -static const fpr fpr_ptwo31m1 = { 2147483647.0 }; -static const fpr fpr_mtwo31m1 = { -2147483647.0 }; -static const fpr fpr_ptwo63m1 = { 9223372036854775807.0 }; -static const fpr fpr_mtwo63m1 = { -9223372036854775807.0 }; -static const fpr fpr_ptwo63 = { 9223372036854775808.0 }; - -static inline int64_t -fpr_rint(fpr x) -{ - /* - * We do not want to use llrint() since it might be not - * constant-time. - * - * Suppose that x >= 0. If x >= 2^52, then it is already an - * integer. Otherwise, if x < 2^52, then computing x+2^52 will - * yield a value that will be rounded to the nearest integer - * with exactly the right rules (round-to-nearest-even). - * - * In order to have constant-time processing, we must do the - * computation for both x >= 0 and x < 0 cases, and use a - * cast to an integer to access the sign and select the proper - * value. 
Such casts also allow us to find out if |x| < 2^52. - */ - int64_t sx, tx, rp, rn, m; - uint32_t ub; - - sx = (int64_t)(x.v - 1.0); - tx = (int64_t)x.v; - rp = (int64_t)(x.v + 4503599627370496.0) - 4503599627370496; - rn = (int64_t)(x.v - 4503599627370496.0) + 4503599627370496; - - /* - * If tx >= 2^52 or tx < -2^52, then result is tx. - * Otherwise, if sx >= 0, then result is rp. - * Otherwise, result is rn. We use the fact that when x is - * close to 0 (|x| <= 0.25) then both rp and rn are correct; - * and if x is not close to 0, then trunc(x-1.0) yields the - * appropriate sign. - */ - - /* - * Clamp rp to zero if tx < 0. - * Clamp rn to zero if tx >= 0. - */ - m = sx >> 63; - rn &= m; - rp &= ~m; - - /* - * Get the 12 upper bits of tx; if they are not all zeros or - * all ones, then tx >= 2^52 or tx < -2^52, and we clamp both - * rp and rn to zero. Otherwise, we clamp tx to zero. - */ - ub = (uint32_t)((uint64_t)tx >> 52); - m = -(int64_t)((((ub + 1) & 0xFFF) - 2) >> 31); - rp &= m; - rn &= m; - tx &= ~m; - - /* - * Only one of tx, rn or rp (at most) can be non-zero at this - * point. - */ - return tx | rn | rp; -} - -static inline int64_t -fpr_floor(fpr x) -{ - int64_t r; - - /* - * The cast performs a trunc() (rounding toward 0) and thus is - * wrong by 1 for most negative values. The correction below is - * constant-time as long as the compiler turns the - * floating-point conversion result into a 0/1 integer without a - * conditional branch or another non-constant-time construction. - * This should hold on all modern architectures with an FPU (and - * if it is false on a given arch, then chances are that the FPU - * itself is not constant-time, making the point moot). 
- */ - r = (int64_t)x.v; - return r - (x.v < (double)r); -} - -static inline int64_t -fpr_trunc(fpr x) -{ - return (int64_t)x.v; -} - -static inline fpr -fpr_add(fpr x, fpr y) -{ - return FPR(x.v + y.v); -} - -static inline fpr -fpr_sub(fpr x, fpr y) -{ - return FPR(x.v - y.v); -} - -static inline fpr -fpr_neg(fpr x) -{ - return FPR(-x.v); -} - -static inline fpr -fpr_half(fpr x) -{ - return FPR(x.v * 0.5); -} - -static inline fpr -fpr_double(fpr x) -{ - return FPR(x.v + x.v); -} - -static inline fpr -fpr_mul(fpr x, fpr y) -{ - return FPR(x.v * y.v); -} - -static inline fpr -fpr_sqr(fpr x) -{ - return FPR(x.v * x.v); -} - -static inline fpr -fpr_inv(fpr x) -{ - return FPR(1.0 / x.v); -} - -static inline fpr -fpr_div(fpr x, fpr y) -{ - return FPR(x.v / y.v); -} - -#if FALCON_AVX2 // yyyAVX2+1 -TARGET_AVX2 -static inline void -fpr_sqrt_avx2(double *t) -{ - __m128d x; - - x = _mm_load1_pd(t); - x = _mm_sqrt_pd(x); - _mm_storel_pd(t, x); -} -#endif // yyyAVX2- - -static inline fpr -fpr_sqrt(fpr x) -{ - /* - * We prefer not to have a dependency on libm when it can be - * avoided. On x86, calling the sqrt() libm function inlines - * the relevant opcode (fsqrt or sqrtsd, depending on whether - * the 387 FPU or SSE2 is used for floating-point operations) - * but then makes an optional call to the library function - * for proper error handling, in case the operand is negative. - * - * To avoid this dependency, we use intrinsics or inline assembly - * on recognized platforms: - * - * - If AVX2 is explicitly enabled, then we use SSE2 intrinsics. - * - * - On GCC/Clang with SSE maths, we use SSE2 intrinsics. - * - * - On GCC/Clang on i386, or MSVC on i386, we use inline assembly - * to call the 387 FPU fsqrt opcode. - * - * - On GCC/Clang/XLC on PowerPC, we use inline assembly to call - * the fsqrt opcode (Clang needs a special hack). - * - * - On GCC/Clang on ARM with hardware floating-point, we use - * inline assembly to call the vqsrt.f64 opcode. 
Due to a - * complex ecosystem of compilers and assembly syntaxes, we - * have to call it "fsqrt" or "fsqrtd", depending on case. - * - * If the platform is not recognized, a call to the system - * library function sqrt() is performed. On some compilers, this - * may actually inline the relevant opcode, and call the library - * function only when the input is invalid (e.g. negative); - * Falcon never actually calls sqrt() on a negative value, but - * the dependency to libm will still be there. - */ - -#if FALCON_AVX2 // yyyAVX2+1 - fpr_sqrt_avx2(&x.v); - return x; -#else // yyyAVX2+0 -#if defined __GNUC__ && defined __SSE2_MATH__ - return FPR(_mm_cvtsd_f64(_mm_sqrt_pd(_mm_set1_pd(x.v)))); -#elif defined __GNUC__ && defined __i386__ - __asm__ __volatile__ ( - "fldl %0\n\t" - "fsqrt\n\t" - "fstpl %0\n\t" - : "+m" (x.v) : : ); - return x; -#elif defined _M_IX86 - __asm { - fld x.v - fsqrt - fstp x.v - } - return x; -#elif defined __PPC__ && defined __GNUC__ - fpr y; - -#if defined __clang__ - /* - * Normally we should use a 'd' constraint (register that contains - * a 'double' value) but Clang 3.8.1 chokes on it. Instead we use - * an 'f' constraint, counting on the fact that 'float' values - * are managed in double-precision registers anyway, and the - * compiler will not add extra rounding steps. - */ - __asm__ ( "fsqrt %0, %1" : "=f" (y.v) : "f" (x.v) : ); -#else - __asm__ ( "fsqrt %0, %1" : "=d" (y.v) : "d" (x.v) : ); -#endif - return y; -#elif (defined __ARM_FP && ((__ARM_FP & 0x08) == 0x08)) \ - || (!defined __ARM_FP && defined __ARM_VFPV2__) - /* - * On ARM, assembly syntaxes are a bit of a mess, depending on - * whether GCC or Clang is used, and the binutils version, and - * whether this is 32-bit or 64-bit mode. 
The code below appears - * to work on: - * 32-bit GCC-4.9.2 Clang-3.5 Binutils-2.25 - * 64-bit GCC-6.3.0 Clang-3.9 Binutils-2.28 - */ -#if defined __aarch64__ && __aarch64__ - __asm__ ( "fsqrt %d0, %d0" : "+w" (x.v) : : ); -#else - __asm__ ( "fsqrtd %P0, %P0" : "+w" (x.v) : : ); -#endif - return x; -#else - return FPR(sqrt(x.v)); -#endif -#endif // yyyAVX2- -} - -static inline int -fpr_lt(fpr x, fpr y) -{ - return x.v < y.v; -} - -TARGET_AVX2 -static inline uint64_t -fpr_expm_p63(fpr x, fpr ccs) -{ - /* - * Polynomial approximation of exp(-x) is taken from FACCT: - * https://eprint.iacr.org/2018/1234 - * Specifically, values are extracted from the implementation - * referenced from the FACCT article, and available at: - * https://github.com/raykzhao/gaussian - * Tests over more than 24 billions of random inputs in the - * 0..log(2) range have never shown a deviation larger than - * 2^(-50) from the true mathematical value. - */ - -#if FALCON_AVX2 // yyyAVX2+1 - - /* - * AVX2 implementation uses more operations than Horner's method, - * but with a lower expression tree depth. This helps because - * additions and multiplications have a latency of 4 cycles on - * a Skylake, but the CPU can issue two of them per cycle. 
- */ - - static const union { - double d[12]; - __m256d v[3]; - } c = { - { - 0.999999999999994892974086724280, - 0.500000000000019206858326015208, - 0.166666666666984014666397229121, - 0.041666666666110491190622155955, - 0.008333333327800835146903501993, - 0.001388888894063186997887560103, - 0.000198412739277311890541063977, - 0.000024801566833585381209939524, - 0.000002755586350219122514855659, - 0.000000275607356160477811864927, - 0.000000025299506379442070029551, - 0.000000002073772366009083061987 - } - }; - - double d1, d2, d4, d8, y; - __m256d d14, d58, d9c; - - d1 = -x.v; - d2 = d1 * d1; - d4 = d2 * d2; - d8 = d4 * d4; - d14 = _mm256_set_pd(d4, d2 * d1, d2, d1); - d58 = _mm256_mul_pd(d14, _mm256_set1_pd(d4)); - d9c = _mm256_mul_pd(d14, _mm256_set1_pd(d8)); - d14 = _mm256_mul_pd(d14, _mm256_loadu_pd(&c.d[0])); - d58 = FMADD(d58, _mm256_loadu_pd(&c.d[4]), d14); - d9c = FMADD(d9c, _mm256_loadu_pd(&c.d[8]), d58); - d9c = _mm256_hadd_pd(d9c, d9c); - y = 1.0 + _mm_cvtsd_f64(_mm256_castpd256_pd128(d9c)) // _mm256_cvtsd_f64(d9c) - + _mm_cvtsd_f64(_mm256_extractf128_pd(d9c, 1)); - y *= ccs.v; - - /* - * Final conversion goes through int64_t first, because that's what - * the underlying opcode (vcvttsd2si) will do, and we know that the - * result will fit, since x >= 0 and ccs < 1. If we did the - * conversion directly to uint64_t, then the compiler would add some - * extra code to cover the case of a source value of 2^63 or more, - * and though the alternate path would never be exercised, the - * extra comparison would cost us some cycles. - */ - return (uint64_t)(int64_t)(y * fpr_ptwo63.v); - -#else // yyyAVX2+0 - - /* - * Normal implementation uses Horner's method, which minimizes - * the number of operations. 
- */ - - double d, y; - - d = x.v; - y = 0.000000002073772366009083061987; - y = 0.000000025299506379442070029551 - y * d; - y = 0.000000275607356160477811864927 - y * d; - y = 0.000002755586350219122514855659 - y * d; - y = 0.000024801566833585381209939524 - y * d; - y = 0.000198412739277311890541063977 - y * d; - y = 0.001388888894063186997887560103 - y * d; - y = 0.008333333327800835146903501993 - y * d; - y = 0.041666666666110491190622155955 - y * d; - y = 0.166666666666984014666397229121 - y * d; - y = 0.500000000000019206858326015208 - y * d; - y = 0.999999999999994892974086724280 - y * d; - y = 1.000000000000000000000000000000 - y * d; - y *= ccs.v; - return (uint64_t)(y * fpr_ptwo63.v); - -#endif // yyyAVX2- -} - -#define fpr_gm_tab Zf(fpr_gm_tab) -extern const fpr fpr_gm_tab[]; - -#define fpr_p2_tab Zf(fpr_p2_tab) -extern const fpr fpr_p2_tab[]; - -/* ====================================================================== */ - -#else // yyyFPEMU+0 yyyFPNATIVE+0 - -#error No FP implementation selected - -#endif // yyyFPEMU- yyyFPNATIVE- diff --git a/crypto_sign/falcon-512/m4-ct/inner.h b/crypto_sign/falcon-512/m4-ct/inner.h deleted file mode 100644 index 1f7d0819..00000000 --- a/crypto_sign/falcon-512/m4-ct/inner.h +++ /dev/null @@ -1,1168 +0,0 @@ -#ifndef FALCON_INNER_H__ -#define FALCON_INNER_H__ - -/* - * Internal functions for Falcon. This is not the API intended to be - * used by applications; instead, this internal API provides all the - * primitives on which wrappers build to provide external APIs. 
- * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -/* - * IMPORTANT API RULES - * ------------------- - * - * This API has some non-trivial usage rules: - * - * - * - All public functions (i.e. the non-static ones) must be referenced - * with the Zf() macro (e.g. Zf(verify_raw) for the verify_raw() - * function). That macro adds a prefix to the name, which is - * configurable with the FALCON_PREFIX macro. This allows compiling - * the code into a specific "namespace" and potentially including - * several versions of this code into a single application (e.g. to - * have an AVX2 and a non-AVX2 variants and select the one to use at - * runtime based on availability of AVX2 opcodes). 
- * - * - Functions that need temporary buffers expects them as a final - * tmp[] array of type uint8_t*, with a size which is documented for - * each function. However, most have some alignment requirements, - * because they will use the array to store 16-bit, 32-bit or 64-bit - * values (e.g. uint64_t or double). The caller must ensure proper - * alignment. What happens on unaligned access depends on the - * underlying architecture, ranging from a slight time penalty - * to immediate termination of the process. - * - * - Some functions rely on specific rounding rules and precision for - * floating-point numbers. On some systems (in particular 32-bit x86 - * with the 387 FPU), this requires setting an hardware control - * word. The caller MUST use set_fpu_cw() to ensure proper precision: - * - * oldcw = set_fpu_cw(2); - * Zf(sign_dyn)(...); - * set_fpu_cw(oldcw); - * - * On systems where the native floating-point precision is already - * proper, or integer-based emulation is used, the set_fpu_cw() - * function does nothing, so it can be called systematically. - */ - -// yyyPQCLEAN+0 yyyNIST+0 yyySUPERCOP+0 -#include "config.h" -// yyyPQCLEAN- yyyNIST- yyySUPERCOP- -// yyySUPERCOP+1 -// yyyCONF* -// yyySUPERCOP- - -#include -#include -#include - -#if defined FALCON_AVX2 && FALCON_AVX2 // yyyAVX2+1 -/* - * This implementation uses AVX2 and optionally FMA intrinsics. 
- */ -#include -#ifndef FALCON_LE -#define FALCON_LE 1 -#endif -#ifndef FALCON_UNALIGNED -#define FALCON_UNALIGNED 1 -#endif -#if defined __GNUC__ -#if defined FALCON_FMA && FALCON_FMA -#define TARGET_AVX2 __attribute__((target("avx2,fma"))) -#else -#define TARGET_AVX2 __attribute__((target("avx2"))) -#endif -#elif defined _MSC_VER && _MSC_VER -#pragma warning( disable : 4752 ) -#endif -#if defined FALCON_FMA && FALCON_FMA -#define FMADD(a, b, c) _mm256_fmadd_pd(a, b, c) -#define FMSUB(a, b, c) _mm256_fmsub_pd(a, b, c) -#else -#define FMADD(a, b, c) _mm256_add_pd(_mm256_mul_pd(a, b), c) -#define FMSUB(a, b, c) _mm256_sub_pd(_mm256_mul_pd(a, b), c) -#endif -#endif // yyyAVX2- - -// yyyNIST+0 yyyPQCLEAN+0 -/* - * On MSVC, disable warning about applying unary minus on an unsigned - * type: this is perfectly defined standard behaviour and we do it - * quite often. - */ -#if defined _MSC_VER && _MSC_VER -#pragma warning( disable : 4146 ) -#endif - -// yyySUPERCOP+0 -/* - * Enable ARM assembly on any ARMv7m platform (if it was not done before). 
- */ -#ifndef FALCON_ASM_CORTEXM4 -#if (defined __ARM_ARCH_7EM__ && __ARM_ARCH_7EM__) \ - && (defined __ARM_FEATURE_DSP && __ARM_FEATURE_DSP) -#define FALCON_ASM_CORTEXM4 1 -#else -#define FALCON_ASM_CORTEXM4 0 -#endif -#endif -// yyySUPERCOP- - -#if defined __i386__ || defined _M_IX86 \ - || defined __x86_64__ || defined _M_X64 || \ - (defined _ARCH_PWR8 && \ - (defined __LITTLE_ENDIAN || defined __LITTLE_ENDIAN__)) - -#ifndef FALCON_LE -#define FALCON_LE 1 -#endif -#ifndef FALCON_UNALIGNED -#define FALCON_UNALIGNED 1 -#endif - -#elif defined FALCON_ASM_CORTEXM4 && FALCON_ASM_CORTEXM4 - -#ifndef FALCON_LE -#define FALCON_LE 1 -#endif -#ifndef FALCON_UNALIGNED -#define FALCON_UNALIGNED 0 -#endif - -#elif (defined __LITTLE_ENDIAN__ && __LITTLE_ENDIAN__) \ - || (defined __BYTE_ORDER__ && defined __ORDER_LITTLE_ENDIAN__ \ - && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) - -#ifndef FALCON_LE -#define FALCON_LE 1 -#endif -#ifndef FALCON_UNALIGNED -#define FALCON_UNALIGNED 0 -#endif - -#else - -#ifndef FALCON_LE -#define FALCON_LE 0 -#endif -#ifndef FALCON_UNALIGNED -#define FALCON_UNALIGNED 0 -#endif - -#endif - -/* - * We ensure that both FALCON_FPEMU and FALCON_FPNATIVE are defined, - * with compatible values (exactly one of them must be non-zero). - * If none is defined, then default FP implementation is 'native' - * except on ARM Cortex M4. 
- */ -#if !defined FALCON_FPEMU && !defined FALCON_FPNATIVE - -#if (defined __ARM_FP && ((__ARM_FP & 0x08) == 0x08)) \ - || (!defined __ARM_FP && defined __ARM_VFPV2__) -#define FALCON_FPEMU 0 -#define FALCON_FPNATIVE 1 -#elif defined FALCON_ASM_CORTEXM4 && FALCON_ASM_CORTEXM4 -#define FALCON_FPEMU 1 -#define FALCON_FPNATIVE 0 -#else -#define FALCON_FPEMU 0 -#define FALCON_FPNATIVE 1 -#endif - -#elif defined FALCON_FPEMU && !defined FALCON_FPNATIVE - -#if FALCON_FPEMU -#define FALCON_FPNATIVE 0 -#else -#define FALCON_FPNATIVE 1 -#endif - -#elif defined FALCON_FPNATIVE && !defined FALCON_FPEMU - -#if FALCON_FPNATIVE -#define FALCON_FPEMU 0 -#else -#define FALCON_FPEMU 1 -#endif - -#endif - -#if (FALCON_FPEMU && FALCON_FPNATIVE) || (!FALCON_FPEMU && !FALCON_FPNATIVE) -#error Exactly one of FALCON_FPEMU and FALCON_FPNATIVE must be selected -#endif - -// yyySUPERCOP+0 -/* - * For seed generation from the operating system: - * - On Linux and glibc-2.25+, FreeBSD 12+ and OpenBSD, use getentropy(). - * - On Unix-like systems, use /dev/urandom (including as a fallback - * for failed getentropy() calls). - * - On Windows, use CryptGenRandom(). 
- */ - -#ifndef FALCON_RAND_GETENTROPY -#if (defined __linux__ && defined __GLIBC__ \ - && (__GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 25))) \ - || (defined __FreeBSD__ && __FreeBSD__ >= 12) \ - || defined __OpenBSD__ -#define FALCON_RAND_GETENTROPY 1 -#else -#define FALCON_RAND_GETENTROPY 0 -#endif -#endif - -#ifndef FALCON_RAND_URANDOM -#if defined _AIX \ - || defined __ANDROID__ \ - || defined __FreeBSD__ \ - || defined __NetBSD__ \ - || defined __OpenBSD__ \ - || defined __DragonFly__ \ - || defined __linux__ \ - || (defined __sun && (defined __SVR4 || defined __svr4__)) \ - || (defined __APPLE__ && defined __MACH__) -#define FALCON_RAND_URANDOM 1 -#else -#define FALCON_RAND_URANDOM 0 -#endif -#endif - -#ifndef FALCON_RAND_WIN32 -#if defined _WIN32 || defined _WIN64 -#define FALCON_RAND_WIN32 1 -#else -#define FALCON_RAND_WIN32 0 -#endif -#endif -// yyySUPERCOP- - -/* - * For still undefined compile-time macros, define them to 0 to avoid - * warnings with -Wundef. - */ -#ifndef FALCON_AVX2 -#define FALCON_AVX2 0 -#endif -#ifndef FALCON_FMA -#define FALCON_FMA 0 -#endif -#ifndef FALCON_KG_CHACHA20 -#define FALCON_KG_CHACHA20 0 -#endif -// yyyNIST- yyyPQCLEAN- - -// yyyPQCLEAN+0 yyySUPERCOP+0 -/* - * "Naming" macro used to apply a consistent prefix over all global - * symbols. - */ -#ifndef FALCON_PREFIX -#define FALCON_PREFIX falcon_inner -#endif -#define Zf(name) Zf_(FALCON_PREFIX, name) -#define Zf_(prefix, name) Zf__(prefix, name) -#define Zf__(prefix, name) prefix ## _ ## name -// yyyPQCLEAN- yyySUPERCOP- - -// yyyAVX2+1 -/* - * We use the TARGET_AVX2 macro to tag some functions which, in some - * configurations, may use AVX2 and FMA intrinsics; this depends on - * the compiler. In all other cases, we just define it to emptiness - * (i.e. it will have no effect). 
- */ -#ifndef TARGET_AVX2 -#define TARGET_AVX2 -#endif -// yyyAVX2- - -/* - * Some computations with floating-point elements, in particular - * rounding to the nearest integer, rely on operations using _exactly_ - * the precision of IEEE-754 binary64 type (i.e. 52 bits). On 32-bit - * x86, the 387 FPU may be used (depending on the target OS) and, in - * that case, may use more precision bits (i.e. 64 bits, for an 80-bit - * total type length); to prevent miscomputations, we define an explicit - * function that modifies the precision in the FPU control word. - * - * set_fpu_cw() sets the precision to the provided value, and returns - * the previously set precision; callers are supposed to restore the - * previous precision on exit. The correct (52-bit) precision is - * configured with the value "2". On unsupported compilers, or on - * targets other than 32-bit x86, or when the native 'double' type is - * not used, the set_fpu_cw() function does nothing at all. - */ -#if FALCON_FPNATIVE // yyyFPNATIVE+1 -#if defined __GNUC__ && defined __i386__ -static inline unsigned -set_fpu_cw(unsigned x) -{ - unsigned short t; - unsigned old; - - __asm__ __volatile__ ("fstcw %0" : "=m" (t) : : ); - old = (t & 0x0300u) >> 8; - t = (unsigned short)((t & ~0x0300u) | (x << 8)); - __asm__ __volatile__ ("fldcw %0" : : "m" (t) : ); - return old; -} -#elif defined _M_IX86 -static inline unsigned -set_fpu_cw(unsigned x) -{ - unsigned short t; - unsigned old; - - __asm { fstcw t } - old = (t & 0x0300u) >> 8; - t = (unsigned short)((t & ~0x0300u) | (x << 8)); - __asm { fldcw t } - return old; -} -#else -static inline unsigned -set_fpu_cw(unsigned x) -{ - return x; -} -#endif -#else // yyyFPNATIVE+0 -static inline unsigned -set_fpu_cw(unsigned x) -{ - return x; -} -#endif // yyyFPNATIVE- - -#if FALCON_FPNATIVE && !FALCON_AVX2 // yyyFPNATIVE+1 yyyAVX2+0 -/* - * If using the native 'double' type but not AVX2 code, on an x86 - * machine with SSE2 activated for maths, then we will use the - * 
SSE2 intrinsics. - */ -#if defined __GNUC__ && defined __SSE2_MATH__ -#include -#endif -#endif // yyyFPNATIVE- yyyAVX2- - -#if FALCON_FPNATIVE // yyyFPNATIVE+1 -/* - * For optimal reproducibility of values, we need to disable contraction - * of floating-point expressions; otherwise, on some architectures (e.g. - * PowerPC), the compiler may generate fused-multiply-add opcodes that - * may round differently than two successive separate opcodes. C99 defines - * a standard pragma for that, but GCC-6.2.2 appears to ignore it, - * hence the GCC-specific pragma (that Clang does not support). - */ -#if defined __clang__ -#pragma STDC FP_CONTRACT OFF -#elif defined __GNUC__ -#pragma GCC optimize ("fp-contract=off") -#endif -#endif // yyyFPNATIVE- - -// yyyPQCLEAN+0 -/* - * MSVC 2015 does not know the C99 keyword 'restrict'. - */ -#if defined _MSC_VER && _MSC_VER -#ifndef restrict -#define restrict __restrict -#endif -#endif -// yyyPQCLEAN- - -/* ==================================================================== */ -/* - * SHAKE256 implementation (shake.c). - * - * API is defined to be easily replaced with the fips202.h API defined - * as part of PQClean. 
- */ - -// yyyPQCLEAN+0 -/* -typedef struct { - union { - uint64_t A[25]; - uint8_t dbuf[200]; - } st; - uint64_t dptr; -} inner_shake256_context; - -#define inner_shake256_init Zf(i_shake256_init) -#define inner_shake256_inject Zf(i_shake256_inject) -#define inner_shake256_flip Zf(i_shake256_flip) -#define inner_shake256_extract Zf(i_shake256_extract) - -void Zf(i_shake256_init)( - inner_shake256_context *sc); -void Zf(i_shake256_inject)( - inner_shake256_context *sc, const uint8_t *in, size_t len); -void Zf(i_shake256_flip)( - inner_shake256_context *sc); -void Zf(i_shake256_extract)( - inner_shake256_context *sc, uint8_t *out, size_t len); -*/ - -// yyyPQCLEAN+1 - -#include "fips202.h" - -#define inner_shake256_context shake256incctx -#define inner_shake256_init(sc) shake256_inc_init(sc) -#define inner_shake256_inject(sc, in, len) shake256_inc_absorb(sc, in, len) -#define inner_shake256_flip(sc) shake256_inc_finalize(sc) -#define inner_shake256_extract(sc, out, len) shake256_inc_squeeze(out, len, sc) - -// yyyPQCLEAN+0 - -// yyyPQCLEAN- - -/* ==================================================================== */ -/* - * Encoding/decoding functions (codec.c). - * - * Encoding functions take as parameters an output buffer (out) with - * a given maximum length (max_out_len); returned value is the actual - * number of bytes which have been written. If the output buffer is - * not large enough, then 0 is returned (some bytes may have been - * written to the buffer). If 'out' is NULL, then 'max_out_len' is - * ignored; instead, the function computes and returns the actual - * required output length (in bytes). - * - * Decoding functions take as parameters an input buffer (in) with - * its maximum length (max_in_len); returned value is the actual number - * of bytes that have been read from the buffer. If the provided length - * is too short, then 0 is returned. - * - * Values to encode or decode are vectors of integers, with N = 2^logn - * elements. 
- * - * Three encoding formats are defined: - * - * - modq: sequence of values modulo 12289, each encoded over exactly - * 14 bits. The encoder and decoder verify that integers are within - * the valid range (0..12288). Values are arrays of uint16. - * - * - trim: sequence of signed integers, a specified number of bits - * each. The number of bits is provided as parameter and includes - * the sign bit. Each integer x must be such that |x| < 2^(bits-1) - * (which means that the -2^(bits-1) value is forbidden); encode and - * decode functions check that property. Values are arrays of - * int16_t or int8_t, corresponding to names 'trim_i16' and - * 'trim_i8', respectively. - * - * - comp: variable-length encoding for signed integers; each integer - * uses a minimum of 9 bits, possibly more. This is normally used - * only for signatures. - * - */ - -size_t Zf(modq_encode)(void *out, size_t max_out_len, - const uint16_t *x, unsigned logn); -size_t Zf(trim_i16_encode)(void *out, size_t max_out_len, - const int16_t *x, unsigned logn, unsigned bits); -size_t Zf(trim_i8_encode)(void *out, size_t max_out_len, - const int8_t *x, unsigned logn, unsigned bits); -size_t Zf(comp_encode)(void *out, size_t max_out_len, - const int16_t *x, unsigned logn); - -size_t Zf(modq_decode)(uint16_t *x, unsigned logn, - const void *in, size_t max_in_len); -size_t Zf(trim_i16_decode)(int16_t *x, unsigned logn, unsigned bits, - const void *in, size_t max_in_len); -size_t Zf(trim_i8_decode)(int8_t *x, unsigned logn, unsigned bits, - const void *in, size_t max_in_len); -size_t Zf(comp_decode)(int16_t *x, unsigned logn, - const void *in, size_t max_in_len); - -/* - * Number of bits for key elements, indexed by logn (1 to 10). This - * is at most 8 bits for all degrees, but some degrees may have shorter - * elements. - */ -extern const uint8_t Zf(max_fg_bits)[]; -extern const uint8_t Zf(max_FG_bits)[]; - -/* - * Maximum size, in bits, of elements in a signature, indexed by logn - * (1 to 10). 
The size includes the sign bit. - */ -extern const uint8_t Zf(max_sig_bits)[]; - -/* ==================================================================== */ -/* - * Support functions used for both signature generation and signature - * verification (common.c). - */ - -/* - * From a SHAKE256 context (must be already flipped), produce a new - * point. This is the non-constant-time version, which may leak enough - * information to serve as a stop condition on a brute force attack on - * the hashed message (provided that the nonce value is known). - */ -void Zf(hash_to_point_vartime)(inner_shake256_context *sc, - uint16_t *x, unsigned logn); - -/* - * From a SHAKE256 context (must be already flipped), produce a new - * point. The temporary buffer (tmp) must have room for 2*2^logn bytes. - * This function is constant-time but is typically more expensive than - * Zf(hash_to_point_vartime)(). - * - * tmp[] must have 16-bit alignment. - */ -void Zf(hash_to_point_ct)(inner_shake256_context *sc, - uint16_t *x, unsigned logn, uint8_t *tmp); - -/* - * Tell whether a given vector (2N coordinates, in two halves) is - * acceptable as a signature. This compares the appropriate norm of the - * vector with the acceptance bound. Returned value is 1 on success - * (vector is short enough to be acceptable), 0 otherwise. - */ -int Zf(is_short)(const int16_t *s1, const int16_t *s2, unsigned logn); - -/* - * Tell whether a given vector (2N coordinates, in two halves) is - * acceptable as a signature. Instead of the first half s1, this - * function receives the "saturated squared norm" of s1, i.e. the - * sum of the squares of the coordinates of s1 (saturated at 2^32-1 - * if the sum exceeds 2^31-1). - * - * Returned value is 1 on success (vector is short enough to be - * acceptable), 0 otherwise. 
- */ -int Zf(is_short_half)(uint32_t sqn, const int16_t *s2, unsigned logn); - -/* ==================================================================== */ -/* - * Signature verification functions (vrfy.c). - */ - -/* - * Convert a public key to NTT + Montgomery format. Conversion is done - * in place. - */ -void Zf(to_ntt_monty)(uint16_t *h, unsigned logn); - -/* - * Internal signature verification code: - * c0[] contains the hashed nonce+message - * s2[] is the decoded signature - * h[] contains the public key, in NTT + Montgomery format - * logn is the degree log - * tmp[] temporary, must have at least 2*2^logn bytes - * Returned value is 1 on success, 0 on error. - * - * tmp[] must have 16-bit alignment. - */ -int Zf(verify_raw)(const uint16_t *c0, const int16_t *s2, - const uint16_t *h, unsigned logn, uint8_t *tmp); - -/* - * Compute the public key h[], given the private key elements f[] and - * g[]. This computes h = g/f mod phi mod q, where phi is the polynomial - * modulus. This function returns 1 on success, 0 on error (an error is - * reported if f is not invertible mod phi mod q). - * - * The tmp[] array must have room for at least 2*2^logn elements. - * tmp[] must have 16-bit alignment. - */ -int Zf(compute_public)(uint16_t *h, - const int8_t *f, const int8_t *g, unsigned logn, uint8_t *tmp); - -/* - * Recompute the fourth private key element. Private key consists in - * four polynomials with small coefficients f, g, F and G, which are - * such that fG - gF = q mod phi; furthermore, f is invertible modulo - * phi and modulo q. This function recomputes G from f, g and F. - * - * The tmp[] array must have room for at least 4*2^logn bytes. - * - * Returned value is 1 in success, 0 on error (f not invertible). - * tmp[] must have 16-bit alignment. - */ -int Zf(complete_private)(int8_t *G, - const int8_t *f, const int8_t *g, const int8_t *F, - unsigned logn, uint8_t *tmp); - -/* - * Test whether a given polynomial is invertible modulo phi and q. 
- * Polynomial coefficients are small integers. - * - * tmp[] must have 16-bit alignment. - */ -int Zf(is_invertible)( - const int16_t *s2, unsigned logn, uint8_t *tmp); - -/* - * Count the number of elements of value zero in the NTT representation - * of the given polynomial: this is the number of primitive 2n-th roots - * of unity (modulo q = 12289) that are roots of the provided polynomial - * (taken modulo q). - * - * tmp[] must have 16-bit alignment. - */ -int Zf(count_nttzero)(const int16_t *sig, unsigned logn, uint8_t *tmp); - -/* - * Internal signature verification with public key recovery: - * h[] receives the public key (NOT in NTT/Montgomery format) - * c0[] contains the hashed nonce+message - * s1[] is the first signature half - * s2[] is the second signature half - * logn is the degree log - * tmp[] temporary, must have at least 2*2^logn bytes - * Returned value is 1 on success, 0 on error. Success is returned if - * the signature is a short enough vector; in that case, the public - * key has been written to h[]. However, the caller must still - * verify that h[] is the correct value (e.g. with regards to a known - * hash of the public key). - * - * h[] may not overlap with any of the other arrays. - * - * tmp[] must have 16-bit alignment. - */ -int Zf(verify_recover)(uint16_t *h, - const uint16_t *c0, const int16_t *s1, const int16_t *s2, - unsigned logn, uint8_t *tmp); - -/* ==================================================================== */ -/* - * Implementation of floating-point real numbers (fpr.h, fpr.c). - */ - -/* - * Real numbers are implemented by an extra header file, included below. - * This is meant to support pluggable implementations. The default - * implementation relies on the C type 'double'. 
- * - * The included file must define the following types, functions and - * constants: - * - * fpr - * type for a real number - * - * fpr fpr_of(int64_t i) - * cast an integer into a real number; source must be in the - * -(2^63-1)..+(2^63-1) range - * - * fpr fpr_scaled(int64_t i, int sc) - * compute i*2^sc as a real number; source 'i' must be in the - * -(2^63-1)..+(2^63-1) range - * - * fpr fpr_ldexp(fpr x, int e) - * compute x*2^e - * - * int64_t fpr_rint(fpr x) - * round x to the nearest integer; x must be in the -(2^63-1) - * to +(2^63-1) range - * - * int64_t fpr_trunc(fpr x) - * round to an integer; this rounds towards zero; value must - * be in the -(2^63-1) to +(2^63-1) range - * - * fpr fpr_add(fpr x, fpr y) - * compute x + y - * - * fpr fpr_sub(fpr x, fpr y) - * compute x - y - * - * fpr fpr_neg(fpr x) - * compute -x - * - * fpr fpr_half(fpr x) - * compute x/2 - * - * fpr fpr_double(fpr x) - * compute x*2 - * - * fpr fpr_mul(fpr x, fpr y) - * compute x * y - * - * fpr fpr_sqr(fpr x) - * compute x * x - * - * fpr fpr_inv(fpr x) - * compute 1/x - * - * fpr fpr_div(fpr x, fpr y) - * compute x/y - * - * fpr fpr_sqrt(fpr x) - * compute the square root of x - * - * int fpr_lt(fpr x, fpr y) - * return 1 if x < y, 0 otherwise - * - * uint64_t fpr_expm_p63(fpr x) - * return exp(x), assuming that 0 <= x < log(2). Returned value - * is scaled to 63 bits (i.e. it really returns 2^63*exp(-x), - * rounded to the nearest integer). Computation should have a - * precision of at least 45 bits. 
- * - * const fpr fpr_gm_tab[] - * array of constants for FFT / iFFT - * - * const fpr fpr_p2_tab[] - * precomputed powers of 2 (by index, 0 to 10) - * - * Constants of type 'fpr': - * - * fpr fpr_q 12289 - * fpr fpr_inverse_of_q 1/12289 - * fpr fpr_inv_2sqrsigma0 1/(2*(1.8205^2)) - * fpr fpr_inv_sigma 1/(1.55*sqrt(12289)) - * fpr fpr_sigma_min_9 1.291500756233514568549480827642 - * fpr fpr_sigma_min_10 1.311734375905083682667395805765 - * fpr fpr_log2 log(2) - * fpr fpr_inv_log2 1/log(2) - * fpr fpr_bnorm_max 16822.4121 - * fpr fpr_zero 0 - * fpr fpr_one 1 - * fpr fpr_two 2 - * fpr fpr_onehalf 0.5 - * fpr fpr_ptwo31 2^31 - * fpr fpr_ptwo31m1 2^31-1 - * fpr fpr_mtwo31m1 -(2^31-1) - * fpr fpr_ptwo63m1 2^63-1 - * fpr fpr_mtwo63m1 -(2^63-1) - * fpr fpr_ptwo63 2^63 - */ -#include "fpr.h" - -/* ==================================================================== */ -/* - * RNG (rng.c). - * - * A PRNG based on ChaCha20 is implemented; it is seeded from a SHAKE256 - * context (flipped) and is used for bulk pseudorandom generation. - * A system-dependent seed generator is also provided. - */ - -/* - * Obtain a random seed from the system RNG. - * - * Returned value is 1 on success, 0 on error. - */ -int Zf(get_seed)(void *seed, size_t seed_len); - -/* - * Structure for a PRNG. This includes a large buffer so that values - * get generated in advance. The 'state' is used to keep the current - * PRNG algorithm state (contents depend on the selected algorithm). - * - * The unions with 'dummy_u64' are there to ensure proper alignment for - * 64-bit direct access. - */ -typedef struct { - union { - uint8_t d[512]; /* MUST be 512, exactly */ - uint64_t dummy_u64; - } buf; - size_t ptr; - union { - uint8_t d[256]; - uint64_t dummy_u64; - } state; - int type; -} prng; - -/* - * Instantiate a PRNG. That PRNG will feed over the provided SHAKE256 - * context (in "flipped" state) to obtain its initial state. 
- */ -void Zf(prng_init)(prng *p, inner_shake256_context *src); - -/* - * Refill the PRNG buffer. This is normally invoked automatically, and - * is declared here only so that prng_get_u64() may be inlined. - */ -void Zf(prng_refill)(prng *p); - -/* - * Get some bytes from a PRNG. - */ -void Zf(prng_get_bytes)(prng *p, void *dst, size_t len); - -/* - * Get a 64-bit random value from a PRNG. - */ -static inline uint64_t -prng_get_u64(prng *p) -{ - size_t u; - - /* - * If there are less than 9 bytes in the buffer, we refill it. - * This means that we may drop the last few bytes, but this allows - * for faster extraction code. Also, it means that we never leave - * an empty buffer. - */ - u = p->ptr; - if (u >= (sizeof p->buf.d) - 9) { - Zf(prng_refill)(p); - u = 0; - } - p->ptr = u + 8; - - /* - * On systems that use little-endian encoding and allow - * unaligned accesses, we can simply read the data where it is. - */ -#if FALCON_LE && FALCON_UNALIGNED // yyyLEU+1 - return *(uint64_t *)(p->buf.d + u); -#else // yyyLEU+0 - return (uint64_t)p->buf.d[u + 0] - | ((uint64_t)p->buf.d[u + 1] << 8) - | ((uint64_t)p->buf.d[u + 2] << 16) - | ((uint64_t)p->buf.d[u + 3] << 24) - | ((uint64_t)p->buf.d[u + 4] << 32) - | ((uint64_t)p->buf.d[u + 5] << 40) - | ((uint64_t)p->buf.d[u + 6] << 48) - | ((uint64_t)p->buf.d[u + 7] << 56); -#endif // yyyLEU- -} - -/* - * Get an 8-bit random value from a PRNG. - */ -static inline unsigned -prng_get_u8(prng *p) -{ - unsigned v; - - v = p->buf.d[p->ptr ++]; - if (p->ptr == sizeof p->buf.d) { - Zf(prng_refill)(p); - } - return v; -} - -/* ==================================================================== */ -/* - * FFT (falcon-fft.c). - * - * A real polynomial is represented as an array of N 'fpr' elements. - * The FFT representation of a real polynomial contains N/2 complex - * elements; each is stored as two real numbers, for the real and - * imaginary parts, respectively. See falcon-fft.c for details on the - * internal representation. 
- */ - -/* - * Compute FFT in-place: the source array should contain a real - * polynomial (N coefficients); its storage area is reused to store - * the FFT representation of that polynomial (N/2 complex numbers). - * - * 'logn' MUST lie between 1 and 10 (inclusive). - */ -void Zf(FFT)(fpr *f, unsigned logn); - -/* - * Compute the inverse FFT in-place: the source array should contain the - * FFT representation of a real polynomial (N/2 elements); the resulting - * real polynomial (N coefficients of type 'fpr') is written over the - * array. - * - * 'logn' MUST lie between 1 and 10 (inclusive). - */ -void Zf(iFFT)(fpr *f, unsigned logn); - -/* - * Add polynomial b to polynomial a. a and b MUST NOT overlap. This - * function works in both normal and FFT representations. - */ -void Zf(poly_add)(fpr *restrict a, const fpr *restrict b, unsigned logn); - -/* - * Subtract polynomial b from polynomial a. a and b MUST NOT overlap. This - * function works in both normal and FFT representations. - */ -void Zf(poly_sub)(fpr *restrict a, const fpr *restrict b, unsigned logn); - -/* - * Negate polynomial a. This function works in both normal and FFT - * representations. - */ -void Zf(poly_neg)(fpr *a, unsigned logn); - -/* - * Compute adjoint of polynomial a. This function works only in FFT - * representation. - */ -void Zf(poly_adj_fft)(fpr *a, unsigned logn); - -/* - * Multiply polynomial a with polynomial b. a and b MUST NOT overlap. - * This function works only in FFT representation. - */ -void Zf(poly_mul_fft)(fpr *restrict a, const fpr *restrict b, unsigned logn); - -/* - * Multiply polynomial a with the adjoint of polynomial b. a and b MUST NOT - * overlap. This function works only in FFT representation. - */ -void Zf(poly_muladj_fft)(fpr *restrict a, const fpr *restrict b, unsigned logn); - -/* - * Multiply polynomial with its own adjoint. This function works only in FFT - * representation. 
- */ -void Zf(poly_mulselfadj_fft)(fpr *a, unsigned logn); - -/* - * Multiply polynomial with a real constant. This function works in both - * normal and FFT representations. - */ -void Zf(poly_mulconst)(fpr *a, fpr x, unsigned logn); - -/* - * Divide polynomial a by polynomial b, modulo X^N+1 (FFT representation). - * a and b MUST NOT overlap. - */ -void Zf(poly_div_fft)(fpr *restrict a, const fpr *restrict b, unsigned logn); - -/* - * Given f and g (in FFT representation), compute 1/(f*adj(f)+g*adj(g)) - * (also in FFT representation). Since the result is auto-adjoint, all its - * coordinates in FFT representation are real; as such, only the first N/2 - * values of d[] are filled (the imaginary parts are skipped). - * - * Array d MUST NOT overlap with either a or b. - */ -void Zf(poly_invnorm2_fft)(fpr *restrict d, - const fpr *restrict a, const fpr *restrict b, unsigned logn); - -/* - * Given F, G, f and g (in FFT representation), compute F*adj(f)+G*adj(g) - * (also in FFT representation). Destination d MUST NOT overlap with - * any of the source arrays. - */ -void Zf(poly_add_muladj_fft)(fpr *restrict d, - const fpr *restrict F, const fpr *restrict G, - const fpr *restrict f, const fpr *restrict g, unsigned logn); - -/* - * Multiply polynomial a by polynomial b, where b is autoadjoint. Both - * a and b are in FFT representation. Since b is autoadjoint, all its - * FFT coefficients are real, and the array b contains only N/2 elements. - * a and b MUST NOT overlap. - */ -void Zf(poly_mul_autoadj_fft)(fpr *restrict a, - const fpr *restrict b, unsigned logn); - -/* - * Divide polynomial a by polynomial b, where b is autoadjoint. Both - * a and b are in FFT representation. Since b is autoadjoint, all its - * FFT coefficients are real, and the array b contains only N/2 elements. - * a and b MUST NOT overlap. 
- */ -void Zf(poly_div_autoadj_fft)(fpr *restrict a, - const fpr *restrict b, unsigned logn); - -/* - * Perform an LDL decomposition of an auto-adjoint matrix G, in FFT - * representation. On input, g00, g01 and g11 are provided (where the - * matrix G = [[g00, g01], [adj(g01), g11]]). On output, the d00, l10 - * and d11 values are written in g00, g01 and g11, respectively - * (with D = [[d00, 0], [0, d11]] and L = [[1, 0], [l10, 1]]). - * (In fact, d00 = g00, so the g00 operand is left unmodified.) - */ -void Zf(poly_LDL_fft)(const fpr *restrict g00, - fpr *restrict g01, fpr *restrict g11, unsigned logn); - -/* - * Perform an LDL decomposition of an auto-adjoint matrix G, in FFT - * representation. This is identical to poly_LDL_fft() except that - * g00, g01 and g11 are unmodified; the outputs d11 and l10 are written - * in two other separate buffers provided as extra parameters. - */ -void Zf(poly_LDLmv_fft)(fpr *restrict d11, fpr *restrict l10, - const fpr *restrict g00, const fpr *restrict g01, - const fpr *restrict g11, unsigned logn); - -/* - * Apply "split" operation on a polynomial in FFT representation: - * f = f0(x^2) + x*f1(x^2), for half-size polynomials f0 and f1 - * (polynomials modulo X^(N/2)+1). f0, f1 and f MUST NOT overlap. - */ -void Zf(poly_split_fft)(fpr *restrict f0, fpr *restrict f1, - const fpr *restrict f, unsigned logn); - -/* - * Apply "merge" operation on two polynomials in FFT representation: - * given f0 and f1, polynomials moduo X^(N/2)+1, this function computes - * f = f0(x^2) + x*f1(x^2), in FFT representation modulo X^N+1. - * f MUST NOT overlap with either f0 or f1. - */ -void Zf(poly_merge_fft)(fpr *restrict f, - const fpr *restrict f0, const fpr *restrict f1, unsigned logn); - -/* ==================================================================== */ -/* - * Key pair generation. - */ - -/* - * Required sizes of the temporary buffer (in bytes). 
- * - * This size is 28*2^logn bytes, except for degrees 2 and 4 (logn = 1 - * or 2) where it is slightly greater. - */ -#define FALCON_KEYGEN_TEMP_1 136 -#define FALCON_KEYGEN_TEMP_2 272 -#define FALCON_KEYGEN_TEMP_3 224 -#define FALCON_KEYGEN_TEMP_4 448 -#define FALCON_KEYGEN_TEMP_5 896 -#define FALCON_KEYGEN_TEMP_6 1792 -#define FALCON_KEYGEN_TEMP_7 3584 -#define FALCON_KEYGEN_TEMP_8 7168 -#define FALCON_KEYGEN_TEMP_9 14336 -#define FALCON_KEYGEN_TEMP_10 28672 - -/* - * Generate a new key pair. Randomness is extracted from the provided - * SHAKE256 context, which must have already been seeded and flipped. - * The tmp[] array must have suitable size (see FALCON_KEYGEN_TEMP_* - * macros) and be aligned for the uint32_t, uint64_t and fpr types. - * - * The private key elements are written in f, g, F and G, and the - * public key is written in h. Either or both of G and h may be NULL, - * in which case the corresponding element is not returned (they can - * be recomputed from f, g and F). - * - * tmp[] must have 64-bit alignment. - * This function uses floating-point rounding (see set_fpu_cw()). - */ -void Zf(keygen)(inner_shake256_context *rng, - int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h, - unsigned logn, uint8_t *tmp); - -/* ==================================================================== */ -/* - * Signature generation. - */ - -/* - * Expand a private key into the B0 matrix in FFT representation and - * the LDL tree. All the values are written in 'expanded_key', for - * a total of (8*logn+40)*2^logn bytes. - * - * The tmp[] array must have room for at least 48*2^logn bytes. - * - * tmp[] must have 64-bit alignment. - * This function uses floating-point rounding (see set_fpu_cw()). 
- */ -void Zf(expand_privkey)(fpr *restrict expanded_key, - const int8_t *f, const int8_t *g, const int8_t *F, const int8_t *G, - unsigned logn, uint8_t *restrict tmp); - -/* - * Compute a signature over the provided hashed message (hm); the - * signature value is one short vector. This function uses an - * expanded key (as generated by Zf(expand_privkey)()). - * - * The sig[] and hm[] buffers may overlap. - * - * On successful output, the start of the tmp[] buffer contains the s1 - * vector (as int16_t elements). - * - * The minimal size (in bytes) of tmp[] is 48*2^logn bytes. - * - * tmp[] must have 64-bit alignment. - * This function uses floating-point rounding (see set_fpu_cw()). - */ -void Zf(sign_tree)(int16_t *sig, inner_shake256_context *rng, - const fpr *restrict expanded_key, - const uint16_t *hm, unsigned logn, uint8_t *tmp); - -/* - * Compute a signature over the provided hashed message (hm); the - * signature value is one short vector. This function uses a raw - * key and dynamically recompute the B0 matrix and LDL tree; this - * saves RAM since there is no needed for an expanded key, but - * increases the signature cost. - * - * The sig[] and hm[] buffers may overlap. - * - * On successful output, the start of the tmp[] buffer contains the s1 - * vector (as int16_t elements). - * - * The minimal size (in bytes) of tmp[] is 72*2^logn bytes. - * - * tmp[] must have 64-bit alignment. - * This function uses floating-point rounding (see set_fpu_cw()). - */ -void Zf(sign_dyn)(int16_t *sig, inner_shake256_context *rng, - const int8_t *restrict f, const int8_t *restrict g, - const int8_t *restrict F, const int8_t *restrict G, - const uint16_t *hm, unsigned logn, uint8_t *tmp); - -/* - * Internal sampler engine. Exported for tests. - * - * sampler_context wraps around a source of random numbers (PRNG) and - * the sigma_min value (nominally dependent on the degree). 
- * - * sampler() takes as parameters: - * ctx pointer to the sampler_context structure - * mu center for the distribution - * isigma inverse of the distribution standard deviation - * It returns an integer sampled along the Gaussian distribution centered - * on mu and of standard deviation sigma = 1/isigma. - * - * gaussian0_sampler() takes as parameter a pointer to a PRNG, and - * returns an integer sampled along a half-Gaussian with standard - * deviation sigma0 = 1.8205 (center is 0, returned value is - * nonnegative). - */ - -typedef struct { - prng p; - fpr sigma_min; -} sampler_context; - -TARGET_AVX2 -int Zf(sampler)(void *ctx, fpr mu, fpr isigma); - -TARGET_AVX2 -int Zf(gaussian0_sampler)(prng *p); - -/* ==================================================================== */ - -#endif diff --git a/crypto_sign/falcon-512/m4-ct/keygen.c b/crypto_sign/falcon-512/m4-ct/keygen.c deleted file mode 100644 index cf7de008..00000000 --- a/crypto_sign/falcon-512/m4-ct/keygen.c +++ /dev/null @@ -1,4301 +0,0 @@ -/* - * Falcon key pair generation. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include "inner.h" - -#define MKN(logn) ((size_t)1 << (logn)) - -/* ==================================================================== */ -/* - * Modular arithmetics. - * - * We implement a few functions for computing modulo a small integer p. - * - * All functions require that 2^30 < p < 2^31. Moreover, operands must - * be in the 0..p-1 range. - * - * Modular addition and subtraction work for all such p. - * - * Montgomery multiplication requires that p is odd, and must be provided - * with an additional value p0i = -1/p mod 2^31. See below for some basics - * on Montgomery multiplication. - * - * Division computes an inverse modulo p by an exponentiation (with - * exponent p-2): this works only if p is prime. Multiplication - * requirements also apply, i.e. p must be odd and p0i must be provided. - * - * The NTT and inverse NTT need all of the above, and also that - * p = 1 mod 2048. - * - * ----------------------------------------------------------------------- - * - * We use Montgomery representation with 31-bit values: - * - * Let R = 2^31 mod p. When 2^30 < p < 2^31, R = 2^31 - p. - * Montgomery representation of an integer x modulo p is x*R mod p. - * - * Montgomery multiplication computes (x*y)/R mod p for - * operands x and y. 
Therefore: - * - * - if operands are x*R and y*R (Montgomery representations of x and - * y), then Montgomery multiplication computes (x*R*y*R)/R = (x*y)*R - * mod p, which is the Montgomery representation of the product x*y; - * - * - if operands are x*R and y (or x and y*R), then Montgomery - * multiplication returns x*y mod p: mixed-representation - * multiplications yield results in normal representation. - * - * To convert to Montgomery representation, we multiply by R, which is done - * by Montgomery-multiplying by R^2. Stand-alone conversion back from - * Montgomery representation is Montgomery-multiplication by 1. - */ - -/* - * Precomputed small primes. Each element contains the following: - * - * p The prime itself. - * - * g A primitive root of phi = X^N+1 (in field Z_p). - * - * s The inverse of the product of all previous primes in the array, - * computed modulo p and in Montgomery representation. - * - * All primes are such that p = 1 mod 2048, and are lower than 2^31. They - * are listed in decreasing order. 
- */ - -typedef struct { - uint32_t p; - uint32_t g; - uint32_t s; -} small_prime; - -static const small_prime PRIMES[] = { - { 2147473409, 383167813, 10239 }, - { 2147389441, 211808905, 471403745 }, - { 2147387393, 37672282, 1329335065 }, - { 2147377153, 1977035326, 968223422 }, - { 2147358721, 1067163706, 132460015 }, - { 2147352577, 1606082042, 598693809 }, - { 2147346433, 2033915641, 1056257184 }, - { 2147338241, 1653770625, 421286710 }, - { 2147309569, 631200819, 1111201074 }, - { 2147297281, 2038364663, 1042003613 }, - { 2147295233, 1962540515, 19440033 }, - { 2147239937, 2100082663, 353296760 }, - { 2147235841, 1991153006, 1703918027 }, - { 2147217409, 516405114, 1258919613 }, - { 2147205121, 409347988, 1089726929 }, - { 2147196929, 927788991, 1946238668 }, - { 2147178497, 1136922411, 1347028164 }, - { 2147100673, 868626236, 701164723 }, - { 2147082241, 1897279176, 617820870 }, - { 2147074049, 1888819123, 158382189 }, - { 2147051521, 25006327, 522758543 }, - { 2147043329, 327546255, 37227845 }, - { 2147039233, 766324424, 1133356428 }, - { 2146988033, 1862817362, 73861329 }, - { 2146963457, 404622040, 653019435 }, - { 2146959361, 1936581214, 995143093 }, - { 2146938881, 1559770096, 634921513 }, - { 2146908161, 422623708, 1985060172 }, - { 2146885633, 1751189170, 298238186 }, - { 2146871297, 578919515, 291810829 }, - { 2146846721, 1114060353, 915902322 }, - { 2146834433, 2069565474, 47859524 }, - { 2146818049, 1552824584, 646281055 }, - { 2146775041, 1906267847, 1597832891 }, - { 2146756609, 1847414714, 1228090888 }, - { 2146744321, 1818792070, 1176377637 }, - { 2146738177, 1118066398, 1054971214 }, - { 2146736129, 52057278, 933422153 }, - { 2146713601, 592259376, 1406621510 }, - { 2146695169, 263161877, 1514178701 }, - { 2146656257, 685363115, 384505091 }, - { 2146650113, 927727032, 537575289 }, - { 2146646017, 52575506, 1799464037 }, - { 2146643969, 1276803876, 1348954416 }, - { 2146603009, 814028633, 1521547704 }, - { 2146572289, 1846678872, 1310832121 }, - 
{ 2146547713, 919368090, 1019041349 }, - { 2146508801, 671847612, 38582496 }, - { 2146492417, 283911680, 532424562 }, - { 2146490369, 1780044827, 896447978 }, - { 2146459649, 327980850, 1327906900 }, - { 2146447361, 1310561493, 958645253 }, - { 2146441217, 412148926, 287271128 }, - { 2146437121, 293186449, 2009822534 }, - { 2146430977, 179034356, 1359155584 }, - { 2146418689, 1517345488, 1790248672 }, - { 2146406401, 1615820390, 1584833571 }, - { 2146404353, 826651445, 607120498 }, - { 2146379777, 3816988, 1897049071 }, - { 2146363393, 1221409784, 1986921567 }, - { 2146355201, 1388081168, 849968120 }, - { 2146336769, 1803473237, 1655544036 }, - { 2146312193, 1023484977, 273671831 }, - { 2146293761, 1074591448, 467406983 }, - { 2146283521, 831604668, 1523950494 }, - { 2146203649, 712865423, 1170834574 }, - { 2146154497, 1764991362, 1064856763 }, - { 2146142209, 627386213, 1406840151 }, - { 2146127873, 1638674429, 2088393537 }, - { 2146099201, 1516001018, 690673370 }, - { 2146093057, 1294931393, 315136610 }, - { 2146091009, 1942399533, 973539425 }, - { 2146078721, 1843461814, 2132275436 }, - { 2146060289, 1098740778, 360423481 }, - { 2146048001, 1617213232, 1951981294 }, - { 2146041857, 1805783169, 2075683489 }, - { 2146019329, 272027909, 1753219918 }, - { 2145986561, 1206530344, 2034028118 }, - { 2145976321, 1243769360, 1173377644 }, - { 2145964033, 887200839, 1281344586 }, - { 2145906689, 1651026455, 906178216 }, - { 2145875969, 1673238256, 1043521212 }, - { 2145871873, 1226591210, 1399796492 }, - { 2145841153, 1465353397, 1324527802 }, - { 2145832961, 1150638905, 554084759 }, - { 2145816577, 221601706, 427340863 }, - { 2145785857, 608896761, 316590738 }, - { 2145755137, 1712054942, 1684294304 }, - { 2145742849, 1302302867, 724873116 }, - { 2145728513, 516717693, 431671476 }, - { 2145699841, 524575579, 1619722537 }, - { 2145691649, 1925625239, 982974435 }, - { 2145687553, 463795662, 1293154300 }, - { 2145673217, 771716636, 881778029 }, - { 2145630209, 1509556977, 
837364988 }, - { 2145595393, 229091856, 851648427 }, - { 2145587201, 1796903241, 635342424 }, - { 2145525761, 715310882, 1677228081 }, - { 2145495041, 1040930522, 200685896 }, - { 2145466369, 949804237, 1809146322 }, - { 2145445889, 1673903706, 95316881 }, - { 2145390593, 806941852, 1428671135 }, - { 2145372161, 1402525292, 159350694 }, - { 2145361921, 2124760298, 1589134749 }, - { 2145359873, 1217503067, 1561543010 }, - { 2145355777, 338341402, 83865711 }, - { 2145343489, 1381532164, 641430002 }, - { 2145325057, 1883895478, 1528469895 }, - { 2145318913, 1335370424, 65809740 }, - { 2145312769, 2000008042, 1919775760 }, - { 2145300481, 961450962, 1229540578 }, - { 2145282049, 910466767, 1964062701 }, - { 2145232897, 816527501, 450152063 }, - { 2145218561, 1435128058, 1794509700 }, - { 2145187841, 33505311, 1272467582 }, - { 2145181697, 269767433, 1380363849 }, - { 2145175553, 56386299, 1316870546 }, - { 2145079297, 2106880293, 1391797340 }, - { 2145021953, 1347906152, 720510798 }, - { 2145015809, 206769262, 1651459955 }, - { 2145003521, 1885513236, 1393381284 }, - { 2144960513, 1810381315, 31937275 }, - { 2144944129, 1306487838, 2019419520 }, - { 2144935937, 37304730, 1841489054 }, - { 2144894977, 1601434616, 157985831 }, - { 2144888833, 98749330, 2128592228 }, - { 2144880641, 1772327002, 2076128344 }, - { 2144864257, 1404514762, 2029969964 }, - { 2144827393, 801236594, 406627220 }, - { 2144806913, 349217443, 1501080290 }, - { 2144796673, 1542656776, 2084736519 }, - { 2144778241, 1210734884, 1746416203 }, - { 2144759809, 1146598851, 716464489 }, - { 2144757761, 286328400, 1823728177 }, - { 2144729089, 1347555695, 1836644881 }, - { 2144727041, 1795703790, 520296412 }, - { 2144696321, 1302475157, 852964281 }, - { 2144667649, 1075877614, 504992927 }, - { 2144573441, 198765808, 1617144982 }, - { 2144555009, 321528767, 155821259 }, - { 2144550913, 814139516, 1819937644 }, - { 2144536577, 571143206, 962942255 }, - { 2144524289, 1746733766, 2471321 }, - { 2144512001, 
1821415077, 124190939 }, - { 2144468993, 917871546, 1260072806 }, - { 2144458753, 378417981, 1569240563 }, - { 2144421889, 175229668, 1825620763 }, - { 2144409601, 1699216963, 351648117 }, - { 2144370689, 1071885991, 958186029 }, - { 2144348161, 1763151227, 540353574 }, - { 2144335873, 1060214804, 919598847 }, - { 2144329729, 663515846, 1448552668 }, - { 2144327681, 1057776305, 590222840 }, - { 2144309249, 1705149168, 1459294624 }, - { 2144296961, 325823721, 1649016934 }, - { 2144290817, 738775789, 447427206 }, - { 2144243713, 962347618, 893050215 }, - { 2144237569, 1655257077, 900860862 }, - { 2144161793, 242206694, 1567868672 }, - { 2144155649, 769415308, 1247993134 }, - { 2144137217, 320492023, 515841070 }, - { 2144120833, 1639388522, 770877302 }, - { 2144071681, 1761785233, 964296120 }, - { 2144065537, 419817825, 204564472 }, - { 2144028673, 666050597, 2091019760 }, - { 2144010241, 1413657615, 1518702610 }, - { 2143952897, 1238327946, 475672271 }, - { 2143940609, 307063413, 1176750846 }, - { 2143918081, 2062905559, 786785803 }, - { 2143899649, 1338112849, 1562292083 }, - { 2143891457, 68149545, 87166451 }, - { 2143885313, 921750778, 394460854 }, - { 2143854593, 719766593, 133877196 }, - { 2143836161, 1149399850, 1861591875 }, - { 2143762433, 1848739366, 1335934145 }, - { 2143756289, 1326674710, 102999236 }, - { 2143713281, 808061791, 1156900308 }, - { 2143690753, 388399459, 1926468019 }, - { 2143670273, 1427891374, 1756689401 }, - { 2143666177, 1912173949, 986629565 }, - { 2143645697, 2041160111, 371842865 }, - { 2143641601, 1279906897, 2023974350 }, - { 2143635457, 720473174, 1389027526 }, - { 2143621121, 1298309455, 1732632006 }, - { 2143598593, 1548762216, 1825417506 }, - { 2143567873, 620475784, 1073787233 }, - { 2143561729, 1932954575, 949167309 }, - { 2143553537, 354315656, 1652037534 }, - { 2143541249, 577424288, 1097027618 }, - { 2143531009, 357862822, 478640055 }, - { 2143522817, 2017706025, 1550531668 }, - { 2143506433, 2078127419, 1824320165 }, - { 
2143488001, 613475285, 1604011510 }, - { 2143469569, 1466594987, 502095196 }, - { 2143426561, 1115430331, 1044637111 }, - { 2143383553, 9778045, 1902463734 }, - { 2143377409, 1557401276, 2056861771 }, - { 2143363073, 652036455, 1965915971 }, - { 2143260673, 1464581171, 1523257541 }, - { 2143246337, 1876119649, 764541916 }, - { 2143209473, 1614992673, 1920672844 }, - { 2143203329, 981052047, 2049774209 }, - { 2143160321, 1847355533, 728535665 }, - { 2143129601, 965558457, 603052992 }, - { 2143123457, 2140817191, 8348679 }, - { 2143100929, 1547263683, 694209023 }, - { 2143092737, 643459066, 1979934533 }, - { 2143082497, 188603778, 2026175670 }, - { 2143062017, 1657329695, 377451099 }, - { 2143051777, 114967950, 979255473 }, - { 2143025153, 1698431342, 1449196896 }, - { 2143006721, 1862741675, 1739650365 }, - { 2142996481, 756660457, 996160050 }, - { 2142976001, 927864010, 1166847574 }, - { 2142965761, 905070557, 661974566 }, - { 2142916609, 40932754, 1787161127 }, - { 2142892033, 1987985648, 675335382 }, - { 2142885889, 797497211, 1323096997 }, - { 2142871553, 2068025830, 1411877159 }, - { 2142861313, 1217177090, 1438410687 }, - { 2142830593, 409906375, 1767860634 }, - { 2142803969, 1197788993, 359782919 }, - { 2142785537, 643817365, 513932862 }, - { 2142779393, 1717046338, 218943121 }, - { 2142724097, 89336830, 416687049 }, - { 2142707713, 5944581, 1356813523 }, - { 2142658561, 887942135, 2074011722 }, - { 2142638081, 151851972, 1647339939 }, - { 2142564353, 1691505537, 1483107336 }, - { 2142533633, 1989920200, 1135938817 }, - { 2142529537, 959263126, 1531961857 }, - { 2142527489, 453251129, 1725566162 }, - { 2142502913, 1536028102, 182053257 }, - { 2142498817, 570138730, 701443447 }, - { 2142416897, 326965800, 411931819 }, - { 2142363649, 1675665410, 1517191733 }, - { 2142351361, 968529566, 1575712703 }, - { 2142330881, 1384953238, 1769087884 }, - { 2142314497, 1977173242, 1833745524 }, - { 2142289921, 95082313, 1714775493 }, - { 2142283777, 109377615, 1070584533 
}, - { 2142277633, 16960510, 702157145 }, - { 2142263297, 553850819, 431364395 }, - { 2142208001, 241466367, 2053967982 }, - { 2142164993, 1795661326, 1031836848 }, - { 2142097409, 1212530046, 712772031 }, - { 2142087169, 1763869720, 822276067 }, - { 2142078977, 644065713, 1765268066 }, - { 2142074881, 112671944, 643204925 }, - { 2142044161, 1387785471, 1297890174 }, - { 2142025729, 783885537, 1000425730 }, - { 2142011393, 905662232, 1679401033 }, - { 2141974529, 799788433, 468119557 }, - { 2141943809, 1932544124, 449305555 }, - { 2141933569, 1527403256, 841867925 }, - { 2141931521, 1247076451, 743823916 }, - { 2141902849, 1199660531, 401687910 }, - { 2141890561, 150132350, 1720336972 }, - { 2141857793, 1287438162, 663880489 }, - { 2141833217, 618017731, 1819208266 }, - { 2141820929, 999578638, 1403090096 }, - { 2141786113, 81834325, 1523542501 }, - { 2141771777, 120001928, 463556492 }, - { 2141759489, 122455485, 2124928282 }, - { 2141749249, 141986041, 940339153 }, - { 2141685761, 889088734, 477141499 }, - { 2141673473, 324212681, 1122558298 }, - { 2141669377, 1175806187, 1373818177 }, - { 2141655041, 1113654822, 296887082 }, - { 2141587457, 991103258, 1585913875 }, - { 2141583361, 1401451409, 1802457360 }, - { 2141575169, 1571977166, 712760980 }, - { 2141546497, 1107849376, 1250270109 }, - { 2141515777, 196544219, 356001130 }, - { 2141495297, 1733571506, 1060744866 }, - { 2141483009, 321552363, 1168297026 }, - { 2141458433, 505818251, 733225819 }, - { 2141360129, 1026840098, 948342276 }, - { 2141325313, 945133744, 2129965998 }, - { 2141317121, 1871100260, 1843844634 }, - { 2141286401, 1790639498, 1750465696 }, - { 2141267969, 1376858592, 186160720 }, - { 2141255681, 2129698296, 1876677959 }, - { 2141243393, 2138900688, 1340009628 }, - { 2141214721, 1933049835, 1087819477 }, - { 2141212673, 1898664939, 1786328049 }, - { 2141202433, 990234828, 940682169 }, - { 2141175809, 1406392421, 993089586 }, - { 2141165569, 1263518371, 289019479 }, - { 2141073409, 1485624211, 
507864514 }, - { 2141052929, 1885134788, 311252465 }, - { 2141040641, 1285021247, 280941862 }, - { 2141028353, 1527610374, 375035110 }, - { 2141011969, 1400626168, 164696620 }, - { 2140999681, 632959608, 966175067 }, - { 2140997633, 2045628978, 1290889438 }, - { 2140993537, 1412755491, 375366253 }, - { 2140942337, 719477232, 785367828 }, - { 2140925953, 45224252, 836552317 }, - { 2140917761, 1157376588, 1001839569 }, - { 2140887041, 278480752, 2098732796 }, - { 2140837889, 1663139953, 924094810 }, - { 2140788737, 802501511, 2045368990 }, - { 2140766209, 1820083885, 1800295504 }, - { 2140764161, 1169561905, 2106792035 }, - { 2140696577, 127781498, 1885987531 }, - { 2140684289, 16014477, 1098116827 }, - { 2140653569, 665960598, 1796728247 }, - { 2140594177, 1043085491, 377310938 }, - { 2140579841, 1732838211, 1504505945 }, - { 2140569601, 302071939, 358291016 }, - { 2140567553, 192393733, 1909137143 }, - { 2140557313, 406595731, 1175330270 }, - { 2140549121, 1748850918, 525007007 }, - { 2140477441, 499436566, 1031159814 }, - { 2140469249, 1886004401, 1029951320 }, - { 2140426241, 1483168100, 1676273461 }, - { 2140420097, 1779917297, 846024476 }, - { 2140413953, 522948893, 1816354149 }, - { 2140383233, 1931364473, 1296921241 }, - { 2140366849, 1917356555, 147196204 }, - { 2140354561, 16466177, 1349052107 }, - { 2140348417, 1875366972, 1860485634 }, - { 2140323841, 456498717, 1790256483 }, - { 2140321793, 1629493973, 150031888 }, - { 2140315649, 1904063898, 395510935 }, - { 2140280833, 1784104328, 831417909 }, - { 2140250113, 256087139, 697349101 }, - { 2140229633, 388553070, 243875754 }, - { 2140223489, 747459608, 1396270850 }, - { 2140200961, 507423743, 1895572209 }, - { 2140162049, 580106016, 2045297469 }, - { 2140149761, 712426444, 785217995 }, - { 2140137473, 1441607584, 536866543 }, - { 2140119041, 346538902, 1740434653 }, - { 2140090369, 282642885, 21051094 }, - { 2140076033, 1407456228, 319910029 }, - { 2140047361, 1619330500, 1488632070 }, - { 2140041217, 
2089408064, 2012026134 }, - { 2140008449, 1705524800, 1613440760 }, - { 2139924481, 1846208233, 1280649481 }, - { 2139906049, 989438755, 1185646076 }, - { 2139867137, 1522314850, 372783595 }, - { 2139842561, 1681587377, 216848235 }, - { 2139826177, 2066284988, 1784999464 }, - { 2139824129, 480888214, 1513323027 }, - { 2139789313, 847937200, 858192859 }, - { 2139783169, 1642000434, 1583261448 }, - { 2139770881, 940699589, 179702100 }, - { 2139768833, 315623242, 964612676 }, - { 2139666433, 331649203, 764666914 }, - { 2139641857, 2118730799, 1313764644 }, - { 2139635713, 519149027, 519212449 }, - { 2139598849, 1526413634, 1769667104 }, - { 2139574273, 551148610, 820739925 }, - { 2139568129, 1386800242, 472447405 }, - { 2139549697, 813760130, 1412328531 }, - { 2139537409, 1615286260, 1609362979 }, - { 2139475969, 1352559299, 1696720421 }, - { 2139455489, 1048691649, 1584935400 }, - { 2139432961, 836025845, 950121150 }, - { 2139424769, 1558281165, 1635486858 }, - { 2139406337, 1728402143, 1674423301 }, - { 2139396097, 1727715782, 1483470544 }, - { 2139383809, 1092853491, 1741699084 }, - { 2139369473, 690776899, 1242798709 }, - { 2139351041, 1768782380, 2120712049 }, - { 2139334657, 1739968247, 1427249225 }, - { 2139332609, 1547189119, 623011170 }, - { 2139310081, 1346827917, 1605466350 }, - { 2139303937, 369317948, 828392831 }, - { 2139301889, 1560417239, 1788073219 }, - { 2139283457, 1303121623, 595079358 }, - { 2139248641, 1354555286, 573424177 }, - { 2139240449, 60974056, 885781403 }, - { 2139222017, 355573421, 1221054839 }, - { 2139215873, 566477826, 1724006500 }, - { 2139150337, 871437673, 1609133294 }, - { 2139144193, 1478130914, 1137491905 }, - { 2139117569, 1854880922, 964728507 }, - { 2139076609, 202405335, 756508944 }, - { 2139062273, 1399715741, 884826059 }, - { 2139045889, 1051045798, 1202295476 }, - { 2139033601, 1707715206, 632234634 }, - { 2139006977, 2035853139, 231626690 }, - { 2138951681, 183867876, 838350879 }, - { 2138945537, 1403254661, 404460202 
}, - { 2138920961, 310865011, 1282911681 }, - { 2138910721, 1328496553, 103472415 }, - { 2138904577, 78831681, 993513549 }, - { 2138902529, 1319697451, 1055904361 }, - { 2138816513, 384338872, 1706202469 }, - { 2138810369, 1084868275, 405677177 }, - { 2138787841, 401181788, 1964773901 }, - { 2138775553, 1850532988, 1247087473 }, - { 2138767361, 874261901, 1576073565 }, - { 2138757121, 1187474742, 993541415 }, - { 2138748929, 1782458888, 1043206483 }, - { 2138744833, 1221500487, 800141243 }, - { 2138738689, 413465368, 1450660558 }, - { 2138695681, 739045140, 342611472 }, - { 2138658817, 1355845756, 672674190 }, - { 2138644481, 608379162, 1538874380 }, - { 2138632193, 1444914034, 686911254 }, - { 2138607617, 484707818, 1435142134 }, - { 2138591233, 539460669, 1290458549 }, - { 2138572801, 2093538990, 2011138646 }, - { 2138552321, 1149786988, 1076414907 }, - { 2138546177, 840688206, 2108985273 }, - { 2138533889, 209669619, 198172413 }, - { 2138523649, 1975879426, 1277003968 }, - { 2138490881, 1351891144, 1976858109 }, - { 2138460161, 1817321013, 1979278293 }, - { 2138429441, 1950077177, 203441928 }, - { 2138400769, 908970113, 628395069 }, - { 2138398721, 219890864, 758486760 }, - { 2138376193, 1306654379, 977554090 }, - { 2138351617, 298822498, 2004708503 }, - { 2138337281, 441457816, 1049002108 }, - { 2138320897, 1517731724, 1442269609 }, - { 2138290177, 1355911197, 1647139103 }, - { 2138234881, 531313247, 1746591962 }, - { 2138214401, 1899410930, 781416444 }, - { 2138202113, 1813477173, 1622508515 }, - { 2138191873, 1086458299, 1025408615 }, - { 2138183681, 1998800427, 827063290 }, - { 2138173441, 1921308898, 749670117 }, - { 2138103809, 1620902804, 2126787647 }, - { 2138099713, 828647069, 1892961817 }, - { 2138085377, 179405355, 1525506535 }, - { 2138060801, 615683235, 1259580138 }, - { 2138044417, 2030277840, 1731266562 }, - { 2138042369, 2087222316, 1627902259 }, - { 2138032129, 126388712, 1108640984 }, - { 2138011649, 715026550, 1017980050 }, - { 2137993217, 
1693714349, 1351778704 }, - { 2137888769, 1289762259, 1053090405 }, - { 2137853953, 199991890, 1254192789 }, - { 2137833473, 941421685, 896995556 }, - { 2137817089, 750416446, 1251031181 }, - { 2137792513, 798075119, 368077456 }, - { 2137786369, 878543495, 1035375025 }, - { 2137767937, 9351178, 1156563902 }, - { 2137755649, 1382297614, 1686559583 }, - { 2137724929, 1345472850, 1681096331 }, - { 2137704449, 834666929, 630551727 }, - { 2137673729, 1646165729, 1892091571 }, - { 2137620481, 778943821, 48456461 }, - { 2137618433, 1730837875, 1713336725 }, - { 2137581569, 805610339, 1378891359 }, - { 2137538561, 204342388, 1950165220 }, - { 2137526273, 1947629754, 1500789441 }, - { 2137516033, 719902645, 1499525372 }, - { 2137491457, 230451261, 556382829 }, - { 2137440257, 979573541, 412760291 }, - { 2137374721, 927841248, 1954137185 }, - { 2137362433, 1243778559, 861024672 }, - { 2137313281, 1341338501, 980638386 }, - { 2137311233, 937415182, 1793212117 }, - { 2137255937, 795331324, 1410253405 }, - { 2137243649, 150756339, 1966999887 }, - { 2137182209, 163346914, 1939301431 }, - { 2137171969, 1952552395, 758913141 }, - { 2137159681, 570788721, 218668666 }, - { 2137147393, 1896656810, 2045670345 }, - { 2137141249, 358493842, 518199643 }, - { 2137139201, 1505023029, 674695848 }, - { 2137133057, 27911103, 830956306 }, - { 2137122817, 439771337, 1555268614 }, - { 2137116673, 790988579, 1871449599 }, - { 2137110529, 432109234, 811805080 }, - { 2137102337, 1357900653, 1184997641 }, - { 2137098241, 515119035, 1715693095 }, - { 2137090049, 408575203, 2085660657 }, - { 2137085953, 2097793407, 1349626963 }, - { 2137055233, 1556739954, 1449960883 }, - { 2137030657, 1545758650, 1369303716 }, - { 2136987649, 332602570, 103875114 }, - { 2136969217, 1499989506, 1662964115 }, - { 2136924161, 857040753, 4738842 }, - { 2136895489, 1948872712, 570436091 }, - { 2136893441, 58969960, 1568349634 }, - { 2136887297, 2127193379, 273612548 }, - { 2136850433, 111208983, 1181257116 }, - { 
2136809473, 1627275942, 1680317971 }, - { 2136764417, 1574888217, 14011331 }, - { 2136741889, 14011055, 1129154251 }, - { 2136727553, 35862563, 1838555253 }, - { 2136721409, 310235666, 1363928244 }, - { 2136698881, 1612429202, 1560383828 }, - { 2136649729, 1138540131, 800014364 }, - { 2136606721, 602323503, 1433096652 }, - { 2136563713, 182209265, 1919611038 }, - { 2136555521, 324156477, 165591039 }, - { 2136549377, 195513113, 217165345 }, - { 2136526849, 1050768046, 939647887 }, - { 2136508417, 1886286237, 1619926572 }, - { 2136477697, 609647664, 35065157 }, - { 2136471553, 679352216, 1452259468 }, - { 2136457217, 128630031, 824816521 }, - { 2136422401, 19787464, 1526049830 }, - { 2136420353, 698316836, 1530623527 }, - { 2136371201, 1651862373, 1804812805 }, - { 2136334337, 326596005, 336977082 }, - { 2136322049, 63253370, 1904972151 }, - { 2136297473, 312176076, 172182411 }, - { 2136248321, 381261841, 369032670 }, - { 2136242177, 358688773, 1640007994 }, - { 2136229889, 512677188, 75585225 }, - { 2136219649, 2095003250, 1970086149 }, - { 2136207361, 1909650722, 537760675 }, - { 2136176641, 1334616195, 1533487619 }, - { 2136158209, 2096285632, 1793285210 }, - { 2136143873, 1897347517, 293843959 }, - { 2136133633, 923586222, 1022655978 }, - { 2136096769, 1464868191, 1515074410 }, - { 2136094721, 2020679520, 2061636104 }, - { 2136076289, 290798503, 1814726809 }, - { 2136041473, 156415894, 1250757633 }, - { 2135996417, 297459940, 1132158924 }, - { 2135955457, 538755304, 1688831340 }, - { 0, 0, 0 } -}; - -/* - * Reduce a small signed integer modulo a small prime. The source - * value x MUST be such that -p < x < p. - */ -static inline uint32_t -modp_set(int32_t x, uint32_t p) -{ - uint32_t w; - - w = (uint32_t)x; - w += p & -(w >> 31); - return w; -} - -/* - * Normalize a modular integer around 0. - */ -static inline int32_t -modp_norm(uint32_t x, uint32_t p) -{ - return (int32_t)(x - (p & (((x - ((p + 1) >> 1)) >> 31) - 1))); -} - -/* - * Compute -1/p mod 2^31. 
This works for all odd integers p that fit - * on 31 bits. - */ -static uint32_t -modp_ninv31(uint32_t p) -{ - uint32_t y; - - y = 2 - p; - y *= 2 - p * y; - y *= 2 - p * y; - y *= 2 - p * y; - y *= 2 - p * y; - return (uint32_t)0x7FFFFFFF & -y; -} - -/* - * Compute R = 2^31 mod p. - */ -static inline uint32_t -modp_R(uint32_t p) -{ - /* - * Since 2^30 < p < 2^31, we know that 2^31 mod p is simply - * 2^31 - p. - */ - return ((uint32_t)1 << 31) - p; -} - -/* - * Addition modulo p. - */ -static inline uint32_t -modp_add(uint32_t a, uint32_t b, uint32_t p) -{ - uint32_t d; - - d = a + b - p; - d += p & -(d >> 31); - return d; -} - -/* - * Subtraction modulo p. - */ -static inline uint32_t -modp_sub(uint32_t a, uint32_t b, uint32_t p) -{ - uint32_t d; - - d = a - b; - d += p & -(d >> 31); - return d; -} - -/* - * Halving modulo p. - */ -/* unused -static inline uint32_t -modp_half(uint32_t a, uint32_t p) -{ - a += p & -(a & 1); - return a >> 1; -} -*/ - -/* - * Montgomery multiplication modulo p. The 'p0i' value is -1/p mod 2^31. - * It is required that p is an odd integer. - */ -static inline uint32_t -modp_montymul(uint32_t a, uint32_t b, uint32_t p, uint32_t p0i) -{ - uint64_t z, w; - uint32_t d; - - z = (uint64_t)a * (uint64_t)b; - w = ((z * p0i) & (uint64_t)0x7FFFFFFF) * p; - d = (uint32_t)((z + w) >> 31) - p; - d += p & -(d >> 31); - return d; -} - -/* - * Compute R2 = 2^62 mod p. - */ -static uint32_t -modp_R2(uint32_t p, uint32_t p0i) -{ - uint32_t z; - - /* - * Compute z = 2^31 mod p (this is the value 1 in Montgomery - * representation), then double it with an addition. - */ - z = modp_R(p); - z = modp_add(z, z, p); - - /* - * Square it five times to obtain 2^32 in Montgomery representation - * (i.e. 2^63 mod p). - */ - z = modp_montymul(z, z, p, p0i); - z = modp_montymul(z, z, p, p0i); - z = modp_montymul(z, z, p, p0i); - z = modp_montymul(z, z, p, p0i); - z = modp_montymul(z, z, p, p0i); - - /* - * Halve the value mod p to get 2^62. 
- */ - z = (z + (p & -(z & 1))) >> 1; - return z; -} - -/* - * Compute 2^(31*x) modulo p. This works for integers x up to 2^11. - * p must be prime such that 2^30 < p < 2^31; p0i must be equal to - * -1/p mod 2^31; R2 must be equal to 2^62 mod p. - */ -static inline uint32_t -modp_Rx(unsigned x, uint32_t p, uint32_t p0i, uint32_t R2) -{ - int i; - uint32_t r, z; - - /* - * 2^(31*x) = (2^31)*(2^(31*(x-1))); i.e. we want the Montgomery - * representation of (2^31)^e mod p, where e = x-1. - * R2 is 2^31 in Montgomery representation. - */ - x --; - r = R2; - z = modp_R(p); - for (i = 0; (1U << i) <= x; i ++) { - if ((x & (1U << i)) != 0) { - z = modp_montymul(z, r, p, p0i); - } - r = modp_montymul(r, r, p, p0i); - } - return z; -} - -/* - * Division modulo p. If the divisor (b) is 0, then 0 is returned. - * This function computes proper results only when p is prime. - * Parameters: - * a dividend - * b divisor - * p odd prime modulus - * p0i -1/p mod 2^31 - * R 2^31 mod R - */ -static uint32_t -modp_div(uint32_t a, uint32_t b, uint32_t p, uint32_t p0i, uint32_t R) -{ - uint32_t z, e; - int i; - - e = p - 2; - z = R; - for (i = 30; i >= 0; i --) { - uint32_t z2; - - z = modp_montymul(z, z, p, p0i); - z2 = modp_montymul(z, b, p, p0i); - z ^= (z ^ z2) & -(uint32_t)((e >> i) & 1); - } - - /* - * The loop above just assumed that b was in Montgomery - * representation, i.e. really contained b*R; under that - * assumption, it returns 1/b in Montgomery representation, - * which is R/b. But we gave it b in normal representation, - * so the loop really returned R/(b/R) = R^2/b. - * - * We want a/b, so we need one Montgomery multiplication with a, - * which also remove one of the R factors, and another such - * multiplication to remove the second R factor. - */ - z = modp_montymul(z, 1, p, p0i); - return modp_montymul(a, z, p, p0i); -} - -/* - * Bit-reversal index table. 
- */ -static const uint16_t REV10[] = { - 0, 512, 256, 768, 128, 640, 384, 896, 64, 576, 320, 832, - 192, 704, 448, 960, 32, 544, 288, 800, 160, 672, 416, 928, - 96, 608, 352, 864, 224, 736, 480, 992, 16, 528, 272, 784, - 144, 656, 400, 912, 80, 592, 336, 848, 208, 720, 464, 976, - 48, 560, 304, 816, 176, 688, 432, 944, 112, 624, 368, 880, - 240, 752, 496, 1008, 8, 520, 264, 776, 136, 648, 392, 904, - 72, 584, 328, 840, 200, 712, 456, 968, 40, 552, 296, 808, - 168, 680, 424, 936, 104, 616, 360, 872, 232, 744, 488, 1000, - 24, 536, 280, 792, 152, 664, 408, 920, 88, 600, 344, 856, - 216, 728, 472, 984, 56, 568, 312, 824, 184, 696, 440, 952, - 120, 632, 376, 888, 248, 760, 504, 1016, 4, 516, 260, 772, - 132, 644, 388, 900, 68, 580, 324, 836, 196, 708, 452, 964, - 36, 548, 292, 804, 164, 676, 420, 932, 100, 612, 356, 868, - 228, 740, 484, 996, 20, 532, 276, 788, 148, 660, 404, 916, - 84, 596, 340, 852, 212, 724, 468, 980, 52, 564, 308, 820, - 180, 692, 436, 948, 116, 628, 372, 884, 244, 756, 500, 1012, - 12, 524, 268, 780, 140, 652, 396, 908, 76, 588, 332, 844, - 204, 716, 460, 972, 44, 556, 300, 812, 172, 684, 428, 940, - 108, 620, 364, 876, 236, 748, 492, 1004, 28, 540, 284, 796, - 156, 668, 412, 924, 92, 604, 348, 860, 220, 732, 476, 988, - 60, 572, 316, 828, 188, 700, 444, 956, 124, 636, 380, 892, - 252, 764, 508, 1020, 2, 514, 258, 770, 130, 642, 386, 898, - 66, 578, 322, 834, 194, 706, 450, 962, 34, 546, 290, 802, - 162, 674, 418, 930, 98, 610, 354, 866, 226, 738, 482, 994, - 18, 530, 274, 786, 146, 658, 402, 914, 82, 594, 338, 850, - 210, 722, 466, 978, 50, 562, 306, 818, 178, 690, 434, 946, - 114, 626, 370, 882, 242, 754, 498, 1010, 10, 522, 266, 778, - 138, 650, 394, 906, 74, 586, 330, 842, 202, 714, 458, 970, - 42, 554, 298, 810, 170, 682, 426, 938, 106, 618, 362, 874, - 234, 746, 490, 1002, 26, 538, 282, 794, 154, 666, 410, 922, - 90, 602, 346, 858, 218, 730, 474, 986, 58, 570, 314, 826, - 186, 698, 442, 954, 122, 634, 378, 890, 250, 762, 506, 1018, - 6, 
518, 262, 774, 134, 646, 390, 902, 70, 582, 326, 838, - 198, 710, 454, 966, 38, 550, 294, 806, 166, 678, 422, 934, - 102, 614, 358, 870, 230, 742, 486, 998, 22, 534, 278, 790, - 150, 662, 406, 918, 86, 598, 342, 854, 214, 726, 470, 982, - 54, 566, 310, 822, 182, 694, 438, 950, 118, 630, 374, 886, - 246, 758, 502, 1014, 14, 526, 270, 782, 142, 654, 398, 910, - 78, 590, 334, 846, 206, 718, 462, 974, 46, 558, 302, 814, - 174, 686, 430, 942, 110, 622, 366, 878, 238, 750, 494, 1006, - 30, 542, 286, 798, 158, 670, 414, 926, 94, 606, 350, 862, - 222, 734, 478, 990, 62, 574, 318, 830, 190, 702, 446, 958, - 126, 638, 382, 894, 254, 766, 510, 1022, 1, 513, 257, 769, - 129, 641, 385, 897, 65, 577, 321, 833, 193, 705, 449, 961, - 33, 545, 289, 801, 161, 673, 417, 929, 97, 609, 353, 865, - 225, 737, 481, 993, 17, 529, 273, 785, 145, 657, 401, 913, - 81, 593, 337, 849, 209, 721, 465, 977, 49, 561, 305, 817, - 177, 689, 433, 945, 113, 625, 369, 881, 241, 753, 497, 1009, - 9, 521, 265, 777, 137, 649, 393, 905, 73, 585, 329, 841, - 201, 713, 457, 969, 41, 553, 297, 809, 169, 681, 425, 937, - 105, 617, 361, 873, 233, 745, 489, 1001, 25, 537, 281, 793, - 153, 665, 409, 921, 89, 601, 345, 857, 217, 729, 473, 985, - 57, 569, 313, 825, 185, 697, 441, 953, 121, 633, 377, 889, - 249, 761, 505, 1017, 5, 517, 261, 773, 133, 645, 389, 901, - 69, 581, 325, 837, 197, 709, 453, 965, 37, 549, 293, 805, - 165, 677, 421, 933, 101, 613, 357, 869, 229, 741, 485, 997, - 21, 533, 277, 789, 149, 661, 405, 917, 85, 597, 341, 853, - 213, 725, 469, 981, 53, 565, 309, 821, 181, 693, 437, 949, - 117, 629, 373, 885, 245, 757, 501, 1013, 13, 525, 269, 781, - 141, 653, 397, 909, 77, 589, 333, 845, 205, 717, 461, 973, - 45, 557, 301, 813, 173, 685, 429, 941, 109, 621, 365, 877, - 237, 749, 493, 1005, 29, 541, 285, 797, 157, 669, 413, 925, - 93, 605, 349, 861, 221, 733, 477, 989, 61, 573, 317, 829, - 189, 701, 445, 957, 125, 637, 381, 893, 253, 765, 509, 1021, - 3, 515, 259, 771, 131, 643, 387, 899, 67, 579, 
323, 835, - 195, 707, 451, 963, 35, 547, 291, 803, 163, 675, 419, 931, - 99, 611, 355, 867, 227, 739, 483, 995, 19, 531, 275, 787, - 147, 659, 403, 915, 83, 595, 339, 851, 211, 723, 467, 979, - 51, 563, 307, 819, 179, 691, 435, 947, 115, 627, 371, 883, - 243, 755, 499, 1011, 11, 523, 267, 779, 139, 651, 395, 907, - 75, 587, 331, 843, 203, 715, 459, 971, 43, 555, 299, 811, - 171, 683, 427, 939, 107, 619, 363, 875, 235, 747, 491, 1003, - 27, 539, 283, 795, 155, 667, 411, 923, 91, 603, 347, 859, - 219, 731, 475, 987, 59, 571, 315, 827, 187, 699, 443, 955, - 123, 635, 379, 891, 251, 763, 507, 1019, 7, 519, 263, 775, - 135, 647, 391, 903, 71, 583, 327, 839, 199, 711, 455, 967, - 39, 551, 295, 807, 167, 679, 423, 935, 103, 615, 359, 871, - 231, 743, 487, 999, 23, 535, 279, 791, 151, 663, 407, 919, - 87, 599, 343, 855, 215, 727, 471, 983, 55, 567, 311, 823, - 183, 695, 439, 951, 119, 631, 375, 887, 247, 759, 503, 1015, - 15, 527, 271, 783, 143, 655, 399, 911, 79, 591, 335, 847, - 207, 719, 463, 975, 47, 559, 303, 815, 175, 687, 431, 943, - 111, 623, 367, 879, 239, 751, 495, 1007, 31, 543, 287, 799, - 159, 671, 415, 927, 95, 607, 351, 863, 223, 735, 479, 991, - 63, 575, 319, 831, 191, 703, 447, 959, 127, 639, 383, 895, - 255, 767, 511, 1023 -}; - -/* - * Compute the roots for NTT and inverse NTT (binary case). Input - * parameter g is a primitive 2048-th root of 1 modulo p (i.e. g^1024 = - * -1 mod p). This fills gm[] and igm[] with powers of g and 1/g: - * gm[rev(i)] = g^i mod p - * igm[rev(i)] = (1/g)^i mod p - * where rev() is the "bit reversal" function over 10 bits. It fills - * the arrays only up to N = 2^logn values. - * - * The values stored in gm[] and igm[] are in Montgomery representation. - * - * p must be a prime such that p = 1 mod 2048. 
- */ -static void -modp_mkgm2(uint32_t *restrict gm, uint32_t *restrict igm, unsigned logn, - uint32_t g, uint32_t p, uint32_t p0i) -{ - size_t u, n; - unsigned k; - uint32_t ig, x1, x2, R2; - - n = (size_t)1 << logn; - - /* - * We want g such that g^(2N) = 1 mod p, but the provided - * generator has order 2048. We must square it a few times. - */ - R2 = modp_R2(p, p0i); - g = modp_montymul(g, R2, p, p0i); - for (k = logn; k < 10; k ++) { - g = modp_montymul(g, g, p, p0i); - } - - ig = modp_div(R2, g, p, p0i, modp_R(p)); - k = 10 - logn; - x1 = x2 = modp_R(p); - for (u = 0; u < n; u ++) { - size_t v; - - v = REV10[u << k]; - gm[v] = x1; - igm[v] = x2; - x1 = modp_montymul(x1, g, p, p0i); - x2 = modp_montymul(x2, ig, p, p0i); - } -} - -/* - * Compute the NTT over a polynomial (binary case). Polynomial elements - * are a[0], a[stride], a[2 * stride]... - */ -static void -modp_NTT2_ext(uint32_t *a, size_t stride, const uint32_t *gm, unsigned logn, - uint32_t p, uint32_t p0i) -{ - size_t t, m, n; - - if (logn == 0) { - return; - } - n = (size_t)1 << logn; - t = n; - for (m = 1; m < n; m <<= 1) { - size_t ht, u, v1; - - ht = t >> 1; - for (u = 0, v1 = 0; u < m; u ++, v1 += t) { - uint32_t s; - size_t v; - uint32_t *r1, *r2; - - s = gm[m + u]; - r1 = a + v1 * stride; - r2 = r1 + ht * stride; - for (v = 0; v < ht; v ++, r1 += stride, r2 += stride) { - uint32_t x, y; - - x = *r1; - y = modp_montymul(*r2, s, p, p0i); - *r1 = modp_add(x, y, p); - *r2 = modp_sub(x, y, p); - } - } - t = ht; - } -} - -/* - * Compute the inverse NTT over a polynomial (binary case). 
- */ -static void -modp_iNTT2_ext(uint32_t *a, size_t stride, const uint32_t *igm, unsigned logn, - uint32_t p, uint32_t p0i) -{ - size_t t, m, n, k; - uint32_t ni; - uint32_t *r; - - if (logn == 0) { - return; - } - n = (size_t)1 << logn; - t = 1; - for (m = n; m > 1; m >>= 1) { - size_t hm, dt, u, v1; - - hm = m >> 1; - dt = t << 1; - for (u = 0, v1 = 0; u < hm; u ++, v1 += dt) { - uint32_t s; - size_t v; - uint32_t *r1, *r2; - - s = igm[hm + u]; - r1 = a + v1 * stride; - r2 = r1 + t * stride; - for (v = 0; v < t; v ++, r1 += stride, r2 += stride) { - uint32_t x, y; - - x = *r1; - y = *r2; - *r1 = modp_add(x, y, p); - *r2 = modp_montymul( - modp_sub(x, y, p), s, p, p0i);; - } - } - t = dt; - } - - /* - * We need 1/n in Montgomery representation, i.e. R/n. Since - * 1 <= logn <= 10, R/n is an integer; morever, R/n <= 2^30 < p, - * thus a simple shift will do. - */ - ni = (uint32_t)1 << (31 - logn); - for (k = 0, r = a; k < n; k ++, r += stride) { - *r = modp_montymul(*r, ni, p, p0i); - } -} - -/* - * Simplified macros for NTT and iNTT (binary case) when the elements - * are consecutive in RAM. - */ -#define modp_NTT2(a, gm, logn, p, p0i) modp_NTT2_ext(a, 1, gm, logn, p, p0i) -#define modp_iNTT2(a, igm, logn, p, p0i) modp_iNTT2_ext(a, 1, igm, logn, p, p0i) - -/* - * Given polynomial f in NTT representation modulo p, compute f' of degree - * less than N/2 such that f' = f0^2 - X*f1^2, where f0 and f1 are - * polynomials of degree less than N/2 such that f = f0(X^2) + X*f1(X^2). - * - * The new polynomial is written "in place" over the first N/2 elements - * of f. - * - * If applied logn times successively on a given polynomial, the resulting - * degree-0 polynomial is the resultant of f and X^N+1 modulo p. - * - * This function applies only to the binary case; it is invoked from - * solve_NTRU_binary_depth1(). 
- */ -static void -modp_poly_rec_res(uint32_t *f, unsigned logn, - uint32_t p, uint32_t p0i, uint32_t R2) -{ - size_t hn, u; - - hn = (size_t)1 << (logn - 1); - for (u = 0; u < hn; u ++) { - uint32_t w0, w1; - - w0 = f[(u << 1) + 0]; - w1 = f[(u << 1) + 1]; - f[u] = modp_montymul(modp_montymul(w0, w1, p, p0i), R2, p, p0i); - } -} - -/* ==================================================================== */ -/* - * Custom bignum implementation. - * - * This is a very reduced set of functionalities. We need to do the - * following operations: - * - * - Rebuild the resultant and the polynomial coefficients from their - * values modulo small primes (of length 31 bits each). - * - * - Compute an extended GCD between the two computed resultants. - * - * - Extract top bits and add scaled values during the successive steps - * of Babai rounding. - * - * When rebuilding values using CRT, we must also recompute the product - * of the small prime factors. We always do it one small factor at a - * time, so the "complicated" operations can be done modulo the small - * prime with the modp_* functions. CRT coefficients (inverses) are - * precomputed. - * - * All values are positive until the last step: when the polynomial - * coefficients have been rebuilt, we normalize them around 0. But then, - * only additions and subtractions on the upper few bits are needed - * afterwards. - * - * We keep big integers as arrays of 31-bit words (in uint32_t values); - * the top bit of each uint32_t is kept equal to 0. Using 31-bit words - * makes it easier to keep track of carries. When negative values are - * used, two's complement is used. - */ - -/* - * Subtract integer b from integer a. Both integers are supposed to have - * the same size. The carry (0 or 1) is returned. Source arrays a and b - * MUST be distinct. - * - * The operation is performed as described above if ctr = 1. 
If - * ctl = 0, the value a[] is unmodified, but all memory accesses are - * still performed, and the carry is computed and returned. - */ -static uint32_t -zint_sub(uint32_t *restrict a, const uint32_t *restrict b, size_t len, - uint32_t ctl) -{ - size_t u; - uint32_t cc, m; - - cc = 0; - m = -ctl; - for (u = 0; u < len; u ++) { - uint32_t aw, w; - - aw = a[u]; - w = aw - b[u] - cc; - cc = w >> 31; - aw ^= ((w & 0x7FFFFFFF) ^ aw) & m; - a[u] = aw; - } - return cc; -} - -/* - * Mutiply the provided big integer m with a small value x. - * This function assumes that x < 2^31. The carry word is returned. - */ -static uint32_t -zint_mul_small(uint32_t *m, size_t mlen, uint32_t x) -{ - size_t u; - uint32_t cc; - - cc = 0; - for (u = 0; u < mlen; u ++) { - uint64_t z; - - z = (uint64_t)m[u] * (uint64_t)x + cc; - m[u] = (uint32_t)z & 0x7FFFFFFF; - cc = (uint32_t)(z >> 31); - } - return cc; -} - -/* - * Reduce a big integer d modulo a small integer p. - * Rules: - * d is unsigned - * p is prime - * 2^30 < p < 2^31 - * p0i = -(1/p) mod 2^31 - * R2 = 2^62 mod p - */ -static uint32_t -zint_mod_small_unsigned(const uint32_t *d, size_t dlen, - uint32_t p, uint32_t p0i, uint32_t R2) -{ - uint32_t x; - size_t u; - - /* - * Algorithm: we inject words one by one, starting with the high - * word. Each step is: - * - multiply x by 2^31 - * - add new word - */ - x = 0; - u = dlen; - while (u -- > 0) { - uint32_t w; - - x = modp_montymul(x, R2, p, p0i); - w = d[u] - p; - w += p & -(w >> 31); - x = modp_add(x, w, p); - } - return x; -} - -/* - * Similar to zint_mod_small_unsigned(), except that d may be signed. - * Extra parameter is Rx = 2^(31*dlen) mod p. - */ -static uint32_t -zint_mod_small_signed(const uint32_t *d, size_t dlen, - uint32_t p, uint32_t p0i, uint32_t R2, uint32_t Rx) -{ - uint32_t z; - - if (dlen == 0) { - return 0; - } - z = zint_mod_small_unsigned(d, dlen, p, p0i, R2); - z = modp_sub(z, Rx & -(d[dlen - 1] >> 30), p); - return z; -} - -/* - * Add y*s to x. 
x and y initially have length 'len' words; the new x - * has length 'len+1' words. 's' must fit on 31 bits. x[] and y[] must - * not overlap. - */ -static void -zint_add_mul_small(uint32_t *restrict x, - const uint32_t *restrict y, size_t len, uint32_t s) -{ - size_t u; - uint32_t cc; - - cc = 0; - for (u = 0; u < len; u ++) { - uint32_t xw, yw; - uint64_t z; - - xw = x[u]; - yw = y[u]; - z = (uint64_t)yw * (uint64_t)s + (uint64_t)xw + (uint64_t)cc; - x[u] = (uint32_t)z & 0x7FFFFFFF; - cc = (uint32_t)(z >> 31); - } - x[len] = cc; -} - -/* - * Normalize a modular integer around 0: if x > p/2, then x is replaced - * with x - p (signed encoding with two's complement); otherwise, x is - * untouched. The two integers x and p are encoded over the same length. - */ -static void -zint_norm_zero(uint32_t *restrict x, const uint32_t *restrict p, size_t len) -{ - size_t u; - uint32_t r, bb; - - /* - * Compare x with p/2. We use the shifted version of p, and p - * is odd, so we really compare with (p-1)/2; we want to perform - * the subtraction if and only if x > (p-1)/2. - */ - r = 0; - bb = 0; - u = len; - while (u -- > 0) { - uint32_t wx, wp, cc; - - /* - * Get the two words to compare in wx and wp (both over - * 31 bits exactly). - */ - wx = x[u]; - wp = (p[u] >> 1) | (bb << 30); - bb = p[u] & 1; - - /* - * We set cc to -1, 0 or 1, depending on whether wp is - * lower than, equal to, or greater than wx. - */ - cc = wp - wx; - cc = ((-cc) >> 31) | -(cc >> 31); - - /* - * If r != 0 then it is either 1 or -1, and we keep its - * value. Otherwise, if r = 0, then we replace it with cc. - */ - r |= cc & ((r & 1) - 1); - } - - /* - * At this point, r = -1, 0 or 1, depending on whether (p-1)/2 - * is lower than, equal to, or greater than x. We thus want to - * do the subtraction only if r = -1. - */ - zint_sub(x, p, len, r >> 31); -} - -/* - * Rebuild integers from their RNS representation. There are 'num' - * integers, and each consists in 'xlen' words. 
'xx' points at that - * first word of the first integer; subsequent integers are accessed - * by adding 'xstride' repeatedly. - * - * The words of an integer are the RNS representation of that integer, - * using the provided 'primes' are moduli. This function replaces - * each integer with its multi-word value (little-endian order). - * - * If "normalize_signed" is non-zero, then the returned value is - * normalized to the -m/2..m/2 interval (where m is the product of all - * small prime moduli); two's complement is used for negative values. - */ -static void -zint_rebuild_CRT(uint32_t *restrict xx, size_t xlen, size_t xstride, - size_t num, const small_prime *primes, int normalize_signed, - uint32_t *restrict tmp) -{ - size_t u; - uint32_t *x; - - tmp[0] = primes[0].p; - for (u = 1; u < xlen; u ++) { - /* - * At the entry of each loop iteration: - * - the first u words of each array have been - * reassembled; - * - the first u words of tmp[] contains the - * product of the prime moduli processed so far. - * - * We call 'q' the product of all previous primes. - */ - uint32_t p, p0i, s, R2; - size_t v; - - p = primes[u].p; - s = primes[u].s; - p0i = modp_ninv31(p); - R2 = modp_R2(p, p0i); - - for (v = 0, x = xx; v < num; v ++, x += xstride) { - uint32_t xp, xq, xr; - /* - * xp = the integer x modulo the prime p for this - * iteration - * xq = (x mod q) mod p - */ - xp = x[u]; - xq = zint_mod_small_unsigned(x, u, p, p0i, R2); - - /* - * New value is (x mod q) + q * (s * (xp - xq) mod p) - */ - xr = modp_montymul(s, modp_sub(xp, xq, p), p, p0i); - zint_add_mul_small(x, tmp, u, xr); - } - - /* - * Update product of primes in tmp[]. - */ - tmp[u] = zint_mul_small(tmp, u, p); - } - - /* - * Normalize the reconstructed values around 0. - */ - if (normalize_signed) { - for (u = 0, x = xx; u < num; u ++, x += xstride) { - zint_norm_zero(x, tmp, xlen); - } - } -} - -/* - * Negate a big integer conditionally: value a is replaced with -a if - * and only if ctl = 1. 
Control value ctl must be 0 or 1. - */ -static void -zint_negate(uint32_t *a, size_t len, uint32_t ctl) -{ - size_t u; - uint32_t cc, m; - - /* - * If ctl = 1 then we flip the bits of a by XORing with - * 0x7FFFFFFF, and we add 1 to the value. If ctl = 0 then we XOR - * with 0 and add 0, which leaves the value unchanged. - */ - cc = ctl; - m = -ctl >> 1; - for (u = 0; u < len; u ++) { - uint32_t aw; - - aw = a[u]; - aw = (aw ^ m) + cc; - a[u] = aw & 0x7FFFFFFF; - cc = aw >> 31; - } -} - -/* - * Replace a with (a*xa+b*xb)/(2^31) and b with (a*ya+b*yb)/(2^31). - * The low bits are dropped (the caller should compute the coefficients - * such that these dropped bits are all zeros). If either or both - * yields a negative value, then the value is negated. - * - * Returned value is: - * 0 both values were positive - * 1 new a had to be negated - * 2 new b had to be negated - * 3 both new a and new b had to be negated - * - * Coefficients xa, xb, ya and yb may use the full signed 32-bit range. - */ -static uint32_t -zint_co_reduce(uint32_t *a, uint32_t *b, size_t len, - int64_t xa, int64_t xb, int64_t ya, int64_t yb) -{ - size_t u; - int64_t cca, ccb; - uint32_t nega, negb; - - cca = 0; - ccb = 0; - for (u = 0; u < len; u ++) { - uint32_t wa, wb; - uint64_t za, zb; - - wa = a[u]; - wb = b[u]; - za = wa * (uint64_t)xa + wb * (uint64_t)xb + (uint64_t)cca; - zb = wa * (uint64_t)ya + wb * (uint64_t)yb + (uint64_t)ccb; - if (u > 0) { - a[u - 1] = (uint32_t)za & 0x7FFFFFFF; - b[u - 1] = (uint32_t)zb & 0x7FFFFFFF; - } - cca = *(int64_t *)&za >> 31; - ccb = *(int64_t *)&zb >> 31; - } - a[len - 1] = (uint32_t)cca; - b[len - 1] = (uint32_t)ccb; - - nega = (uint32_t)((uint64_t)cca >> 63); - negb = (uint32_t)((uint64_t)ccb >> 63); - zint_negate(a, len, nega); - zint_negate(b, len, negb); - return nega | (negb << 1); -} - -/* - * Finish modular reduction. 
Rules on input parameters: - * - * if neg = 1, then -m <= a < 0 - * if neg = 0, then 0 <= a < 2*m - * - * If neg = 0, then the top word of a[] is allowed to use 32 bits. - * - * Modulus m must be odd. - */ -static void -zint_finish_mod(uint32_t *a, size_t len, const uint32_t *m, uint32_t neg) -{ - size_t u; - uint32_t cc, xm, ym; - - /* - * First pass: compare a (assumed nonnegative) with m. Note that - * if the top word uses 32 bits, subtracting m must yield a - * value less than 2^31 since a < 2*m. - */ - cc = 0; - for (u = 0; u < len; u ++) { - cc = (a[u] - m[u] - cc) >> 31; - } - - /* - * If neg = 1 then we must add m (regardless of cc) - * If neg = 0 and cc = 0 then we must subtract m - * If neg = 0 and cc = 1 then we must do nothing - * - * In the loop below, we conditionally subtract either m or -m - * from a. Word xm is a word of m (if neg = 0) or -m (if neg = 1); - * but if neg = 0 and cc = 1, then ym = 0 and it forces mw to 0. - */ - xm = -neg >> 1; - ym = -(neg | (1 - cc)); - cc = neg; - for (u = 0; u < len; u ++) { - uint32_t aw, mw; - - aw = a[u]; - mw = (m[u] ^ xm) & ym; - aw = aw - mw - cc; - a[u] = aw & 0x7FFFFFFF; - cc = aw >> 31; - } -} - -/* - * Replace a with (a*xa+b*xb)/(2^31) mod m, and b with - * (a*ya+b*yb)/(2^31) mod m. Modulus m must be odd; m0i = -1/m[0] mod 2^31. - */ -static void -zint_co_reduce_mod(uint32_t *a, uint32_t *b, const uint32_t *m, size_t len, - uint32_t m0i, int64_t xa, int64_t xb, int64_t ya, int64_t yb) -{ - size_t u; - int64_t cca, ccb; - uint32_t fa, fb; - - /* - * These are actually four combined Montgomery multiplications. 
- */ - cca = 0; - ccb = 0; - fa = ((a[0] * (uint32_t)xa + b[0] * (uint32_t)xb) * m0i) & 0x7FFFFFFF; - fb = ((a[0] * (uint32_t)ya + b[0] * (uint32_t)yb) * m0i) & 0x7FFFFFFF; - for (u = 0; u < len; u ++) { - uint32_t wa, wb; - uint64_t za, zb; - - wa = a[u]; - wb = b[u]; - za = wa * (uint64_t)xa + wb * (uint64_t)xb - + m[u] * (uint64_t)fa + (uint64_t)cca; - zb = wa * (uint64_t)ya + wb * (uint64_t)yb - + m[u] * (uint64_t)fb + (uint64_t)ccb; - if (u > 0) { - a[u - 1] = (uint32_t)za & 0x7FFFFFFF; - b[u - 1] = (uint32_t)zb & 0x7FFFFFFF; - } - cca = *(int64_t *)&za >> 31; - ccb = *(int64_t *)&zb >> 31; - } - a[len - 1] = (uint32_t)cca; - b[len - 1] = (uint32_t)ccb; - - /* - * At this point: - * -m <= a < 2*m - * -m <= b < 2*m - * (this is a case of Montgomery reduction) - * The top words of 'a' and 'b' may have a 32-th bit set. - * We want to add or subtract the modulus, as required. - */ - zint_finish_mod(a, len, m, (uint32_t)((uint64_t)cca >> 63)); - zint_finish_mod(b, len, m, (uint32_t)((uint64_t)ccb >> 63)); -} - -/* - * Compute a GCD between two positive big integers x and y. The two - * integers must be odd. Returned value is 1 if the GCD is 1, 0 - * otherwise. When 1 is returned, arrays u and v are filled with values - * such that: - * 0 <= u <= y - * 0 <= v <= x - * x*u - y*v = 1 - * x[] and y[] are unmodified. Both input values must have the same - * encoded length. Temporary array must be large enough to accommodate 4 - * extra values of that length. Arrays u, v and tmp may not overlap with - * each other, or with either x or y. - */ -static int -zint_bezout(uint32_t *restrict u, uint32_t *restrict v, - const uint32_t *restrict x, const uint32_t *restrict y, - size_t len, uint32_t *restrict tmp) -{ - /* - * Algorithm is an extended binary GCD. 
We maintain 6 values - * a, b, u0, u1, v0 and v1 with the following invariants: - * - * a = x*u0 - y*v0 - * b = x*u1 - y*v1 - * 0 <= a <= x - * 0 <= b <= y - * 0 <= u0 < y - * 0 <= v0 < x - * 0 <= u1 <= y - * 0 <= v1 < x - * - * Initial values are: - * - * a = x u0 = 1 v0 = 0 - * b = y u1 = y v1 = x-1 - * - * Each iteration reduces either a or b, and maintains the - * invariants. Algorithm stops when a = b, at which point their - * common value is GCD(a,b) and (u0,v0) (or (u1,v1)) contains - * the values (u,v) we want to return. - * - * The formal definition of the algorithm is a sequence of steps: - * - * - If a is even, then: - * a <- a/2 - * u0 <- u0/2 mod y - * v0 <- v0/2 mod x - * - * - Otherwise, if b is even, then: - * b <- b/2 - * u1 <- u1/2 mod y - * v1 <- v1/2 mod x - * - * - Otherwise, if a > b, then: - * a <- (a-b)/2 - * u0 <- (u0-u1)/2 mod y - * v0 <- (v0-v1)/2 mod x - * - * - Otherwise: - * b <- (b-a)/2 - * u1 <- (u1-u0)/2 mod y - * v1 <- (v1-v0)/2 mod y - * - * We can show that the operations above preserve the invariants: - * - * - If a is even, then u0 and v0 are either both even or both - * odd (since a = x*u0 - y*v0, and x and y are both odd). - * If u0 and v0 are both even, then (u0,v0) <- (u0/2,v0/2). - * Otherwise, (u0,v0) <- ((u0+y)/2,(v0+x)/2). Either way, - * the a = x*u0 - y*v0 invariant is preserved. - * - * - The same holds for the case where b is even. - * - * - If a and b are odd, and a > b, then: - * - * a-b = x*(u0-u1) - y*(v0-v1) - * - * In that situation, if u0 < u1, then x*(u0-u1) < 0, but - * a-b > 0; therefore, it must be that v0 < v1, and the - * first part of the update is: (u0,v0) <- (u0-u1+y,v0-v1+x), - * which preserves the invariants. Otherwise, if u0 > u1, - * then u0-u1 >= 1, thus x*(u0-u1) >= x. But a <= x and - * b >= 0, hence a-b <= x. It follows that, in that case, - * v0-v1 >= 0. The first part of the update is then: - * (u0,v0) <- (u0-u1,v0-v1), which again preserves the - * invariants. 
- * - * Either way, once the subtraction is done, the new value of - * a, which is the difference of two odd values, is even, - * and the remaining of this step is a subcase of the - * first algorithm case (i.e. when a is even). - * - * - If a and b are odd, and b > a, then the a similar - * argument holds. - * - * The values a and b start at x and y, respectively. Since x - * and y are odd, their GCD is odd, and it is easily seen that - * all steps conserve the GCD (GCD(a-b,b) = GCD(a, b); - * GCD(a/2,b) = GCD(a,b) if GCD(a,b) is odd). Moreover, either a - * or b is reduced by at least one bit at each iteration, so - * the algorithm necessarily converges on the case a = b, at - * which point the common value is the GCD. - * - * In the algorithm expressed above, when a = b, the fourth case - * applies, and sets b = 0. Since a contains the GCD of x and y, - * which are both odd, a must be odd, and subsequent iterations - * (if any) will simply divide b by 2 repeatedly, which has no - * consequence. Thus, the algorithm can run for more iterations - * than necessary; the final GCD will be in a, and the (u,v) - * coefficients will be (u0,v0). - * - * - * The presentation above is bit-by-bit. It can be sped up by - * noticing that all decisions are taken based on the low bits - * and high bits of a and b. We can extract the two top words - * and low word of each of a and b, and compute reduction - * parameters pa, pb, qa and qb such that the new values for - * a and b are: - * a' = (a*pa + b*pb) / (2^31) - * b' = (a*qa + b*qb) / (2^31) - * the two divisions being exact. The coefficients are obtained - * just from the extracted words, and may be slightly off, requiring - * an optional correction: if a' < 0, then we replace pa with -pa - * and pb with -pb. Each such step will reduce the total length - * (sum of lengths of a and b) by at least 30 bits at each - * iteration. 
- */ - uint32_t *u0, *u1, *v0, *v1, *a, *b; - uint32_t x0i, y0i; - uint32_t num, rc; - size_t j; - - if (len == 0) { - return 0; - } - - /* - * u0 and v0 are the u and v result buffers; the four other - * values (u1, v1, a and b) are taken from tmp[]. - */ - u0 = u; - v0 = v; - u1 = tmp; - v1 = u1 + len; - a = v1 + len; - b = a + len; - - /* - * We'll need the Montgomery reduction coefficients. - */ - x0i = modp_ninv31(x[0]); - y0i = modp_ninv31(y[0]); - - /* - * Initialize a, b, u0, u1, v0 and v1. - * a = x u0 = 1 v0 = 0 - * b = y u1 = y v1 = x-1 - * Note that x is odd, so computing x-1 is easy. - */ - memcpy(a, x, len * sizeof *x); - memcpy(b, y, len * sizeof *y); - u0[0] = 1; - memset(u0 + 1, 0, (len - 1) * sizeof *u0); - memset(v0, 0, len * sizeof *v0); - memcpy(u1, y, len * sizeof *u1); - memcpy(v1, x, len * sizeof *v1); - v1[0] --; - - /* - * Each input operand may be as large as 31*len bits, and we - * reduce the total length by at least 30 bits at each iteration. - */ - for (num = 62 * (uint32_t)len + 30; num >= 30; num -= 30) { - uint32_t c0, c1; - uint32_t a0, a1, b0, b1; - uint64_t a_hi, b_hi; - uint32_t a_lo, b_lo; - int64_t pa, pb, qa, qb; - int i; - uint32_t r; - - /* - * Extract the top words of a and b. If j is the highest - * index >= 1 such that a[j] != 0 or b[j] != 0, then we - * want (a[j] << 31) + a[j-1] and (b[j] << 31) + b[j-1]. - * If a and b are down to one word each, then we use - * a[0] and b[0]. - */ - c0 = (uint32_t)-1; - c1 = (uint32_t)-1; - a0 = 0; - a1 = 0; - b0 = 0; - b1 = 0; - j = len; - while (j -- > 0) { - uint32_t aw, bw; - - aw = a[j]; - bw = b[j]; - a0 ^= (a0 ^ aw) & c0; - a1 ^= (a1 ^ aw) & c1; - b0 ^= (b0 ^ bw) & c0; - b1 ^= (b1 ^ bw) & c1; - c1 = c0; - c0 &= (((aw | bw) + 0x7FFFFFFF) >> 31) - (uint32_t)1; - } - - /* - * If c1 = 0, then we grabbed two words for a and b. - * If c1 != 0 but c0 = 0, then we grabbed one word. It - * is not possible that c1 != 0 and c0 != 0, because that - * would mean that both integers are zero. 
- */ - a1 |= a0 & c1; - a0 &= ~c1; - b1 |= b0 & c1; - b0 &= ~c1; - a_hi = ((uint64_t)a0 << 31) + a1; - b_hi = ((uint64_t)b0 << 31) + b1; - a_lo = a[0]; - b_lo = b[0]; - - /* - * Compute reduction factors: - * - * a' = a*pa + b*pb - * b' = a*qa + b*qb - * - * such that a' and b' are both multiple of 2^31, but are - * only marginally larger than a and b. - */ - pa = 1; - pb = 0; - qa = 0; - qb = 1; - for (i = 0; i < 31; i ++) { - /* - * At each iteration: - * - * a <- (a-b)/2 if: a is odd, b is odd, a_hi > b_hi - * b <- (b-a)/2 if: a is odd, b is odd, a_hi <= b_hi - * a <- a/2 if: a is even - * b <- b/2 if: a is odd, b is even - * - * We multiply a_lo and b_lo by 2 at each - * iteration, thus a division by 2 really is a - * non-multiplication by 2. - */ - uint32_t rt, oa, ob, cAB, cBA, cA; - uint64_t rz; - - /* - * rt = 1 if a_hi > b_hi, 0 otherwise. - */ - rz = b_hi - a_hi; - rt = (uint32_t)((rz ^ ((a_hi ^ b_hi) - & (a_hi ^ rz))) >> 63); - - /* - * cAB = 1 if b must be subtracted from a - * cBA = 1 if a must be subtracted from b - * cA = 1 if a must be divided by 2 - * - * Rules: - * - * cAB and cBA cannot both be 1. - * If a is not divided by 2, b is. - */ - oa = (a_lo >> i) & 1; - ob = (b_lo >> i) & 1; - cAB = oa & ob & rt; - cBA = oa & ob & ~rt; - cA = cAB | (oa ^ 1); - - /* - * Conditional subtractions. - */ - a_lo -= b_lo & -cAB; - a_hi -= b_hi & -(uint64_t)cAB; - pa -= qa & -(int64_t)cAB; - pb -= qb & -(int64_t)cAB; - b_lo -= a_lo & -cBA; - b_hi -= a_hi & -(uint64_t)cBA; - qa -= pa & -(int64_t)cBA; - qb -= pb & -(int64_t)cBA; - - /* - * Shifting. - */ - a_lo += a_lo & (cA - 1); - pa += pa & ((int64_t)cA - 1); - pb += pb & ((int64_t)cA - 1); - a_hi ^= (a_hi ^ (a_hi >> 1)) & -(uint64_t)cA; - b_lo += b_lo & -cA; - qa += qa & -(int64_t)cA; - qb += qb & -(int64_t)cA; - b_hi ^= (b_hi ^ (b_hi >> 1)) & ((uint64_t)cA - 1); - } - - /* - * Apply the computed parameters to our values. 
We - * may have to correct pa and pb depending on the - * returned value of zint_co_reduce() (when a and/or b - * had to be negated). - */ - r = zint_co_reduce(a, b, len, pa, pb, qa, qb); - pa -= (pa + pa) & -(int64_t)(r & 1); - pb -= (pb + pb) & -(int64_t)(r & 1); - qa -= (qa + qa) & -(int64_t)(r >> 1); - qb -= (qb + qb) & -(int64_t)(r >> 1); - zint_co_reduce_mod(u0, u1, y, len, y0i, pa, pb, qa, qb); - zint_co_reduce_mod(v0, v1, x, len, x0i, pa, pb, qa, qb); - } - - /* - * At that point, array a[] should contain the GCD, and the - * results (u,v) should already be set. We check that the GCD - * is indeed 1. We also check that the two operands x and y - * are odd. - */ - rc = a[0] ^ 1; - for (j = 1; j < len; j ++) { - rc |= a[j]; - } - return (int)((1 - ((rc | -rc) >> 31)) & x[0] & y[0]); -} - -/* - * Add k*y*2^sc to x. The result is assumed to fit in the array of - * size xlen (truncation is applied if necessary). - * Scale factor 'sc' is provided as sch and scl, such that: - * sch = sc / 31 - * scl = sc % 31 - * xlen MUST NOT be lower than ylen. - * - * x[] and y[] are both signed integers, using two's complement for - * negative values. - */ -static void -zint_add_scaled_mul_small(uint32_t *restrict x, size_t xlen, - const uint32_t *restrict y, size_t ylen, int32_t k, - uint32_t sch, uint32_t scl) -{ - size_t u; - uint32_t ysign, tw; - int32_t cc; - - if (ylen == 0) { - return; - } - - ysign = -(y[ylen - 1] >> 30) >> 1; - tw = 0; - cc = 0; - for (u = sch; u < xlen; u ++) { - size_t v; - uint32_t wy, wys, ccu; - uint64_t z; - - /* - * Get the next word of y (scaled). - */ - v = u - sch; - wy = v < ylen ? y[v] : ysign; - wys = ((wy << scl) & 0x7FFFFFFF) | tw; - tw = wy >> (31 - scl); - - /* - * The expression below does not overflow. 
- */ - z = (uint64_t)((int64_t)wys * (int64_t)k + (int64_t)x[u] + cc); - x[u] = (uint32_t)z & 0x7FFFFFFF; - - /* - * Right-shifting the signed value z would yield - * implementation-defined results (arithmetic shift is - * not guaranteed). However, we can cast to unsigned, - * and get the next carry as an unsigned word. We can - * then convert it back to signed by using the guaranteed - * fact that 'int32_t' uses two's complement with no - * trap representation or padding bit, and with a layout - * compatible with that of 'uint32_t'. - */ - ccu = (uint32_t)(z >> 31); - cc = *(int32_t *)&ccu; - } -} - -/* - * Subtract y*2^sc from x. The result is assumed to fit in the array of - * size xlen (truncation is applied if necessary). - * Scale factor 'sc' is provided as sch and scl, such that: - * sch = sc / 31 - * scl = sc % 31 - * xlen MUST NOT be lower than ylen. - * - * x[] and y[] are both signed integers, using two's complement for - * negative values. - */ -static void -zint_sub_scaled(uint32_t *restrict x, size_t xlen, - const uint32_t *restrict y, size_t ylen, uint32_t sch, uint32_t scl) -{ - size_t u; - uint32_t ysign, tw; - uint32_t cc; - - if (ylen == 0) { - return; - } - - ysign = -(y[ylen - 1] >> 30) >> 1; - tw = 0; - cc = 0; - for (u = sch; u < xlen; u ++) { - size_t v; - uint32_t w, wy, wys; - - /* - * Get the next word of y (scaled). - */ - v = u - sch; - wy = v < ylen ? y[v] : ysign; - wys = ((wy << scl) & 0x7FFFFFFF) | tw; - tw = wy >> (31 - scl); - - w = x[u] - wys - cc; - x[u] = w & 0x7FFFFFFF; - cc = w >> 31; - } -} - -/* - * Convert a one-word signed big integer into a signed value. - */ -static inline int32_t -zint_one_to_plain(const uint32_t *x) -{ - uint32_t w; - - w = x[0]; - w |= (w & 0x40000000) << 1; - return *(int32_t *)&w; -} - -/* ==================================================================== */ - -/* - * Convert a polynomial to floating-point values. 
- * - * Each coefficient has length flen words, and starts fstride words after - * the previous. - * - * IEEE-754 binary64 values can represent values in a finite range, - * roughly 2^(-1023) to 2^(+1023); thus, if coefficients are too large, - * they should be "trimmed" by pointing not to the lowest word of each, - * but upper. - */ -static void -poly_big_to_fp(fpr *d, const uint32_t *f, size_t flen, size_t fstride, - unsigned logn) -{ - size_t n, u; - - n = MKN(logn); - if (flen == 0) { - for (u = 0; u < n; u ++) { - d[u] = fpr_zero; - } - return; - } - for (u = 0; u < n; u ++, f += fstride) { - size_t v; - uint32_t neg, cc, xm; - fpr x, fsc; - - /* - * Get sign of the integer; if it is negative, then we - * will load its absolute value instead, and negate the - * result. - */ - neg = -(f[flen - 1] >> 30); - xm = neg >> 1; - cc = neg & 1; - x = fpr_zero; - fsc = fpr_one; - for (v = 0; v < flen; v ++, fsc = fpr_mul(fsc, fpr_ptwo31)) { - uint32_t w; - - w = (f[v] ^ xm) + cc; - cc = w >> 31; - w &= 0x7FFFFFFF; - w -= (w << 1) & neg; - x = fpr_add(x, fpr_mul(fpr_of(*(int32_t *)&w), fsc)); - } - d[u] = x; - } -} - -/* - * Convert a polynomial to small integers. Source values are supposed - * to be one-word integers, signed over 31 bits. Returned value is 0 - * if any of the coefficients exceeds the provided limit (in absolute - * value), or 1 on success. - * - * This is not constant-time; this is not a problem here, because on - * any failure, the NTRU-solving process will be deemed to have failed - * and the (f,g) polynomials will be discarded. - */ -static int -poly_big_to_small(int8_t *d, const uint32_t *s, int lim, unsigned logn) -{ - size_t n, u; - - n = MKN(logn); - for (u = 0; u < n; u ++) { - int32_t z; - - z = zint_one_to_plain(s + u); - if (z < -lim || z > lim) { - return 0; - } - d[u] = (int8_t)z; - } - return 1; -} - -/* - * Subtract k*f from F, where F, f and k are polynomials modulo X^N+1. 
- * Coefficients of polynomial k are small integers (signed values in the - * -2^31..2^31 range) scaled by 2^sc. Value sc is provided as sch = sc / 31 - * and scl = sc % 31. - * - * This function implements the basic quadratic multiplication algorithm, - * which is efficient in space (no extra buffer needed) but slow at - * high degree. - */ -static void -poly_sub_scaled(uint32_t *restrict F, size_t Flen, size_t Fstride, - const uint32_t *restrict f, size_t flen, size_t fstride, - const int32_t *restrict k, uint32_t sch, uint32_t scl, unsigned logn) -{ - size_t n, u; - - n = MKN(logn); - for (u = 0; u < n; u ++) { - int32_t kf; - size_t v; - uint32_t *x; - const uint32_t *y; - - kf = -k[u]; - x = F + u * Fstride; - y = f; - for (v = 0; v < n; v ++) { - zint_add_scaled_mul_small( - x, Flen, y, flen, kf, sch, scl); - if (u + v == n - 1) { - x = F; - kf = -kf; - } else { - x += Fstride; - } - y += fstride; - } - } -} - -/* - * Subtract k*f from F. Coefficients of polynomial k are small integers - * (signed values in the -2^31..2^31 range) scaled by 2^sc. This function - * assumes that the degree is large, and integers relatively small. - * The value sc is provided as sch = sc / 31 and scl = sc % 31. - */ -static void -poly_sub_scaled_ntt(uint32_t *restrict F, size_t Flen, size_t Fstride, - const uint32_t *restrict f, size_t flen, size_t fstride, - const int32_t *restrict k, uint32_t sch, uint32_t scl, unsigned logn, - uint32_t *restrict tmp) -{ - uint32_t *gm, *igm, *fk, *t1, *x; - const uint32_t *y; - size_t n, u, tlen; - const small_prime *primes; - - n = MKN(logn); - tlen = flen + 1; - gm = tmp; - igm = gm + MKN(logn); - fk = igm + MKN(logn); - t1 = fk + n * tlen; - - primes = PRIMES; - - /* - * Compute k*f in fk[], in RNS notation. 
- */ - for (u = 0; u < tlen; u ++) { - uint32_t p, p0i, R2, Rx; - size_t v; - - p = primes[u].p; - p0i = modp_ninv31(p); - R2 = modp_R2(p, p0i); - Rx = modp_Rx((unsigned)flen, p, p0i, R2); - modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i); - - for (v = 0; v < n; v ++) { - t1[v] = modp_set(k[v], p); - } - modp_NTT2(t1, gm, logn, p, p0i); - for (v = 0, y = f, x = fk + u; - v < n; v ++, y += fstride, x += tlen) - { - *x = zint_mod_small_signed(y, flen, p, p0i, R2, Rx); - } - modp_NTT2_ext(fk + u, tlen, gm, logn, p, p0i); - for (v = 0, x = fk + u; v < n; v ++, x += tlen) { - *x = modp_montymul( - modp_montymul(t1[v], *x, p, p0i), R2, p, p0i); - } - modp_iNTT2_ext(fk + u, tlen, igm, logn, p, p0i); - } - - /* - * Rebuild k*f. - */ - zint_rebuild_CRT(fk, tlen, tlen, n, primes, 1, t1); - - /* - * Subtract k*f, scaled, from F. - */ - for (u = 0, x = F, y = fk; u < n; u ++, x += Fstride, y += tlen) { - zint_sub_scaled(x, Flen, y, tlen, sch, scl); - } -} - -/* ==================================================================== */ - -#if FALCON_KG_CHACHA20 // yyyKG_CHACHA20+1 - -#define RNG_CONTEXT prng -#define get_rng_u64 prng_get_u64 - -#else // yyyKG_CHACHA20+0 - -#define RNG_CONTEXT inner_shake256_context - -/* - * Get a random 8-byte integer from a SHAKE-based RNG. This function - * ensures consistent interpretation of the SHAKE output so that - * the same values will be obtained over different platforms, in case - * a known seed is used. - */ -static inline uint64_t -get_rng_u64(inner_shake256_context *rng) -{ - /* - * We enforce little-endian representation. - */ - -#if FALCON_LE // yyyLE+1 - /* - * On little-endian systems we just interpret the bytes "as is" - * (this is correct because the exact-width types such as - * 'uint64_t' are guaranteed to have no padding and no trap - * representation). 
- */ - uint64_t r; - - inner_shake256_extract(rng, (uint8_t *)&r, sizeof r); - return r; -#else // yyyLE+0 - uint8_t tmp[8]; - - inner_shake256_extract(rng, tmp, sizeof tmp); - return (uint64_t)tmp[0] - | ((uint64_t)tmp[1] << 8) - | ((uint64_t)tmp[2] << 16) - | ((uint64_t)tmp[3] << 24) - | ((uint64_t)tmp[4] << 32) - | ((uint64_t)tmp[5] << 40) - | ((uint64_t)tmp[6] << 48) - | ((uint64_t)tmp[7] << 56); -#endif // yyyLE- -} - -#endif // yyyKG_CHACHA20- - -/* - * Table below incarnates a discrete Gaussian distribution: - * D(x) = exp(-(x^2)/(2*sigma^2)) - * where sigma = 1.17*sqrt(q/(2*N)), q = 12289, and N = 1024. - * Element 0 of the table is P(x = 0). - * For k > 0, element k is P(x >= k+1 | x > 0). - * Probabilities are scaled up by 2^63. - */ -static const uint64_t gauss_1024_12289[] = { - 1283868770400643928u, 6416574995475331444u, 4078260278032692663u, - 2353523259288686585u, 1227179971273316331u, 575931623374121527u, - 242543240509105209u, 91437049221049666u, 30799446349977173u, - 9255276791179340u, 2478152334826140u, 590642893610164u, - 125206034929641u, 23590435911403u, 3948334035941u, - 586753615614u, 77391054539u, 9056793210u, - 940121950u, 86539696u, 7062824u, - 510971u, 32764u, 1862u, - 94u, 4u, 0u -}; - -/* - * Generate a random value with a Gaussian distribution centered on 0. - * The RNG must be ready for extraction (already flipped). - * - * Distribution has standard deviation 1.17*sqrt(q/(2*N)). The - * precomputed table is for N = 1024. Since the sum of two independent - * values of standard deviation sigma has standard deviation - * sigma*sqrt(2), then we can just generate more values and add them - * together for lower dimensions. - */ -static int -mkgauss(RNG_CONTEXT *rng, unsigned logn) -{ - unsigned u, g; - int val; - - g = 1U << (10 - logn); - val = 0; - for (u = 0; u < g; u ++) { - /* - * Each iteration generates one value with the - * Gaussian distribution for N = 1024. - * - * We use two random 64-bit values. 
First value - * decides on whether the generated value is 0, and, - * if not, the sign of the value. Second random 64-bit - * word is used to generate the non-zero value. - * - * For constant-time code we have to read the complete - * table. This has negligible cost, compared with the - * remainder of the keygen process (solving the NTRU - * equation). - */ - uint64_t r; - uint32_t f, v, k, neg; - - /* - * First value: - * - flag 'neg' is randomly selected to be 0 or 1. - * - flag 'f' is set to 1 if the generated value is zero, - * or set to 0 otherwise. - */ - r = get_rng_u64(rng); - neg = (uint32_t)(r >> 63); - r &= ~((uint64_t)1 << 63); - f = (uint32_t)((r - gauss_1024_12289[0]) >> 63); - - /* - * We produce a new random 63-bit integer r, and go over - * the array, starting at index 1. We store in v the - * index of the first array element which is not greater - * than r, unless the flag f was already 1. - */ - v = 0; - r = get_rng_u64(rng); - r &= ~((uint64_t)1 << 63); - for (k = 1; k < (sizeof gauss_1024_12289) - / (sizeof gauss_1024_12289[0]); k ++) - { - uint32_t t; - - t = (uint32_t)((r - gauss_1024_12289[k]) >> 63) ^ 1; - v |= k & -(t & (f ^ 1)); - f |= t; - } - - /* - * We apply the sign ('neg' flag). If the value is zero, - * the sign has no effect. - */ - v = (v ^ -neg) + neg; - - /* - * Generated value is added to val. - */ - val += *(int32_t *)&v; - } - return val; -} - -/* - * The MAX_BL_SMALL[] and MAX_BL_LARGE[] contain the lengths, in 31-bit - * words, of intermediate values in the computation: - * - * MAX_BL_SMALL[depth]: length for the input f and g at that depth - * MAX_BL_LARGE[depth]: length for the unreduced F and G at that depth - * - * Rules: - * - * - Within an array, values grow. - * - * - The 'SMALL' array must have an entry for maximum depth, corresponding - * to the size of values used in the binary GCD. There is no such value - * for the 'LARGE' array (the binary GCD yields already reduced - * coefficients). 
- * - * - MAX_BL_LARGE[depth] >= MAX_BL_SMALL[depth + 1]. - * - * - Values must be large enough to handle the common cases, with some - * margins. - * - * - Values must not be "too large" either because we will convert some - * integers into floating-point values by considering the top 10 words, - * i.e. 310 bits; hence, for values of length more than 10 words, we - * should take care to have the length centered on the expected size. - * - * The following average lengths, in bits, have been measured on thousands - * of random keys (fg = max length of the absolute value of coefficients - * of f and g at that depth; FG = idem for the unreduced F and G; for the - * maximum depth, F and G are the output of binary GCD, multiplied by q; - * for each value, the average and standard deviation are provided). - * - * Binary case: - * depth: 10 fg: 6307.52 (24.48) FG: 6319.66 (24.51) - * depth: 9 fg: 3138.35 (12.25) FG: 9403.29 (27.55) - * depth: 8 fg: 1576.87 ( 7.49) FG: 4703.30 (14.77) - * depth: 7 fg: 794.17 ( 4.98) FG: 2361.84 ( 9.31) - * depth: 6 fg: 400.67 ( 3.10) FG: 1188.68 ( 6.04) - * depth: 5 fg: 202.22 ( 1.87) FG: 599.81 ( 3.87) - * depth: 4 fg: 101.62 ( 1.02) FG: 303.49 ( 2.38) - * depth: 3 fg: 50.37 ( 0.53) FG: 153.65 ( 1.39) - * depth: 2 fg: 24.07 ( 0.25) FG: 78.20 ( 0.73) - * depth: 1 fg: 10.99 ( 0.08) FG: 39.82 ( 0.41) - * depth: 0 fg: 4.00 ( 0.00) FG: 19.61 ( 0.49) - * - * Integers are actually represented either in binary notation over - * 31-bit words (signed, using two's complement), or in RNS, modulo - * many small primes. These small primes are close to, but slightly - * lower than, 2^31. Use of RNS loses less than two bits, even for - * the largest values. - * - * IMPORTANT: if these values are modified, then the temporary buffer - * sizes (FALCON_KEYGEN_TEMP_*, in inner.h) must be recomputed - * accordingly. 
- */ - -static const size_t MAX_BL_SMALL[] = { - 1, 1, 2, 2, 4, 7, 14, 27, 53, 106, 209 -}; - -static const size_t MAX_BL_LARGE[] = { - 2, 2, 5, 7, 12, 21, 40, 78, 157, 308 -}; - -/* - * Average and standard deviation for the maximum size (in bits) of - * coefficients of (f,g), depending on depth. These values are used - * to compute bounds for Babai's reduction. - */ -static const struct { - int avg; - int std; -} BITLENGTH[] = { - { 4, 0 }, - { 11, 1 }, - { 24, 1 }, - { 50, 1 }, - { 102, 1 }, - { 202, 2 }, - { 401, 4 }, - { 794, 5 }, - { 1577, 8 }, - { 3138, 13 }, - { 6308, 25 } -}; - -/* - * Minimal recursion depth at which we rebuild intermediate values - * when reconstructing f and g. - */ -#define DEPTH_INT_FG 4 - -/* - * Compute squared norm of a short vector. Returned value is saturated to - * 2^32-1 if it is not lower than 2^31. - */ -static uint32_t -poly_small_sqnorm(const int8_t *f, unsigned logn) -{ - size_t n, u; - uint32_t s, ng; - - n = MKN(logn); - s = 0; - ng = 0; - for (u = 0; u < n; u ++) { - int32_t z; - - z = f[u]; - s += (uint32_t)(z * z); - ng |= s; - } - return s | -(ng >> 31); -} - -/* - * Align (upwards) the provided 'data' pointer with regards to 'base' - * so that the offset is a multiple of the size of 'fpr'. - */ -static fpr * -align_fpr(void *base, void *data) -{ - uint8_t *cb, *cd; - size_t k, km; - - cb = base; - cd = data; - k = (size_t)(cd - cb); - km = k % sizeof(fpr); - if (km) { - k += (sizeof(fpr)) - km; - } - return (fpr *)(cb + k); -} - -/* - * Align (upwards) the provided 'data' pointer with regards to 'base' - * so that the offset is a multiple of the size of 'uint32_t'. - */ -static uint32_t * -align_u32(void *base, void *data) -{ - uint8_t *cb, *cd; - size_t k, km; - - cb = base; - cd = data; - k = (size_t)(cd - cb); - km = k % sizeof(uint32_t); - if (km) { - k += (sizeof(uint32_t)) - km; - } - return (uint32_t *)(cb + k); -} - -/* - * Convert a small vector to floating point. 
- */ -static void -poly_small_to_fp(fpr *x, const int8_t *f, unsigned logn) -{ - size_t n, u; - - n = MKN(logn); - for (u = 0; u < n; u ++) { - x[u] = fpr_of(f[u]); - } -} - -/* - * Input: f,g of degree N = 2^logn; 'depth' is used only to get their - * individual length. - * - * Output: f',g' of degree N/2, with the length for 'depth+1'. - * - * Values are in RNS; input and/or output may also be in NTT. - */ -static void -make_fg_step(uint32_t *data, unsigned logn, unsigned depth, - int in_ntt, int out_ntt) -{ - size_t n, hn, u; - size_t slen, tlen; - uint32_t *fd, *gd, *fs, *gs, *gm, *igm, *t1; - const small_prime *primes; - - n = (size_t)1 << logn; - hn = n >> 1; - slen = MAX_BL_SMALL[depth]; - tlen = MAX_BL_SMALL[depth + 1]; - primes = PRIMES; - - /* - * Prepare room for the result. - */ - fd = data; - gd = fd + hn * tlen; - fs = gd + hn * tlen; - gs = fs + n * slen; - gm = gs + n * slen; - igm = gm + n; - t1 = igm + n; - memmove(fs, data, 2 * n * slen * sizeof *data); - - /* - * First slen words: we use the input values directly, and apply - * inverse NTT as we go. 
- */ - for (u = 0; u < slen; u ++) { - uint32_t p, p0i, R2; - size_t v; - uint32_t *x; - - p = primes[u].p; - p0i = modp_ninv31(p); - R2 = modp_R2(p, p0i); - modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i); - - for (v = 0, x = fs + u; v < n; v ++, x += slen) { - t1[v] = *x; - } - if (!in_ntt) { - modp_NTT2(t1, gm, logn, p, p0i); - } - for (v = 0, x = fd + u; v < hn; v ++, x += tlen) { - uint32_t w0, w1; - - w0 = t1[(v << 1) + 0]; - w1 = t1[(v << 1) + 1]; - *x = modp_montymul( - modp_montymul(w0, w1, p, p0i), R2, p, p0i); - } - if (in_ntt) { - modp_iNTT2_ext(fs + u, slen, igm, logn, p, p0i); - } - - for (v = 0, x = gs + u; v < n; v ++, x += slen) { - t1[v] = *x; - } - if (!in_ntt) { - modp_NTT2(t1, gm, logn, p, p0i); - } - for (v = 0, x = gd + u; v < hn; v ++, x += tlen) { - uint32_t w0, w1; - - w0 = t1[(v << 1) + 0]; - w1 = t1[(v << 1) + 1]; - *x = modp_montymul( - modp_montymul(w0, w1, p, p0i), R2, p, p0i); - } - if (in_ntt) { - modp_iNTT2_ext(gs + u, slen, igm, logn, p, p0i); - } - - if (!out_ntt) { - modp_iNTT2_ext(fd + u, tlen, igm, logn - 1, p, p0i); - modp_iNTT2_ext(gd + u, tlen, igm, logn - 1, p, p0i); - } - } - - /* - * Since the fs and gs words have been de-NTTized, we can use the - * CRT to rebuild the values. - */ - zint_rebuild_CRT(fs, slen, slen, n, primes, 1, gm); - zint_rebuild_CRT(gs, slen, slen, n, primes, 1, gm); - - /* - * Remaining words: use modular reductions to extract the values. 
- */ - for (u = slen; u < tlen; u ++) { - uint32_t p, p0i, R2, Rx; - size_t v; - uint32_t *x; - - p = primes[u].p; - p0i = modp_ninv31(p); - R2 = modp_R2(p, p0i); - Rx = modp_Rx((unsigned)slen, p, p0i, R2); - modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i); - for (v = 0, x = fs; v < n; v ++, x += slen) { - t1[v] = zint_mod_small_signed(x, slen, p, p0i, R2, Rx); - } - modp_NTT2(t1, gm, logn, p, p0i); - for (v = 0, x = fd + u; v < hn; v ++, x += tlen) { - uint32_t w0, w1; - - w0 = t1[(v << 1) + 0]; - w1 = t1[(v << 1) + 1]; - *x = modp_montymul( - modp_montymul(w0, w1, p, p0i), R2, p, p0i); - } - for (v = 0, x = gs; v < n; v ++, x += slen) { - t1[v] = zint_mod_small_signed(x, slen, p, p0i, R2, Rx); - } - modp_NTT2(t1, gm, logn, p, p0i); - for (v = 0, x = gd + u; v < hn; v ++, x += tlen) { - uint32_t w0, w1; - - w0 = t1[(v << 1) + 0]; - w1 = t1[(v << 1) + 1]; - *x = modp_montymul( - modp_montymul(w0, w1, p, p0i), R2, p, p0i); - } - - if (!out_ntt) { - modp_iNTT2_ext(fd + u, tlen, igm, logn - 1, p, p0i); - modp_iNTT2_ext(gd + u, tlen, igm, logn - 1, p, p0i); - } - } -} - -/* - * Compute f and g at a specific depth, in RNS notation. - * - * Returned values are stored in the data[] array, at slen words per integer. - * - * Conditions: - * 0 <= depth <= logn - * - * Space use in data[]: enough room for any two successive values (f', g', - * f and g). 
- */ -static void -make_fg(uint32_t *data, const int8_t *f, const int8_t *g, - unsigned logn, unsigned depth, int out_ntt) -{ - size_t n, u; - uint32_t *ft, *gt, p0; - unsigned d; - const small_prime *primes; - - n = MKN(logn); - ft = data; - gt = ft + n; - primes = PRIMES; - p0 = primes[0].p; - for (u = 0; u < n; u ++) { - ft[u] = modp_set(f[u], p0); - gt[u] = modp_set(g[u], p0); - } - - if (depth == 0 && out_ntt) { - uint32_t *gm, *igm; - uint32_t p, p0i; - - p = primes[0].p; - p0i = modp_ninv31(p); - gm = gt + n; - igm = gm + MKN(logn); - modp_mkgm2(gm, igm, logn, primes[0].g, p, p0i); - modp_NTT2(ft, gm, logn, p, p0i); - modp_NTT2(gt, gm, logn, p, p0i); - return; - } - - for (d = 0; d < depth; d ++) { - make_fg_step(data, logn - d, d, - d != 0, (d + 1) < depth || out_ntt); - } -} - -/* - * Solving the NTRU equation, deepest level: compute the resultants of - * f and g with X^N+1, and use binary GCD. The F and G values are - * returned in tmp[]. - * - * Returned value: 1 on success, 0 on error. - */ -static int -solve_NTRU_deepest(unsigned logn_top, - const int8_t *f, const int8_t *g, uint32_t *tmp) -{ - size_t len; - uint32_t *Fp, *Gp, *fp, *gp, *t1, q; - const small_prime *primes; - - len = MAX_BL_SMALL[logn_top]; - primes = PRIMES; - - Fp = tmp; - Gp = Fp + len; - fp = Gp + len; - gp = fp + len; - t1 = gp + len; - - make_fg(fp, f, g, logn_top, logn_top, 0); - - /* - * We use the CRT to rebuild the resultants as big integers. - * There are two such big integers. The resultants are always - * nonnegative. - */ - zint_rebuild_CRT(fp, len, len, 2, primes, 0, t1); - - /* - * Apply the binary GCD. The zint_bezout() function works only - * if both inputs are odd. - * - * We can test on the result and return 0 because that would - * imply failure of the NTRU solving equation, and the (f,g) - * values will be abandoned in that case. - */ - if (!zint_bezout(Gp, Fp, fp, gp, len, t1)) { - return 0; - } - - /* - * Multiply the two values by the target value q. 
Values must - * fit in the destination arrays. - * We can again test on the returned words: a non-zero output - * of zint_mul_small() means that we exceeded our array - * capacity, and that implies failure and rejection of (f,g). - */ - q = 12289; - if (zint_mul_small(Fp, len, q) != 0 - || zint_mul_small(Gp, len, q) != 0) - { - return 0; - } - - return 1; -} - -/* - * Solving the NTRU equation, intermediate level. Upon entry, the F and G - * from the previous level should be in the tmp[] array. - * This function MAY be invoked for the top-level (in which case depth = 0). - * - * Returned value: 1 on success, 0 on error. - */ -static int -solve_NTRU_intermediate(unsigned logn_top, - const int8_t *f, const int8_t *g, unsigned depth, uint32_t *tmp) -{ - /* - * In this function, 'logn' is the log2 of the degree for - * this step. If N = 2^logn, then: - * - the F and G values already in fk->tmp (from the deeper - * levels) have degree N/2; - * - this function should return F and G of degree N. - */ - unsigned logn; - size_t n, hn, slen, dlen, llen, rlen, FGlen, u; - uint32_t *Fd, *Gd, *Ft, *Gt, *ft, *gt, *t1; - fpr *rt1, *rt2, *rt3, *rt4, *rt5; - int scale_fg, minbl_fg, maxbl_fg, maxbl_FG, scale_k; - uint32_t *x, *y; - int32_t *k; - const small_prime *primes; - - logn = logn_top - depth; - n = (size_t)1 << logn; - hn = n >> 1; - - /* - * slen = size for our input f and g; also size of the reduced - * F and G we return (degree N) - * - * dlen = size of the F and G obtained from the deeper level - * (degree N/2 or N/3) - * - * llen = size for intermediary F and G before reduction (degree N) - * - * We build our non-reduced F and G as two independent halves each, - * of degree N/2 (F = F0 + X*F1, G = G0 + X*G1). - */ - slen = MAX_BL_SMALL[depth]; - dlen = MAX_BL_SMALL[depth + 1]; - llen = MAX_BL_LARGE[depth]; - primes = PRIMES; - - /* - * Fd and Gd are the F and G from the deeper level. 
- */ - Fd = tmp; - Gd = Fd + dlen * hn; - - /* - * Compute the input f and g for this level. Note that we get f - * and g in RNS + NTT representation. - */ - ft = Gd + dlen * hn; - make_fg(ft, f, g, logn_top, depth, 1); - - /* - * Move the newly computed f and g to make room for our candidate - * F and G (unreduced). - */ - Ft = tmp; - Gt = Ft + n * llen; - t1 = Gt + n * llen; - memmove(t1, ft, 2 * n * slen * sizeof *ft); - ft = t1; - gt = ft + slen * n; - t1 = gt + slen * n; - - /* - * Move Fd and Gd _after_ f and g. - */ - memmove(t1, Fd, 2 * hn * dlen * sizeof *Fd); - Fd = t1; - Gd = Fd + hn * dlen; - - /* - * We reduce Fd and Gd modulo all the small primes we will need, - * and store the values in Ft and Gt (only n/2 values in each). - */ - for (u = 0; u < llen; u ++) { - uint32_t p, p0i, R2, Rx; - size_t v; - uint32_t *xs, *ys, *xd, *yd; - - p = primes[u].p; - p0i = modp_ninv31(p); - R2 = modp_R2(p, p0i); - Rx = modp_Rx((unsigned)dlen, p, p0i, R2); - for (v = 0, xs = Fd, ys = Gd, xd = Ft + u, yd = Gt + u; - v < hn; - v ++, xs += dlen, ys += dlen, xd += llen, yd += llen) - { - *xd = zint_mod_small_signed(xs, dlen, p, p0i, R2, Rx); - *yd = zint_mod_small_signed(ys, dlen, p, p0i, R2, Rx); - } - } - - /* - * We do not need Fd and Gd after that point. - */ - - /* - * Compute our F and G modulo sufficiently many small primes. - */ - for (u = 0; u < llen; u ++) { - uint32_t p, p0i, R2; - uint32_t *gm, *igm, *fx, *gx, *Fp, *Gp; - size_t v; - - /* - * All computations are done modulo p. - */ - p = primes[u].p; - p0i = modp_ninv31(p); - R2 = modp_R2(p, p0i); - - /* - * If we processed slen words, then f and g have been - * de-NTTized, and are in RNS; we can rebuild them. 
- */ - if (u == slen) { - zint_rebuild_CRT(ft, slen, slen, n, primes, 1, t1); - zint_rebuild_CRT(gt, slen, slen, n, primes, 1, t1); - } - - gm = t1; - igm = gm + n; - fx = igm + n; - gx = fx + n; - - modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i); - - if (u < slen) { - for (v = 0, x = ft + u, y = gt + u; - v < n; v ++, x += slen, y += slen) - { - fx[v] = *x; - gx[v] = *y; - } - modp_iNTT2_ext(ft + u, slen, igm, logn, p, p0i); - modp_iNTT2_ext(gt + u, slen, igm, logn, p, p0i); - } else { - uint32_t Rx; - - Rx = modp_Rx((unsigned)slen, p, p0i, R2); - for (v = 0, x = ft, y = gt; - v < n; v ++, x += slen, y += slen) - { - fx[v] = zint_mod_small_signed(x, slen, - p, p0i, R2, Rx); - gx[v] = zint_mod_small_signed(y, slen, - p, p0i, R2, Rx); - } - modp_NTT2(fx, gm, logn, p, p0i); - modp_NTT2(gx, gm, logn, p, p0i); - } - - /* - * Get F' and G' modulo p and in NTT representation - * (they have degree n/2). These values were computed in - * a previous step, and stored in Ft and Gt. - */ - Fp = gx + n; - Gp = Fp + hn; - for (v = 0, x = Ft + u, y = Gt + u; - v < hn; v ++, x += llen, y += llen) - { - Fp[v] = *x; - Gp[v] = *y; - } - modp_NTT2(Fp, gm, logn - 1, p, p0i); - modp_NTT2(Gp, gm, logn - 1, p, p0i); - - /* - * Compute our F and G modulo p. - * - * General case: - * - * we divide degree by d = 2 or 3 - * f'(x^d) = N(f)(x^d) = f * adj(f) - * g'(x^d) = N(g)(x^d) = g * adj(g) - * f'*G' - g'*F' = q - * F = F'(x^d) * adj(g) - * G = G'(x^d) * adj(f) - * - * We compute things in the NTT. We group roots of phi - * such that all roots x in a group share the same x^d. - * If the roots in a group are x_1, x_2... x_d, then: - * - * N(f)(x_1^d) = f(x_1)*f(x_2)*...*f(x_d) - * - * Thus, we have: - * - * G(x_1) = f(x_2)*f(x_3)*...*f(x_d)*G'(x_1^d) - * G(x_2) = f(x_1)*f(x_3)*...*f(x_d)*G'(x_1^d) - * ... - * G(x_d) = f(x_1)*f(x_2)*...*f(x_{d-1})*G'(x_1^d) - * - * In all cases, we can thus compute F and G in NTT - * representation by a few simple multiplications. 
- * Moreover, in our chosen NTT representation, roots - * from the same group are consecutive in RAM. - */ - for (v = 0, x = Ft + u, y = Gt + u; v < hn; - v ++, x += (llen << 1), y += (llen << 1)) - { - uint32_t ftA, ftB, gtA, gtB; - uint32_t mFp, mGp; - - ftA = fx[(v << 1) + 0]; - ftB = fx[(v << 1) + 1]; - gtA = gx[(v << 1) + 0]; - gtB = gx[(v << 1) + 1]; - mFp = modp_montymul(Fp[v], R2, p, p0i); - mGp = modp_montymul(Gp[v], R2, p, p0i); - x[0] = modp_montymul(gtB, mFp, p, p0i); - x[llen] = modp_montymul(gtA, mFp, p, p0i); - y[0] = modp_montymul(ftB, mGp, p, p0i); - y[llen] = modp_montymul(ftA, mGp, p, p0i); - } - modp_iNTT2_ext(Ft + u, llen, igm, logn, p, p0i); - modp_iNTT2_ext(Gt + u, llen, igm, logn, p, p0i); - } - - /* - * Rebuild F and G with the CRT. - */ - zint_rebuild_CRT(Ft, llen, llen, n, primes, 1, t1); - zint_rebuild_CRT(Gt, llen, llen, n, primes, 1, t1); - - /* - * At that point, Ft, Gt, ft and gt are consecutive in RAM (in that - * order). - */ - - /* - * Apply Babai reduction to bring back F and G to size slen. - * - * We use the FFT to compute successive approximations of the - * reduction coefficient. We first isolate the top bits of - * the coefficients of f and g, and convert them to floating - * point; with the FFT, we compute adj(f), adj(g), and - * 1/(f*adj(f)+g*adj(g)). - * - * Then, we repeatedly apply the following: - * - * - Get the top bits of the coefficients of F and G into - * floating point, and use the FFT to compute: - * (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g)) - * - * - Convert back that value into normal representation, and - * round it to the nearest integers, yielding a polynomial k. - * Proper scaling is applied to f, g, F and G so that the - * coefficients fit on 32 bits (signed). - * - * - Subtract k*f from F and k*g from G. - * - * Under normal conditions, this process reduces the size of F - * and G by some bits at each iteration. 
For constant-time - * operation, we do not want to measure the actual length of - * F and G; instead, we do the following: - * - * - f and g are converted to floating-point, with some scaling - * if necessary to keep values in the representable range. - * - * - For each iteration, we _assume_ a maximum size for F and G, - * and use the values at that size. If we overreach, then - * we get zeros, which is harmless: the resulting coefficients - * of k will be 0 and the value won't be reduced. - * - * - We conservatively assume that F and G will be reduced by - * at least 25 bits at each iteration. - * - * Even when reaching the bottom of the reduction, reduction - * coefficient will remain low. If it goes out-of-range, then - * something wrong occurred and the whole NTRU solving fails. - */ - - /* - * Memory layout: - * - We need to compute and keep adj(f), adj(g), and - * 1/(f*adj(f)+g*adj(g)) (sizes N, N and N/2 fp numbers, - * respectively). - * - At each iteration we need two extra fp buffer (N fp values), - * and produce a k (N 32-bit words). k will be shared with one - * of the fp buffers. - * - To compute k*f and k*g efficiently (with the NTT), we need - * some extra room; we reuse the space of the temporary buffers. - * - * Arrays of 'fpr' are obtained from the temporary array itself. - * We ensure that the base is at a properly aligned offset (the - * source array tmp[] is supposed to be already aligned). - */ - - rt3 = align_fpr(tmp, t1); - rt4 = rt3 + n; - rt5 = rt4 + n; - rt1 = rt5 + (n >> 1); - k = (int32_t *)align_u32(tmp, rt1); - rt2 = align_fpr(tmp, k + n); - if (rt2 < (rt1 + n)) { - rt2 = rt1 + n; - } - t1 = (uint32_t *)k + n; - - /* - * Get f and g into rt3 and rt4 as floating-point approximations. - * - * We need to "scale down" the floating-point representation of - * coefficients when they are too big. We want to keep the value - * below 2^310 or so. Thus, when values are larger than 10 words, - * we consider only the top 10 words. 
Array lengths have been - * computed so that average maximum length will fall in the - * middle or the upper half of these top 10 words. - */ - rlen = (slen > 10) ? 10 : slen; - poly_big_to_fp(rt3, ft + slen - rlen, rlen, slen, logn); - poly_big_to_fp(rt4, gt + slen - rlen, rlen, slen, logn); - - /* - * Values in rt3 and rt4 are downscaled by 2^(scale_fg). - */ - scale_fg = 31 * (int)(slen - rlen); - - /* - * Estimated boundaries for the maximum size (in bits) of the - * coefficients of (f,g). We use the measured average, and - * allow for a deviation of at most six times the standard - * deviation. - */ - minbl_fg = BITLENGTH[depth].avg - 6 * BITLENGTH[depth].std; - maxbl_fg = BITLENGTH[depth].avg + 6 * BITLENGTH[depth].std; - - /* - * Compute 1/(f*adj(f)+g*adj(g)) in rt5. We also keep adj(f) - * and adj(g) in rt3 and rt4, respectively. - */ - Zf(FFT)(rt3, logn); - Zf(FFT)(rt4, logn); - Zf(poly_invnorm2_fft)(rt5, rt3, rt4, logn); - Zf(poly_adj_fft)(rt3, logn); - Zf(poly_adj_fft)(rt4, logn); - - /* - * Reduce F and G repeatedly. - * - * The expected maximum bit length of coefficients of F and G - * is kept in maxbl_FG, with the corresponding word length in - * FGlen. - */ - FGlen = llen; - maxbl_FG = 31 * (int)llen; - - /* - * Each reduction operation computes the reduction polynomial - * "k". We need that polynomial to have coefficients that fit - * on 32-bit signed integers, with some scaling; thus, we use - * a descending sequence of scaling values, down to zero. - * - * The size of the coefficients of k is (roughly) the difference - * between the size of the coefficients of (F,G) and the size - * of the coefficients of (f,g). Thus, the maximum size of the - * coefficients of k is, at the start, maxbl_FG - minbl_fg; - * this is our starting scale value for k. - * - * We need to estimate the size of (F,G) during the execution of - * the algorithm; we are allowed some overestimation but not too - * much (poly_big_to_fp() uses a 310-bit window). 
Generally - * speaking, after applying a reduction with k scaled to - * scale_k, the size of (F,G) will be size(f,g) + scale_k + dd, - * where 'dd' is a few bits to account for the fact that the - * reduction is never perfect (intuitively, dd is on the order - * of sqrt(N), so at most 5 bits; we here allow for 10 extra - * bits). - * - * The size of (f,g) is not known exactly, but maxbl_fg is an - * upper bound. - */ - scale_k = maxbl_FG - minbl_fg; - - for (;;) { - int scale_FG, dc, new_maxbl_FG; - uint32_t scl, sch; - fpr pdc, pt; - - /* - * Convert current F and G into floating-point. We apply - * scaling if the current length is more than 10 words. - */ - rlen = (FGlen > 10) ? 10 : FGlen; - scale_FG = 31 * (int)(FGlen - rlen); - poly_big_to_fp(rt1, Ft + FGlen - rlen, rlen, llen, logn); - poly_big_to_fp(rt2, Gt + FGlen - rlen, rlen, llen, logn); - - /* - * Compute (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g)) in rt2. - */ - Zf(FFT)(rt1, logn); - Zf(FFT)(rt2, logn); - Zf(poly_mul_fft)(rt1, rt3, logn); - Zf(poly_mul_fft)(rt2, rt4, logn); - Zf(poly_add)(rt2, rt1, logn); - Zf(poly_mul_autoadj_fft)(rt2, rt5, logn); - Zf(iFFT)(rt2, logn); - - /* - * (f,g) are scaled by 'scale_fg', meaning that the - * numbers in rt3/rt4 should be multiplied by 2^(scale_fg) - * to have their true mathematical value. - * - * (F,G) are similarly scaled by 'scale_FG'. Therefore, - * the value we computed in rt2 is scaled by - * 'scale_FG-scale_fg'. - * - * We want that value to be scaled by 'scale_k', hence we - * apply a corrective scaling. After scaling, the values - * should fit in -2^31-1..+2^31-1. - */ - dc = scale_k - scale_FG + scale_fg; - - /* - * We will need to multiply values by 2^(-dc). The value - * 'dc' is not secret, so we can compute 2^(-dc) with a - * non-constant-time process. - * (We could use ldexp(), but we prefer to avoid any - * dependency on libm. When using FP emulation, we could - * use our fpr_ldexp(), which is constant-time.) 
- */ - if (dc < 0) { - dc = -dc; - pt = fpr_two; - } else { - pt = fpr_onehalf; - } - pdc = fpr_one; - while (dc != 0) { - if ((dc & 1) != 0) { - pdc = fpr_mul(pdc, pt); - } - dc >>= 1; - pt = fpr_sqr(pt); - } - - for (u = 0; u < n; u ++) { - fpr xv; - - xv = fpr_mul(rt2[u], pdc); - - /* - * Sometimes the values can be out-of-bounds if - * the algorithm fails; we must not call - * fpr_rint() (and cast to int32_t) if the value - * is not in-bounds. Note that the test does not - * break constant-time discipline, since any - * failure here implies that we discard the current - * secret key (f,g). - */ - if (!fpr_lt(fpr_mtwo31m1, xv) - || !fpr_lt(xv, fpr_ptwo31m1)) - { - return 0; - } - k[u] = (int32_t)fpr_rint(xv); - } - - /* - * Values in k[] are integers. They really are scaled - * down by maxbl_FG - minbl_fg bits. - * - * If we are at low depth, then we use the NTT to - * compute k*f and k*g. - */ - sch = (uint32_t)(scale_k / 31); - scl = (uint32_t)(scale_k % 31); - if (depth <= DEPTH_INT_FG) { - poly_sub_scaled_ntt(Ft, FGlen, llen, ft, slen, slen, - k, sch, scl, logn, t1); - poly_sub_scaled_ntt(Gt, FGlen, llen, gt, slen, slen, - k, sch, scl, logn, t1); - } else { - poly_sub_scaled(Ft, FGlen, llen, ft, slen, slen, - k, sch, scl, logn); - poly_sub_scaled(Gt, FGlen, llen, gt, slen, slen, - k, sch, scl, logn); - } - - /* - * We compute the new maximum size of (F,G), assuming that - * (f,g) has _maximal_ length (i.e. that reduction is - * "late" instead of "early". We also adjust FGlen - * accordingly. - */ - new_maxbl_FG = scale_k + maxbl_fg + 10; - if (new_maxbl_FG < maxbl_FG) { - maxbl_FG = new_maxbl_FG; - if ((int)FGlen * 31 >= maxbl_FG + 31) { - FGlen --; - } - } - - /* - * We suppose that scaling down achieves a reduction by - * at least 25 bits per iteration. We stop when we have - * done the loop with an unscaled k. 
- */ - if (scale_k <= 0) { - break; - } - scale_k -= 25; - if (scale_k < 0) { - scale_k = 0; - } - } - - /* - * If (F,G) length was lowered below 'slen', then we must take - * care to re-extend the sign. - */ - if (FGlen < slen) { - for (u = 0; u < n; u ++, Ft += llen, Gt += llen) { - size_t v; - uint32_t sw; - - sw = -(Ft[FGlen - 1] >> 30) >> 1; - for (v = FGlen; v < slen; v ++) { - Ft[v] = sw; - } - sw = -(Gt[FGlen - 1] >> 30) >> 1; - for (v = FGlen; v < slen; v ++) { - Gt[v] = sw; - } - } - } - - /* - * Compress encoding of all values to 'slen' words (this is the - * expected output format). - */ - for (u = 0, x = tmp, y = tmp; - u < (n << 1); u ++, x += slen, y += llen) - { - memmove(x, y, slen * sizeof *y); - } - return 1; -} - -/* - * Solving the NTRU equation, binary case, depth = 1. Upon entry, the - * F and G from the previous level should be in the tmp[] array. - * - * Returned value: 1 on success, 0 on error. - */ -static int -solve_NTRU_binary_depth1(unsigned logn_top, - const int8_t *f, const int8_t *g, uint32_t *tmp) -{ - /* - * The first half of this function is a copy of the corresponding - * part in solve_NTRU_intermediate(), for the reconstruction of - * the unreduced F and G. The second half (Babai reduction) is - * done differently, because the unreduced F and G fit in 53 bits - * of precision, allowing a much simpler process with lower RAM - * usage. 
- */ - unsigned depth, logn; - size_t n_top, n, hn, slen, dlen, llen, u; - uint32_t *Fd, *Gd, *Ft, *Gt, *ft, *gt, *t1; - fpr *rt1, *rt2, *rt3, *rt4, *rt5, *rt6; - uint32_t *x, *y; - - depth = 1; - n_top = (size_t)1 << logn_top; - logn = logn_top - depth; - n = (size_t)1 << logn; - hn = n >> 1; - - /* - * Equations are: - * - * f' = f0^2 - X^2*f1^2 - * g' = g0^2 - X^2*g1^2 - * F' and G' are a solution to f'G' - g'F' = q (from deeper levels) - * F = F'*(g0 - X*g1) - * G = G'*(f0 - X*f1) - * - * f0, f1, g0, g1, f', g', F' and G' are all "compressed" to - * degree N/2 (their odd-indexed coefficients are all zero). - */ - - /* - * slen = size for our input f and g; also size of the reduced - * F and G we return (degree N) - * - * dlen = size of the F and G obtained from the deeper level - * (degree N/2) - * - * llen = size for intermediary F and G before reduction (degree N) - * - * We build our non-reduced F and G as two independent halves each, - * of degree N/2 (F = F0 + X*F1, G = G0 + X*G1). - */ - slen = MAX_BL_SMALL[depth]; - dlen = MAX_BL_SMALL[depth + 1]; - llen = MAX_BL_LARGE[depth]; - - /* - * Fd and Gd are the F and G from the deeper level. Ft and Gt - * are the destination arrays for the unreduced F and G. - */ - Fd = tmp; - Gd = Fd + dlen * hn; - Ft = Gd + dlen * hn; - Gt = Ft + llen * n; - - /* - * We reduce Fd and Gd modulo all the small primes we will need, - * and store the values in Ft and Gt. - */ - for (u = 0; u < llen; u ++) { - uint32_t p, p0i, R2, Rx; - size_t v; - uint32_t *xs, *ys, *xd, *yd; - - p = PRIMES[u].p; - p0i = modp_ninv31(p); - R2 = modp_R2(p, p0i); - Rx = modp_Rx((unsigned)dlen, p, p0i, R2); - for (v = 0, xs = Fd, ys = Gd, xd = Ft + u, yd = Gt + u; - v < hn; - v ++, xs += dlen, ys += dlen, xd += llen, yd += llen) - { - *xd = zint_mod_small_signed(xs, dlen, p, p0i, R2, Rx); - *yd = zint_mod_small_signed(ys, dlen, p, p0i, R2, Rx); - } - } - - /* - * Now Fd and Gd are not needed anymore; we can squeeze them out. 
- */ - memmove(tmp, Ft, llen * n * sizeof(uint32_t)); - Ft = tmp; - memmove(Ft + llen * n, Gt, llen * n * sizeof(uint32_t)); - Gt = Ft + llen * n; - ft = Gt + llen * n; - gt = ft + slen * n; - - t1 = gt + slen * n; - - /* - * Compute our F and G modulo sufficiently many small primes. - */ - for (u = 0; u < llen; u ++) { - uint32_t p, p0i, R2; - uint32_t *gm, *igm, *fx, *gx, *Fp, *Gp; - unsigned e; - size_t v; - - /* - * All computations are done modulo p. - */ - p = PRIMES[u].p; - p0i = modp_ninv31(p); - R2 = modp_R2(p, p0i); - - /* - * We recompute things from the source f and g, of full - * degree. However, we will need only the n first elements - * of the inverse NTT table (igm); the call to modp_mkgm() - * below will fill n_top elements in igm[] (thus overflowing - * into fx[]) but later code will overwrite these extra - * elements. - */ - gm = t1; - igm = gm + n_top; - fx = igm + n; - gx = fx + n_top; - modp_mkgm2(gm, igm, logn_top, PRIMES[u].g, p, p0i); - - /* - * Set ft and gt to f and g modulo p, respectively. - */ - for (v = 0; v < n_top; v ++) { - fx[v] = modp_set(f[v], p); - gx[v] = modp_set(g[v], p); - } - - /* - * Convert to NTT and compute our f and g. - */ - modp_NTT2(fx, gm, logn_top, p, p0i); - modp_NTT2(gx, gm, logn_top, p, p0i); - for (e = logn_top; e > logn; e --) { - modp_poly_rec_res(fx, e, p, p0i, R2); - modp_poly_rec_res(gx, e, p, p0i, R2); - } - - /* - * From that point onward, we only need tables for - * degree n, so we can save some space. - */ - if (depth > 0) { /* always true */ - memmove(gm + n, igm, n * sizeof *igm); - igm = gm + n; - memmove(igm + n, fx, n * sizeof *ft); - fx = igm + n; - memmove(fx + n, gx, n * sizeof *gt); - gx = fx + n; - } - - /* - * Get F' and G' modulo p and in NTT representation - * (they have degree n/2). These values were computed - * in a previous step, and stored in Ft and Gt. 
- */ - Fp = gx + n; - Gp = Fp + hn; - for (v = 0, x = Ft + u, y = Gt + u; - v < hn; v ++, x += llen, y += llen) - { - Fp[v] = *x; - Gp[v] = *y; - } - modp_NTT2(Fp, gm, logn - 1, p, p0i); - modp_NTT2(Gp, gm, logn - 1, p, p0i); - - /* - * Compute our F and G modulo p. - * - * Equations are: - * - * f'(x^2) = N(f)(x^2) = f * adj(f) - * g'(x^2) = N(g)(x^2) = g * adj(g) - * - * f'*G' - g'*F' = q - * - * F = F'(x^2) * adj(g) - * G = G'(x^2) * adj(f) - * - * The NTT representation of f is f(w) for all w which - * are roots of phi. In the binary case, as well as in - * the ternary case for all depth except the deepest, - * these roots can be grouped in pairs (w,-w), and we - * then have: - * - * f(w) = adj(f)(-w) - * f(-w) = adj(f)(w) - * - * and w^2 is then a root for phi at the half-degree. - * - * At the deepest level in the ternary case, this still - * holds, in the following sense: the roots of x^2-x+1 - * are (w,-w^2) (for w^3 = -1, and w != -1), and we - * have: - * - * f(w) = adj(f)(-w^2) - * f(-w^2) = adj(f)(w) - * - * In all case, we can thus compute F and G in NTT - * representation by a few simple multiplications. - * Moreover, the two roots for each pair are consecutive - * in our bit-reversal encoding. - */ - for (v = 0, x = Ft + u, y = Gt + u; - v < hn; v ++, x += (llen << 1), y += (llen << 1)) - { - uint32_t ftA, ftB, gtA, gtB; - uint32_t mFp, mGp; - - ftA = fx[(v << 1) + 0]; - ftB = fx[(v << 1) + 1]; - gtA = gx[(v << 1) + 0]; - gtB = gx[(v << 1) + 1]; - mFp = modp_montymul(Fp[v], R2, p, p0i); - mGp = modp_montymul(Gp[v], R2, p, p0i); - x[0] = modp_montymul(gtB, mFp, p, p0i); - x[llen] = modp_montymul(gtA, mFp, p, p0i); - y[0] = modp_montymul(ftB, mGp, p, p0i); - y[llen] = modp_montymul(ftA, mGp, p, p0i); - } - modp_iNTT2_ext(Ft + u, llen, igm, logn, p, p0i); - modp_iNTT2_ext(Gt + u, llen, igm, logn, p, p0i); - - /* - * Also save ft and gt (only up to size slen). 
- */ - if (u < slen) { - modp_iNTT2(fx, igm, logn, p, p0i); - modp_iNTT2(gx, igm, logn, p, p0i); - for (v = 0, x = ft + u, y = gt + u; - v < n; v ++, x += slen, y += slen) - { - *x = fx[v]; - *y = gx[v]; - } - } - } - - /* - * Rebuild f, g, F and G with the CRT. Note that the elements of F - * and G are consecutive, and thus can be rebuilt in a single - * loop; similarly, the elements of f and g are consecutive. - */ - zint_rebuild_CRT(Ft, llen, llen, n << 1, PRIMES, 1, t1); - zint_rebuild_CRT(ft, slen, slen, n << 1, PRIMES, 1, t1); - - /* - * Here starts the Babai reduction, specialized for depth = 1. - * - * Candidates F and G (from Ft and Gt), and base f and g (ft and gt), - * are converted to floating point. There is no scaling, and a - * single pass is sufficient. - */ - - /* - * Convert F and G into floating point (rt1 and rt2). - */ - rt1 = align_fpr(tmp, gt + slen * n); - rt2 = rt1 + n; - poly_big_to_fp(rt1, Ft, llen, llen, logn); - poly_big_to_fp(rt2, Gt, llen, llen, logn); - - /* - * Integer representation of F and G is no longer needed, we - * can remove it. - */ - memmove(tmp, ft, 2 * slen * n * sizeof *ft); - ft = tmp; - gt = ft + slen * n; - rt3 = align_fpr(tmp, gt + slen * n); - memmove(rt3, rt1, 2 * n * sizeof *rt1); - rt1 = rt3; - rt2 = rt1 + n; - rt3 = rt2 + n; - rt4 = rt3 + n; - - /* - * Convert f and g into floating point (rt3 and rt4). - */ - poly_big_to_fp(rt3, ft, slen, slen, logn); - poly_big_to_fp(rt4, gt, slen, slen, logn); - - /* - * Remove unneeded ft and gt. - */ - memmove(tmp, rt1, 4 * n * sizeof *rt1); - rt1 = (fpr *)tmp; - rt2 = rt1 + n; - rt3 = rt2 + n; - rt4 = rt3 + n; - - /* - * We now have: - * rt1 = F - * rt2 = G - * rt3 = f - * rt4 = g - * in that order in RAM. We convert all of them to FFT. - */ - Zf(FFT)(rt1, logn); - Zf(FFT)(rt2, logn); - Zf(FFT)(rt3, logn); - Zf(FFT)(rt4, logn); - - /* - * Compute: - * rt5 = F*adj(f) + G*adj(g) - * rt6 = 1 / (f*adj(f) + g*adj(g)) - * (Note that rt6 is half-length.) 
- */ - rt5 = rt4 + n; - rt6 = rt5 + n; - Zf(poly_add_muladj_fft)(rt5, rt1, rt2, rt3, rt4, logn); - Zf(poly_invnorm2_fft)(rt6, rt3, rt4, logn); - - /* - * Compute: - * rt5 = (F*adj(f)+G*adj(g)) / (f*adj(f)+g*adj(g)) - */ - Zf(poly_mul_autoadj_fft)(rt5, rt6, logn); - - /* - * Compute k as the rounded version of rt5. Check that none of - * the values is larger than 2^63-1 (in absolute value) - * because that would make the fpr_rint() do something undefined; - * note that any out-of-bounds value here implies a failure and - * (f,g) will be discarded, so we can make a simple test. - */ - Zf(iFFT)(rt5, logn); - for (u = 0; u < n; u ++) { - fpr z; - - z = rt5[u]; - if (!fpr_lt(z, fpr_ptwo63m1) || !fpr_lt(fpr_mtwo63m1, z)) { - return 0; - } - rt5[u] = fpr_of(fpr_rint(z)); - } - Zf(FFT)(rt5, logn); - - /* - * Subtract k*f from F, and k*g from G. - */ - Zf(poly_mul_fft)(rt3, rt5, logn); - Zf(poly_mul_fft)(rt4, rt5, logn); - Zf(poly_sub)(rt1, rt3, logn); - Zf(poly_sub)(rt2, rt4, logn); - Zf(iFFT)(rt1, logn); - Zf(iFFT)(rt2, logn); - - /* - * Convert back F and G to integers, and return. - */ - Ft = tmp; - Gt = Ft + n; - rt3 = align_fpr(tmp, Gt + n); - memmove(rt3, rt1, 2 * n * sizeof *rt1); - rt1 = rt3; - rt2 = rt1 + n; - for (u = 0; u < n; u ++) { - Ft[u] = (uint32_t)fpr_rint(rt1[u]); - Gt[u] = (uint32_t)fpr_rint(rt2[u]); - } - - return 1; -} - -/* - * Solving the NTRU equation, top level. Upon entry, the F and G - * from the previous level should be in the tmp[] array. - * - * Returned value: 1 on success, 0 on error. 
- */ -static int -solve_NTRU_binary_depth0(unsigned logn, - const int8_t *f, const int8_t *g, uint32_t *tmp) -{ - size_t n, hn, u; - uint32_t p, p0i, R2; - uint32_t *Fp, *Gp, *t1, *t2, *t3, *t4, *t5; - uint32_t *gm, *igm, *ft, *gt; - fpr *rt2, *rt3; - - n = (size_t)1 << logn; - hn = n >> 1; - - /* - * Equations are: - * - * f' = f0^2 - X^2*f1^2 - * g' = g0^2 - X^2*g1^2 - * F' and G' are a solution to f'G' - g'F' = q (from deeper levels) - * F = F'*(g0 - X*g1) - * G = G'*(f0 - X*f1) - * - * f0, f1, g0, g1, f', g', F' and G' are all "compressed" to - * degree N/2 (their odd-indexed coefficients are all zero). - * - * Everything should fit in 31-bit integers, hence we can just use - * the first small prime p = 2147473409. - */ - p = PRIMES[0].p; - p0i = modp_ninv31(p); - R2 = modp_R2(p, p0i); - - Fp = tmp; - Gp = Fp + hn; - ft = Gp + hn; - gt = ft + n; - gm = gt + n; - igm = gm + n; - - modp_mkgm2(gm, igm, logn, PRIMES[0].g, p, p0i); - - /* - * Convert F' anf G' in NTT representation. - */ - for (u = 0; u < hn; u ++) { - Fp[u] = modp_set(zint_one_to_plain(Fp + u), p); - Gp[u] = modp_set(zint_one_to_plain(Gp + u), p); - } - modp_NTT2(Fp, gm, logn - 1, p, p0i); - modp_NTT2(Gp, gm, logn - 1, p, p0i); - - /* - * Load f and g and convert them to NTT representation. - */ - for (u = 0; u < n; u ++) { - ft[u] = modp_set(f[u], p); - gt[u] = modp_set(g[u], p); - } - modp_NTT2(ft, gm, logn, p, p0i); - modp_NTT2(gt, gm, logn, p, p0i); - - /* - * Build the unreduced F,G in ft and gt. 
- */ - for (u = 0; u < n; u += 2) { - uint32_t ftA, ftB, gtA, gtB; - uint32_t mFp, mGp; - - ftA = ft[u + 0]; - ftB = ft[u + 1]; - gtA = gt[u + 0]; - gtB = gt[u + 1]; - mFp = modp_montymul(Fp[u >> 1], R2, p, p0i); - mGp = modp_montymul(Gp[u >> 1], R2, p, p0i); - ft[u + 0] = modp_montymul(gtB, mFp, p, p0i); - ft[u + 1] = modp_montymul(gtA, mFp, p, p0i); - gt[u + 0] = modp_montymul(ftB, mGp, p, p0i); - gt[u + 1] = modp_montymul(ftA, mGp, p, p0i); - } - modp_iNTT2(ft, igm, logn, p, p0i); - modp_iNTT2(gt, igm, logn, p, p0i); - - Gp = Fp + n; - t1 = Gp + n; - memmove(Fp, ft, 2 * n * sizeof *ft); - - /* - * We now need to apply the Babai reduction. At that point, - * we have F and G in two n-word arrays. - * - * We can compute F*adj(f)+G*adj(g) and f*adj(f)+g*adj(g) - * modulo p, using the NTT. We still move memory around in - * order to save RAM. - */ - t2 = t1 + n; - t3 = t2 + n; - t4 = t3 + n; - t5 = t4 + n; - - /* - * Compute the NTT tables in t1 and t2. We do not keep t2 - * (we'll recompute it later on). - */ - modp_mkgm2(t1, t2, logn, PRIMES[0].g, p, p0i); - - /* - * Convert F and G to NTT. - */ - modp_NTT2(Fp, t1, logn, p, p0i); - modp_NTT2(Gp, t1, logn, p, p0i); - - /* - * Load f and adj(f) in t4 and t5, and convert them to NTT - * representation. - */ - t4[0] = t5[0] = modp_set(f[0], p); - for (u = 1; u < n; u ++) { - t4[u] = modp_set(f[u], p); - t5[n - u] = modp_set(-f[u], p); - } - modp_NTT2(t4, t1, logn, p, p0i); - modp_NTT2(t5, t1, logn, p, p0i); - - /* - * Compute F*adj(f) in t2, and f*adj(f) in t3. - */ - for (u = 0; u < n; u ++) { - uint32_t w; - - w = modp_montymul(t5[u], R2, p, p0i); - t2[u] = modp_montymul(w, Fp[u], p, p0i); - t3[u] = modp_montymul(w, t4[u], p, p0i); - } - - /* - * Load g and adj(g) in t4 and t5, and convert them to NTT - * representation. 
- */ - t4[0] = t5[0] = modp_set(g[0], p); - for (u = 1; u < n; u ++) { - t4[u] = modp_set(g[u], p); - t5[n - u] = modp_set(-g[u], p); - } - modp_NTT2(t4, t1, logn, p, p0i); - modp_NTT2(t5, t1, logn, p, p0i); - - /* - * Add G*adj(g) to t2, and g*adj(g) to t3. - */ - for (u = 0; u < n; u ++) { - uint32_t w; - - w = modp_montymul(t5[u], R2, p, p0i); - t2[u] = modp_add(t2[u], - modp_montymul(w, Gp[u], p, p0i), p); - t3[u] = modp_add(t3[u], - modp_montymul(w, t4[u], p, p0i), p); - } - - /* - * Convert back t2 and t3 to normal representation (normalized - * around 0), and then - * move them to t1 and t2. We first need to recompute the - * inverse table for NTT. - */ - modp_mkgm2(t1, t4, logn, PRIMES[0].g, p, p0i); - modp_iNTT2(t2, t4, logn, p, p0i); - modp_iNTT2(t3, t4, logn, p, p0i); - for (u = 0; u < n; u ++) { - t1[u] = (uint32_t)modp_norm(t2[u], p); - t2[u] = (uint32_t)modp_norm(t3[u], p); - } - - /* - * At that point, array contents are: - * - * F (NTT representation) (Fp) - * G (NTT representation) (Gp) - * F*adj(f)+G*adj(g) (t1) - * f*adj(f)+g*adj(g) (t2) - * - * We want to divide t1 by t2. The result is not integral; it - * must be rounded. We thus need to use the FFT. - */ - - /* - * Get f*adj(f)+g*adj(g) in FFT representation. Since this - * polynomial is auto-adjoint, all its coordinates in FFT - * representation are actually real, so we can truncate off - * the imaginary parts. - */ - rt3 = align_fpr(tmp, t3); - for (u = 0; u < n; u ++) { - rt3[u] = fpr_of(((int32_t *)t2)[u]); - } - Zf(FFT)(rt3, logn); - rt2 = align_fpr(tmp, t2); - memmove(rt2, rt3, hn * sizeof *rt3); - - /* - * Convert F*adj(f)+G*adj(g) in FFT representation. - */ - rt3 = rt2 + hn; - for (u = 0; u < n; u ++) { - rt3[u] = fpr_of(((int32_t *)t1)[u]); - } - Zf(FFT)(rt3, logn); - - /* - * Compute (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g)) and get - * its rounded normal representation in t1. 
- */ - Zf(poly_div_autoadj_fft)(rt3, rt2, logn); - Zf(iFFT)(rt3, logn); - for (u = 0; u < n; u ++) { - t1[u] = modp_set((int32_t)fpr_rint(rt3[u]), p); - } - - /* - * RAM contents are now: - * - * F (NTT representation) (Fp) - * G (NTT representation) (Gp) - * k (t1) - * - * We want to compute F-k*f, and G-k*g. - */ - t2 = t1 + n; - t3 = t2 + n; - t4 = t3 + n; - t5 = t4 + n; - modp_mkgm2(t2, t3, logn, PRIMES[0].g, p, p0i); - for (u = 0; u < n; u ++) { - t4[u] = modp_set(f[u], p); - t5[u] = modp_set(g[u], p); - } - modp_NTT2(t1, t2, logn, p, p0i); - modp_NTT2(t4, t2, logn, p, p0i); - modp_NTT2(t5, t2, logn, p, p0i); - for (u = 0; u < n; u ++) { - uint32_t kw; - - kw = modp_montymul(t1[u], R2, p, p0i); - Fp[u] = modp_sub(Fp[u], - modp_montymul(kw, t4[u], p, p0i), p); - Gp[u] = modp_sub(Gp[u], - modp_montymul(kw, t5[u], p, p0i), p); - } - modp_iNTT2(Fp, t3, logn, p, p0i); - modp_iNTT2(Gp, t3, logn, p, p0i); - for (u = 0; u < n; u ++) { - Fp[u] = (uint32_t)modp_norm(Fp[u], p); - Gp[u] = (uint32_t)modp_norm(Gp[u], p); - } - - return 1; -} - -/* - * Solve the NTRU equation. Returned value is 1 on success, 0 on error. - * G can be NULL, in which case that value is computed but not returned. - * If any of the coefficients of F and G exceeds lim (in absolute value), - * then 0 is returned. - */ -static int -solve_NTRU(unsigned logn, int8_t *F, int8_t *G, - const int8_t *f, const int8_t *g, int lim, uint32_t *tmp) -{ - size_t n, u; - uint32_t *ft, *gt, *Ft, *Gt, *gm; - uint32_t p, p0i, r; - const small_prime *primes; - - n = MKN(logn); - - if (!solve_NTRU_deepest(logn, f, g, tmp)) { - return 0; - } - - /* - * For logn <= 2, we need to use solve_NTRU_intermediate() - * directly, because coefficients are a bit too large and - * do not fit the hypotheses in solve_NTRU_binary_depth0(). 
- */ - if (logn <= 2) { - unsigned depth; - - depth = logn; - while (depth -- > 0) { - if (!solve_NTRU_intermediate(logn, f, g, depth, tmp)) { - return 0; - } - } - } else { - unsigned depth; - - depth = logn; - while (depth -- > 2) { - if (!solve_NTRU_intermediate(logn, f, g, depth, tmp)) { - return 0; - } - } - if (!solve_NTRU_binary_depth1(logn, f, g, tmp)) { - return 0; - } - if (!solve_NTRU_binary_depth0(logn, f, g, tmp)) { - return 0; - } - } - - /* - * If no buffer has been provided for G, use a temporary one. - */ - if (G == NULL) { - G = (int8_t *)(tmp + 2 * n); - } - - /* - * Final F and G are in fk->tmp, one word per coefficient - * (signed value over 31 bits). - */ - if (!poly_big_to_small(F, tmp, lim, logn) - || !poly_big_to_small(G, tmp + n, lim, logn)) - { - return 0; - } - - /* - * Verify that the NTRU equation is fulfilled. Since all elements - * have short lengths, verifying modulo a small prime p works, and - * allows using the NTT. - * - * We put Gt[] first in tmp[], and process it first, so that it does - * not overlap with G[] in case we allocated it ourselves. - */ - Gt = tmp; - ft = Gt + n; - gt = ft + n; - Ft = gt + n; - gm = Ft + n; - - primes = PRIMES; - p = primes[0].p; - p0i = modp_ninv31(p); - modp_mkgm2(gm, tmp, logn, primes[0].g, p, p0i); - for (u = 0; u < n; u ++) { - Gt[u] = modp_set(G[u], p); - } - for (u = 0; u < n; u ++) { - ft[u] = modp_set(f[u], p); - gt[u] = modp_set(g[u], p); - Ft[u] = modp_set(F[u], p); - } - modp_NTT2(ft, gm, logn, p, p0i); - modp_NTT2(gt, gm, logn, p, p0i); - modp_NTT2(Ft, gm, logn, p, p0i); - modp_NTT2(Gt, gm, logn, p, p0i); - r = modp_montymul(12289, 1, p, p0i); - for (u = 0; u < n; u ++) { - uint32_t z; - - z = modp_sub(modp_montymul(ft[u], Gt[u], p, p0i), - modp_montymul(gt[u], Ft[u], p, p0i), p); - if (z != r) { - return 0; - } - } - - return 1; -} - -/* - * Generate a random polynomial with a Gaussian distribution. 
This function - * also makes sure that the resultant of the polynomial with phi is odd. - */ -static void -poly_small_mkgauss(RNG_CONTEXT *rng, int8_t *f, unsigned logn) -{ - size_t n, u; - unsigned mod2; - - n = MKN(logn); - mod2 = 0; - for (u = 0; u < n; u ++) { - int s; - - restart: - s = mkgauss(rng, logn); - - /* - * We need the coefficient to fit within -127..+127; - * realistically, this is always the case except for - * the very low degrees (N = 2 or 4), for which there - * is no real security anyway. - */ - if (s < -127 || s > 127) { - goto restart; - } - - /* - * We need the sum of all coefficients to be 1; otherwise, - * the resultant of the polynomial with X^N+1 will be even, - * and the binary GCD will fail. - */ - if (u == n - 1) { - if ((mod2 ^ (unsigned)(s & 1)) == 0) { - goto restart; - } - } else { - mod2 ^= (unsigned)(s & 1); - } - f[u] = (int8_t)s; - } -} - -/* see falcon.h */ -void -Zf(keygen)(inner_shake256_context *rng, - int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h, - unsigned logn, uint8_t *tmp) -{ - /* - * Algorithm is the following: - * - * - Generate f and g with the Gaussian distribution. - * - * - If either Res(f,phi) or Res(g,phi) is even, try again. - * - * - If ||(f,g)|| is too large, try again. - * - * - If ||B~_{f,g}|| is too large, try again. - * - * - If f is not invertible mod phi mod q, try again. - * - * - Compute h = g/f mod phi mod q. - * - * - Solve the NTRU equation fG - gF = q; if the solving fails, - * try again. Usual failure condition is when Res(f,phi) - * and Res(g,phi) are not prime to each other. 
- */ - size_t n, u; - uint16_t *h2, *tmp2; - RNG_CONTEXT *rc; -#if FALCON_KG_CHACHA20 // yyyKG_CHACHA20+1 - prng p; -#endif // yyyKG_CHACHA20- - - n = MKN(logn); -#if FALCON_KG_CHACHA20 // yyyKG_CHACHA20+1 - Zf(prng_init)(&p, rng); - rc = &p; -#else // yyyKG_CHACHA20+0 - rc = rng; -#endif // yyyKG_CHACHA20- - - /* - * We need to generate f and g randomly, until we find values - * such that the norm of (g,-f), and of the orthogonalized - * vector, are satisfying. The orthogonalized vector is: - * (q*adj(f)/(f*adj(f)+g*adj(g)), q*adj(g)/(f*adj(f)+g*adj(g))) - * (it is actually the (N+1)-th row of the Gram-Schmidt basis). - * - * In the binary case, coefficients of f and g are generated - * independently of each other, with a discrete Gaussian - * distribution of standard deviation 1.17*sqrt(q/(2*N)). Then, - * the two vectors have expected norm 1.17*sqrt(q), which is - * also our acceptance bound: we require both vectors to be no - * larger than that (this will be satisfied about 1/4th of the - * time, thus we expect sampling new (f,g) about 4 times for that - * step). - * - * We require that Res(f,phi) and Res(g,phi) are both odd (the - * NTRU equation solver requires it). - */ - for (;;) { - fpr *rt1, *rt2, *rt3; - fpr bnorm; - uint32_t normf, normg, norm; - int lim; - - /* - * The poly_small_mkgauss() function makes sure - * that the sum of coefficients is 1 modulo 2 - * (i.e. the resultant of the polynomial with phi - * will be odd). - */ - poly_small_mkgauss(rc, f, logn); - poly_small_mkgauss(rc, g, logn); - - /* - * Verify that all coefficients are within the bounds - * defined in max_fg_bits. This is the case with - * overwhelming probability; this guarantees that the - * key will be encodable with FALCON_COMP_TRIM. - */ - lim = 1 << (Zf(max_fg_bits)[logn] - 1); - for (u = 0; u < n; u ++) { - /* - * We can use non-CT tests since on any failure - * we will discard f and g. 
- */ - if (f[u] >= lim || f[u] <= -lim - || g[u] >= lim || g[u] <= -lim) - { - lim = -1; - break; - } - } - if (lim < 0) { - continue; - } - - /* - * Bound is 1.17*sqrt(q). We compute the squared - * norms. With q = 12289, the squared bound is: - * (1.17^2)* 12289 = 16822.4121 - * Since f and g are integral, the squared norm - * of (g,-f) is an integer. - */ - normf = poly_small_sqnorm(f, logn); - normg = poly_small_sqnorm(g, logn); - norm = (normf + normg) | -((normf | normg) >> 31); - if (norm >= 16823) { - continue; - } - - /* - * We compute the orthogonalized vector norm. - */ - rt1 = (fpr *)tmp; - rt2 = rt1 + n; - rt3 = rt2 + n; - poly_small_to_fp(rt1, f, logn); - poly_small_to_fp(rt2, g, logn); - Zf(FFT)(rt1, logn); - Zf(FFT)(rt2, logn); - Zf(poly_invnorm2_fft)(rt3, rt1, rt2, logn); - Zf(poly_adj_fft)(rt1, logn); - Zf(poly_adj_fft)(rt2, logn); - Zf(poly_mulconst)(rt1, fpr_q, logn); - Zf(poly_mulconst)(rt2, fpr_q, logn); - Zf(poly_mul_autoadj_fft)(rt1, rt3, logn); - Zf(poly_mul_autoadj_fft)(rt2, rt3, logn); - Zf(iFFT)(rt1, logn); - Zf(iFFT)(rt2, logn); - bnorm = fpr_zero; - for (u = 0; u < n; u ++) { - bnorm = fpr_add(bnorm, fpr_sqr(rt1[u])); - bnorm = fpr_add(bnorm, fpr_sqr(rt2[u])); - } - if (!fpr_lt(bnorm, fpr_bnorm_max)) { - continue; - } - - /* - * Compute public key h = g/f mod X^N+1 mod q. If this - * fails, we must restart. - */ - if (h == NULL) { - h2 = (uint16_t *)tmp; - tmp2 = h2 + n; - } else { - h2 = h; - tmp2 = (uint16_t *)tmp; - } - if (!Zf(compute_public)(h2, f, g, logn, (uint8_t *)tmp2)) { - continue; - } - - /* - * Solve the NTRU equation to get F and G. - */ - lim = (1 << (Zf(max_FG_bits)[logn] - 1)) - 1; - if (!solve_NTRU(logn, F, G, f, g, lim, (uint32_t *)tmp)) { - continue; - } - - /* - * Key pair is generated. 
- */ - break; - } -} diff --git a/crypto_sign/falcon-512/m4-ct/pqm4.c b/crypto_sign/falcon-512/m4-ct/pqm4.c deleted file mode 100644 index d5b8a64e..00000000 --- a/crypto_sign/falcon-512/m4-ct/pqm4.c +++ /dev/null @@ -1,348 +0,0 @@ -#include -#include - -#include "api.h" -#include "inner.h" -#include "randombytes.h" - -/* ==================================================================== */ - -/* - * Falcon degree is N = 2^LOGN, where LOGN=9 (for Falcon-512) or 10 - * (for Falcon-1024). We use the advertised public key size to know - * which degree is used. - */ -#if CRYPTO_PUBLICKEYBYTES == 897 -#define LOGN 9 -#elif CRYPTO_PUBLICKEYBYTES == 1793 -#define LOGN 10 -#else -#error Unknown Falcon degree (unexpected public key size) -#endif - -#define N ((size_t)1 << LOGN) -#define NONCELEN 40 -#define SEEDLEN 48 - -/* - * If the private key length is larger than 10000, then this is the - * variant with precomputed expanded keys. - */ -#if CRYPTO_SECRETKEYBYTES > 10000 -#define KG_EXPAND 1 -#else -#define KG_EXPAND 0 -#endif - -/* - * Common buffer, to avoid bulky stack allocation. The buffer sizes are - * all expressed in bytes, but the buffer must be suitably aligned for - * 64-bit integers and floating-point values. 
- * - * Required size (in bytes): - * - * With expanded key: - * keygen: 48*N + 6*N = 54*N - * sign: 48*N + 2*N = 50*N - * vrfy: 8*N - * - * Without expanded key: - * keygen: 28*N + 5*N = 33*N - * sign: 72*N + 6*N = 78*N - * vrfy: 8*N - */ -static union { -#if KG_EXPAND - uint8_t b[54 * N]; -#else - uint8_t b[78 * N]; -#endif - uint64_t dummy_u64; - fpr dummy_fp; -} tmp; - - -int -crypto_sign_keypair(unsigned char *pk, unsigned char *sk) -{ - int8_t *f, *g, *F, *G; - uint16_t *h; - inner_shake256_context rng; - unsigned char seed[SEEDLEN]; -#if KG_EXPAND - size_t v; -#else - size_t u, v; -#endif - unsigned sav_cw; - -#if KG_EXPAND - f = (int8_t *)&tmp.b[48 * N]; - g = f + N; - F = g + N; - G = F + N; - h = (uint16_t *)(G + N); -#else - f = (int8_t *)&tmp.b[28 * N]; - g = f + N; - F = g + N; - G = NULL; - h = (uint16_t *)(F + N); -#endif - - randombytes(seed, SEEDLEN); - inner_shake256_init(&rng); - inner_shake256_inject(&rng, seed, SEEDLEN); - inner_shake256_flip(&rng); - sav_cw = set_fpu_cw(2); - Zf(keygen)(&rng, f, g, F, G, h, LOGN, tmp.b); - -#if KG_EXPAND - /* - * Expand private key. - */ - Zf(expand_privkey)((fpr *)sk, f, g, F, G, LOGN, tmp.b); - set_fpu_cw(sav_cw); -#else - set_fpu_cw(sav_cw); - - /* - * Encode private key. - */ - sk[0] = 0x50 + LOGN; - u = 1; - v = Zf(trim_i8_encode)(sk + u, CRYPTO_SECRETKEYBYTES - u, - f, LOGN, Zf(max_fg_bits)[LOGN]); - if (v == 0) { - return -1; - } - u += v; - v = Zf(trim_i8_encode)(sk + u, CRYPTO_SECRETKEYBYTES - u, - g, LOGN, Zf(max_fg_bits)[LOGN]); - if (v == 0) { - return -1; - } - u += v; - v = Zf(trim_i8_encode)(sk + u, CRYPTO_SECRETKEYBYTES - u, - F, LOGN, Zf(max_FG_bits)[LOGN]); - if (v == 0) { - return -1; - } - u += v; - if (u != CRYPTO_SECRETKEYBYTES) { - return -1; - } -#endif - - /* - * Encode public key. 
- */ - pk[0] = 0x00 + LOGN; - v = Zf(modq_encode)(pk + 1, CRYPTO_PUBLICKEYBYTES - 1, h, LOGN); - if (v != CRYPTO_PUBLICKEYBYTES - 1) { - return -1; - } - - return 0; -} - -int -crypto_sign(unsigned char *sm, size_t *smlen, - const unsigned char *m, size_t mlen, - const unsigned char *sk) -{ -#if KG_EXPAND - const fpr *expanded_key; -#else - int8_t *f, *g, *F, *G; - size_t u, v; -#endif - int16_t *sig; - uint16_t *hm; - unsigned char seed[SEEDLEN], nonce[NONCELEN]; - unsigned char *esig; - inner_shake256_context sc; - size_t sig_len; - unsigned sav_cw; - -#if KG_EXPAND - sig = (int16_t *)&tmp.b[48 * N]; -#else - f = (int8_t *)&tmp.b[72 * N]; - g = f + N; - F = g + N; - G = F + N; - sig = (int16_t *)(G + N); -#endif - hm = (uint16_t *)sig; /* hm[] is shared with sig[] */ - esig = (unsigned char *)tmp.b; - -#if KG_EXPAND - /* - * Expanded key is provided "as is". - */ - expanded_key = (const fpr *)sk; -#else - /* - * Decode the private key. - */ - if (sk[0] != 0x50 + LOGN) { - return -1; - } - u = 1; - v = Zf(trim_i8_decode)(f, LOGN, Zf(max_fg_bits)[LOGN], - sk + u, CRYPTO_SECRETKEYBYTES - u); - if (v == 0) { - return -1; - } - u += v; - v = Zf(trim_i8_decode)(g, LOGN, Zf(max_fg_bits)[LOGN], - sk + u, CRYPTO_SECRETKEYBYTES - u); - if (v == 0) { - return -1; - } - u += v; - v = Zf(trim_i8_decode)(F, LOGN, Zf(max_FG_bits)[LOGN], - sk + u, CRYPTO_SECRETKEYBYTES - u); - if (v == 0) { - return -1; - } - u += v; - if (u != CRYPTO_SECRETKEYBYTES) { - return -1; - } - if (!Zf(complete_private)(G, f, g, F, LOGN, tmp.b)) { - return -1; - } -#endif - - /* - * Create a random nonce (40 bytes). - */ - randombytes(nonce, NONCELEN); - - /* - * Hash message nonce + message into a vector. - */ - inner_shake256_init(&sc); - inner_shake256_inject(&sc, nonce, NONCELEN); - inner_shake256_inject(&sc, m, mlen); - inner_shake256_flip(&sc); - Zf(hash_to_point_vartime)(&sc, hm, LOGN); - - /* - * Initialize a RNG. 
- */ - randombytes(seed, SEEDLEN); - inner_shake256_init(&sc); - inner_shake256_inject(&sc, seed, SEEDLEN); - inner_shake256_flip(&sc); - - /* - * Compute the signature. - */ - sav_cw = set_fpu_cw(2); -#if KG_EXPAND - Zf(sign_tree)(sig, &sc, expanded_key, hm, LOGN, tmp.b); -#else - Zf(sign_dyn)(sig, &sc, f, g, F, G, hm, LOGN, tmp.b); -#endif - set_fpu_cw(sav_cw); - - /* - * Encode the signature and bundle it with the message. Format is: - * signature length 2 bytes, big-endian - * nonce 40 bytes - * message mlen bytes - * signature slen bytes - */ - esig[0] = 0x20 + LOGN; - sig_len = Zf(comp_encode)(esig + 1, CRYPTO_BYTES - 1, sig, LOGN); - if (sig_len == 0) { - return -1; - } - sig_len ++; - memmove(sm + 2 + NONCELEN, m, mlen); - sm[0] = (unsigned char)(sig_len >> 8); - sm[1] = (unsigned char)sig_len; - memcpy(sm + 2, nonce, NONCELEN); - memcpy(sm + 2 + NONCELEN + mlen, esig, sig_len); - *smlen = 2 + NONCELEN + mlen + sig_len; - return 0; -} - -int -crypto_sign_open(unsigned char *m, size_t *mlen, - const unsigned char *sm, size_t smlen, - const unsigned char *pk) -{ - uint16_t *h, *hm; - int16_t *sig; - const unsigned char *esig; - inner_shake256_context sc; - size_t sig_len, msg_len; - - h = (uint16_t *)&tmp.b[2 * N]; - hm = h + N; - sig = (int16_t *)(hm + N); - - /* - * Decode public key. - */ - if (pk[0] != 0x00 + LOGN) { - return -1; - } - if (Zf(modq_decode)(h, LOGN, pk + 1, CRYPTO_PUBLICKEYBYTES - 1) - != CRYPTO_PUBLICKEYBYTES - 1) - { - return -1; - } - Zf(to_ntt_monty)(h, LOGN); - - /* - * Find nonce, signature, message length. - */ - if (smlen < 2 + NONCELEN) { - return -1; - } - sig_len = ((size_t)sm[0] << 8) | (size_t)sm[1]; - if (sig_len > (smlen - 2 - NONCELEN)) { - return -1; - } - msg_len = smlen - 2 - NONCELEN - sig_len; - - /* - * Decode signature. 
- */ - esig = sm + 2 + NONCELEN + msg_len; - if (sig_len < 1 || esig[0] != 0x20 + LOGN) { - return -1; - } - if (Zf(comp_decode)(sig, LOGN, - esig + 1, sig_len - 1) != sig_len - 1) - { - return -1; - } - - /* - * Hash nonce + message into a vector. - */ - inner_shake256_init(&sc); - inner_shake256_inject(&sc, sm + 2, NONCELEN + msg_len); - inner_shake256_flip(&sc); - Zf(hash_to_point_vartime)(&sc, hm, LOGN); - - /* - * Verify signature. - */ - if (!Zf(verify_raw)(hm, sig, h, LOGN, tmp.b)) { - return -1; - } - - /* - * Return plaintext. - */ - memmove(m, sm + 2 + NONCELEN, msg_len); - *mlen = msg_len; - return 0; -} diff --git a/crypto_sign/falcon-512/m4-ct/rng.c b/crypto_sign/falcon-512/m4-ct/rng.c deleted file mode 100644 index d2ecb7af..00000000 --- a/crypto_sign/falcon-512/m4-ct/rng.c +++ /dev/null @@ -1,379 +0,0 @@ -/* - * PRNG and interface to the system RNG. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include - -#include "inner.h" - -// yyyNIST+0 yyyPQCLEAN+0 -/* - * Include relevant system header files. For Win32, this will also need - * linking with advapi32.dll, which we trigger with an appropriate #pragma. - */ -#if FALCON_RAND_GETENTROPY -#include -#endif -#if FALCON_RAND_URANDOM -#include -#if !FALCON_RAND_GETENTROPY -#include -#endif -#include -#include -#endif -#if FALCON_RAND_WIN32 -#include -#include -#pragma comment(lib, "advapi32") -#endif - -/* see inner.h */ -int -Zf(get_seed)(void *seed, size_t len) -{ - (void)seed; - if (len == 0) { - return 1; - } -#if FALCON_RAND_GETENTROPY - if (getentropy(seed, len) == 0) { - return 1; - } -#endif -#if FALCON_RAND_URANDOM - { - int f; - - f = open("/dev/urandom", O_RDONLY); - if (f >= 0) { - while (len > 0) { - ssize_t rlen; - - rlen = read(f, seed, len); - if (rlen < 0) { - if (errno == EINTR) { - continue; - } - break; - } - seed = (uint8_t *)seed + rlen; - len -= (size_t)rlen; - } - close(f); - if (len == 0) { - return 1; - } - } - } -#endif -#if FALCON_RAND_WIN32 - { - HCRYPTPROV hp; - - if (CryptAcquireContext(&hp, 0, 0, PROV_RSA_FULL, - CRYPT_VERIFYCONTEXT | CRYPT_SILENT)) - { - BOOL r; - - r = CryptGenRandom(hp, (DWORD)len, seed); - CryptReleaseContext(hp, 0); - if (r) { - return 1; - } - } - } -#endif - return 0; -} -// yyyNIST- yyyPQCLEAN- - -/* see inner.h */ -void -Zf(prng_init)(prng *p, inner_shake256_context *src) -{ -#if FALCON_LE // yyyLE+1 - inner_shake256_extract(src, p->state.d, 56); -#else // yyyLE+0 - /* - * To ensure reproducibility for a given seed, we - * must enforce little-endian interpretation of - 
* the state words. - */ - uint8_t tmp[56]; - uint64_t th, tl; - int i; - - inner_shake256_extract(src, tmp, 56); - for (i = 0; i < 14; i ++) { - uint32_t w; - - w = (uint32_t)tmp[(i << 2) + 0] - | ((uint32_t)tmp[(i << 2) + 1] << 8) - | ((uint32_t)tmp[(i << 2) + 2] << 16) - | ((uint32_t)tmp[(i << 2) + 3] << 24); - *(uint32_t *)(p->state.d + (i << 2)) = w; - } - tl = *(uint32_t *)(p->state.d + 48); - th = *(uint32_t *)(p->state.d + 52); - *(uint64_t *)(p->state.d + 48) = tl + (th << 32); -#endif // yyyLE- - Zf(prng_refill)(p); -} - -/* - * PRNG based on ChaCha20. - * - * State consists in key (32 bytes) then IV (16 bytes) and block counter - * (8 bytes). Normally, we should not care about local endianness (this - * is for a PRNG), but for the NIST competition we need reproducible KAT - * vectors that work across architectures, so we enforce little-endian - * interpretation where applicable. Moreover, output words are "spread - * out" over the output buffer with the interleaving pattern that is - * naturally obtained from the AVX2 implementation that runs eight - * ChaCha20 instances in parallel. - * - * The block counter is XORed into the first 8 bytes of the IV. - */ -TARGET_AVX2 -void -Zf(prng_refill)(prng *p) -{ -#if FALCON_AVX2 // yyyAVX2+1 - - static const uint32_t CW[] = { - 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 - }; - - uint64_t cc; - size_t u; - int i; - uint32_t *sw; - union { - uint32_t w[16]; - __m256i y[2]; /* for alignment */ - } t; - __m256i state[16], init[16]; - - sw = (uint32_t *)p->state.d; - - /* - * XOR next counter values into state. - */ - cc = *(uint64_t *)(p->state.d + 48); - for (u = 0; u < 8; u ++) { - t.w[u] = (uint32_t)(cc + u); - t.w[u + 8] = (uint32_t)((cc + u) >> 32); - } - *(uint64_t *)(p->state.d + 48) = cc + 8; - - /* - * Load state. 
- */ - for (u = 0; u < 4; u ++) { - state[u] = init[u] = - _mm256_broadcastd_epi32(_mm_cvtsi32_si128(CW[u])); - } - for (u = 0; u < 10; u ++) { - state[u + 4] = init[u + 4] = - _mm256_broadcastd_epi32(_mm_cvtsi32_si128(sw[u])); - } - state[14] = init[14] = _mm256_xor_si256( - _mm256_broadcastd_epi32(_mm_cvtsi32_si128(sw[10])), - _mm256_loadu_si256((__m256i *)&t.w[0])); - state[15] = init[15] = _mm256_xor_si256( - _mm256_broadcastd_epi32(_mm_cvtsi32_si128(sw[11])), - _mm256_loadu_si256((__m256i *)&t.w[8])); - - /* - * Do all rounds. - */ - for (i = 0; i < 10; i ++) { - -#define QROUND(a, b, c, d) do { \ - state[a] = _mm256_add_epi32(state[a], state[b]); \ - state[d] = _mm256_xor_si256(state[d], state[a]); \ - state[d] = _mm256_or_si256( \ - _mm256_slli_epi32(state[d], 16), \ - _mm256_srli_epi32(state[d], 16)); \ - state[c] = _mm256_add_epi32(state[c], state[d]); \ - state[b] = _mm256_xor_si256(state[b], state[c]); \ - state[b] = _mm256_or_si256( \ - _mm256_slli_epi32(state[b], 12), \ - _mm256_srli_epi32(state[b], 20)); \ - state[a] = _mm256_add_epi32(state[a], state[b]); \ - state[d] = _mm256_xor_si256(state[d], state[a]); \ - state[d] = _mm256_or_si256( \ - _mm256_slli_epi32(state[d], 8), \ - _mm256_srli_epi32(state[d], 24)); \ - state[c] = _mm256_add_epi32(state[c], state[d]); \ - state[b] = _mm256_xor_si256(state[b], state[c]); \ - state[b] = _mm256_or_si256( \ - _mm256_slli_epi32(state[b], 7), \ - _mm256_srli_epi32(state[b], 25)); \ - } while (0) - - QROUND( 0, 4, 8, 12); - QROUND( 1, 5, 9, 13); - QROUND( 2, 6, 10, 14); - QROUND( 3, 7, 11, 15); - QROUND( 0, 5, 10, 15); - QROUND( 1, 6, 11, 12); - QROUND( 2, 7, 8, 13); - QROUND( 3, 4, 9, 14); - -#undef QROUND - - } - - /* - * Add initial state back and encode the result in the destination - * buffer. We can dump the AVX2 values "as is" because the non-AVX2 - * code uses a compatible order of values. 
- */ - for (u = 0; u < 16; u ++) { - _mm256_storeu_si256((__m256i *)&p->buf.d[u << 5], - _mm256_add_epi32(state[u], init[u])); - } - -#else // yyyAVX2+0 - - static const uint32_t CW[] = { - 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 - }; - - uint64_t cc; - size_t u; - - /* - * State uses local endianness. Only the output bytes must be - * converted to little endian (if used on a big-endian machine). - */ - cc = *(uint64_t *)(p->state.d + 48); - for (u = 0; u < 8; u ++) { - uint32_t state[16]; - size_t v; - int i; - - memcpy(&state[0], CW, sizeof CW); - memcpy(&state[4], p->state.d, 48); - state[14] ^= (uint32_t)cc; - state[15] ^= (uint32_t)(cc >> 32); - for (i = 0; i < 10; i ++) { - -#define QROUND(a, b, c, d) do { \ - state[a] += state[b]; \ - state[d] ^= state[a]; \ - state[d] = (state[d] << 16) | (state[d] >> 16); \ - state[c] += state[d]; \ - state[b] ^= state[c]; \ - state[b] = (state[b] << 12) | (state[b] >> 20); \ - state[a] += state[b]; \ - state[d] ^= state[a]; \ - state[d] = (state[d] << 8) | (state[d] >> 24); \ - state[c] += state[d]; \ - state[b] ^= state[c]; \ - state[b] = (state[b] << 7) | (state[b] >> 25); \ - } while (0) - - QROUND( 0, 4, 8, 12); - QROUND( 1, 5, 9, 13); - QROUND( 2, 6, 10, 14); - QROUND( 3, 7, 11, 15); - QROUND( 0, 5, 10, 15); - QROUND( 1, 6, 11, 12); - QROUND( 2, 7, 8, 13); - QROUND( 3, 4, 9, 14); - -#undef QROUND - - } - - for (v = 0; v < 4; v ++) { - state[v] += CW[v]; - } - for (v = 4; v < 14; v ++) { - state[v] += ((uint32_t *)p->state.d)[v - 4]; - } - state[14] += ((uint32_t *)p->state.d)[10] - ^ (uint32_t)cc; - state[15] += ((uint32_t *)p->state.d)[11] - ^ (uint32_t)(cc >> 32); - cc ++; - - /* - * We mimic the interleaving that is used in the AVX2 - * implementation. 
- */ - for (v = 0; v < 16; v ++) { -#if FALCON_LE // yyyLE+1 - ((uint32_t *)p->buf.d)[u + (v << 3)] = state[v]; -#else // yyyLE+0 - p->buf.d[(u << 2) + (v << 5) + 0] = - (uint8_t)state[v]; - p->buf.d[(u << 2) + (v << 5) + 1] = - (uint8_t)(state[v] >> 8); - p->buf.d[(u << 2) + (v << 5) + 2] = - (uint8_t)(state[v] >> 16); - p->buf.d[(u << 2) + (v << 5) + 3] = - (uint8_t)(state[v] >> 24); -#endif // yyyLE- - } - } - *(uint64_t *)(p->state.d + 48) = cc; - -#endif // yyyAVX2- - - p->ptr = 0; -} - -/* see inner.h */ -void -Zf(prng_get_bytes)(prng *p, void *dst, size_t len) -{ - uint8_t *buf; - - buf = dst; - while (len > 0) { - size_t clen; - - clen = (sizeof p->buf.d) - p->ptr; - if (clen > len) { - clen = len; - } - memcpy(buf, p->buf.d, clen); - buf += clen; - len -= clen; - p->ptr += clen; - if (p->ptr == sizeof p->buf.d) { - Zf(prng_refill)(p); - } - } -} diff --git a/crypto_sign/falcon-512/m4-ct/sign.c b/crypto_sign/falcon-512/m4-ct/sign.c deleted file mode 100644 index 752fb8ba..00000000 --- a/crypto_sign/falcon-512/m4-ct/sign.c +++ /dev/null @@ -1,1532 +0,0 @@ -/* - * Falcon signature generation. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include "inner.h" - -/* =================================================================== */ - -/* - * Compute degree N from logarithm 'logn'. - */ -#define MKN(logn) ((size_t)1 << (logn)) - -/* =================================================================== */ -/* - * Binary case: - * N = 2^logn - * phi = X^N+1 - */ - -/* - * Get the size of the LDL tree for an input with polynomials of size - * 2^logn. The size is expressed in the number of elements. - */ -static inline unsigned -ffLDL_treesize(unsigned logn) -{ - /* - * For logn = 0 (polynomials are constant), the "tree" is a - * single element. Otherwise, the tree node has size 2^logn, and - * has two child trees for size logn-1 each. Thus, treesize s() - * must fulfill these two relations: - * - * s(0) = 1 - * s(logn) = (2^logn) + 2*s(logn-1) - */ - return (logn + 1) << logn; -} - -/* - * Inner function for ffLDL_fft(). It expects the matrix to be both - * auto-adjoint and quasicyclic; also, it uses the source operands - * as modifiable temporaries. - * - * tmp[] must have room for at least one polynomial. 
- */ -static void -ffLDL_fft_inner(fpr *restrict tree, - fpr *restrict g0, fpr *restrict g1, unsigned logn, fpr *restrict tmp) -{ - size_t n, hn; - - n = MKN(logn); - if (n == 1) { - tree[0] = g0[0]; - return; - } - hn = n >> 1; - - /* - * The LDL decomposition yields L (which is written in the tree) - * and the diagonal of D. Since d00 = g0, we just write d11 - * into tmp. - */ - Zf(poly_LDLmv_fft)(tmp, tree, g0, g1, g0, logn); - - /* - * Split d00 (currently in g0) and d11 (currently in tmp). We - * reuse g0 and g1 as temporary storage spaces: - * d00 splits into g1, g1+hn - * d11 splits into g0, g0+hn - */ - Zf(poly_split_fft)(g1, g1 + hn, g0, logn); - Zf(poly_split_fft)(g0, g0 + hn, tmp, logn); - - /* - * Each split result is the first row of a new auto-adjoint - * quasicyclic matrix for the next recursive step. - */ - ffLDL_fft_inner(tree + n, - g1, g1 + hn, logn - 1, tmp); - ffLDL_fft_inner(tree + n + ffLDL_treesize(logn - 1), - g0, g0 + hn, logn - 1, tmp); -} - -/* - * Compute the ffLDL tree of an auto-adjoint matrix G. The matrix - * is provided as three polynomials (FFT representation). - * - * The "tree" array is filled with the computed tree, of size - * (logn+1)*(2^logn) elements (see ffLDL_treesize()). - * - * Input arrays MUST NOT overlap, except possibly the three unmodified - * arrays g00, g01 and g11. tmp[] should have room for at least three - * polynomials of 2^logn elements each. 
- */ -static void -ffLDL_fft(fpr *restrict tree, const fpr *restrict g00, - const fpr *restrict g01, const fpr *restrict g11, - unsigned logn, fpr *restrict tmp) -{ - size_t n, hn; - fpr *d00, *d11; - - n = MKN(logn); - if (n == 1) { - tree[0] = g00[0]; - return; - } - hn = n >> 1; - d00 = tmp; - d11 = tmp + n; - tmp += n << 1; - - memcpy(d00, g00, n * sizeof *g00); - Zf(poly_LDLmv_fft)(d11, tree, g00, g01, g11, logn); - - Zf(poly_split_fft)(tmp, tmp + hn, d00, logn); - Zf(poly_split_fft)(d00, d00 + hn, d11, logn); - memcpy(d11, tmp, n * sizeof *tmp); - ffLDL_fft_inner(tree + n, - d11, d11 + hn, logn - 1, tmp); - ffLDL_fft_inner(tree + n + ffLDL_treesize(logn - 1), - d00, d00 + hn, logn - 1, tmp); -} - -/* - * Normalize an ffLDL tree: each leaf of value x is replaced with - * sigma / sqrt(x). - */ -static void -ffLDL_binary_normalize(fpr *tree, unsigned logn) -{ - /* - * TODO: make an iterative version. - */ - size_t n; - - n = MKN(logn); - if (n == 1) { - /* - * We actually store in the tree leaf the inverse of - * the value mandated by the specification: this - * saves a division both here and in the sampler. - */ - tree[0] = fpr_mul(fpr_sqrt(tree[0]), fpr_inv_sigma); - } else { - ffLDL_binary_normalize(tree + n, logn - 1); - ffLDL_binary_normalize(tree + n + ffLDL_treesize(logn - 1), - logn - 1); - } -} - -/* =================================================================== */ - -/* - * Convert an integer polynomial (with small values) into the - * representation with complex numbers. 
- */ -static void -smallints_to_fpr(fpr *r, const int8_t *t, unsigned logn) -{ - size_t n, u; - - n = MKN(logn); - for (u = 0; u < n; u ++) { - r[u] = fpr_of(t[u]); - } -} - -/* - * The expanded private key contains: - * - The B0 matrix (four elements) - * - The ffLDL tree - */ - -static inline size_t -skoff_b00(unsigned logn) -{ - (void)logn; - return 0; -} - -static inline size_t -skoff_b01(unsigned logn) -{ - return MKN(logn); -} - -static inline size_t -skoff_b10(unsigned logn) -{ - return 2 * MKN(logn); -} - -static inline size_t -skoff_b11(unsigned logn) -{ - return 3 * MKN(logn); -} - -static inline size_t -skoff_tree(unsigned logn) -{ - return 4 * MKN(logn); -} - -/* see inner.h */ -void -Zf(expand_privkey)(fpr *restrict expanded_key, - const int8_t *f, const int8_t *g, - const int8_t *F, const int8_t *G, - unsigned logn, uint8_t *restrict tmp) -{ - size_t n; - fpr *rf, *rg, *rF, *rG; - fpr *b00, *b01, *b10, *b11; - fpr *g00, *g01, *g11, *gxx; - fpr *tree; - - n = MKN(logn); - b00 = expanded_key + skoff_b00(logn); - b01 = expanded_key + skoff_b01(logn); - b10 = expanded_key + skoff_b10(logn); - b11 = expanded_key + skoff_b11(logn); - tree = expanded_key + skoff_tree(logn); - - /* - * We load the private key elements directly into the B0 matrix, - * since B0 = [[g, -f], [G, -F]]. - */ - rf = b01; - rg = b00; - rF = b11; - rG = b10; - - smallints_to_fpr(rf, f, logn); - smallints_to_fpr(rg, g, logn); - smallints_to_fpr(rF, F, logn); - smallints_to_fpr(rG, G, logn); - - /* - * Compute the FFT for the key elements, and negate f and F. - */ - Zf(FFT)(rf, logn); - Zf(FFT)(rg, logn); - Zf(FFT)(rF, logn); - Zf(FFT)(rG, logn); - Zf(poly_neg)(rf, logn); - Zf(poly_neg)(rF, logn); - - /* - * The Gram matrix is G = B·B*. 
Formulas are: - * g00 = b00*adj(b00) + b01*adj(b01) - * g01 = b00*adj(b10) + b01*adj(b11) - * g10 = b10*adj(b00) + b11*adj(b01) - * g11 = b10*adj(b10) + b11*adj(b11) - * - * For historical reasons, this implementation uses - * g00, g01 and g11 (upper triangle). - */ - g00 = (fpr *)tmp; - g01 = g00 + n; - g11 = g01 + n; - gxx = g11 + n; - - memcpy(g00, b00, n * sizeof *b00); - Zf(poly_mulselfadj_fft)(g00, logn); - memcpy(gxx, b01, n * sizeof *b01); - Zf(poly_mulselfadj_fft)(gxx, logn); - Zf(poly_add)(g00, gxx, logn); - - memcpy(g01, b00, n * sizeof *b00); - Zf(poly_muladj_fft)(g01, b10, logn); - memcpy(gxx, b01, n * sizeof *b01); - Zf(poly_muladj_fft)(gxx, b11, logn); - Zf(poly_add)(g01, gxx, logn); - - memcpy(g11, b10, n * sizeof *b10); - Zf(poly_mulselfadj_fft)(g11, logn); - memcpy(gxx, b11, n * sizeof *b11); - Zf(poly_mulselfadj_fft)(gxx, logn); - Zf(poly_add)(g11, gxx, logn); - - /* - * Compute the Falcon tree. - */ - ffLDL_fft(tree, g00, g01, g11, logn, gxx); - - /* - * Normalize tree. - */ - ffLDL_binary_normalize(tree, logn); -} - -typedef int (*samplerZ)(void *ctx, fpr mu, fpr sigma); - -/* - * Perform Fast Fourier Sampling for target vector t. The Gram matrix - * is provided (G = [[g00, g01], [adj(g01), g11]]). The sampled vector - * is written over (t0,t1). The Gram matrix is modified as well. The - * tmp[] buffer must have room for four polynomials. - */ -TARGET_AVX2 -static void -ffSampling_fft_dyntree(samplerZ samp, void *samp_ctx, - fpr *restrict t0, fpr *restrict t1, - fpr *restrict g00, fpr *restrict g01, fpr *restrict g11, - unsigned logn, fpr *restrict tmp) -{ - size_t n, hn; - fpr *z0, *z1; - - /* - * Deepest level: the LDL tree leaf value is just g00 (the - * array has length only 1 at this point); we normalize it - * with regards to sigma, then use it for sampling. 
- */ - if (logn == 0) { - fpr leaf; - - leaf = g00[0]; - leaf = fpr_mul(fpr_sqrt(leaf), fpr_inv_sigma); - t0[0] = fpr_of(samp(samp_ctx, t0[0], leaf)); - t1[0] = fpr_of(samp(samp_ctx, t1[0], leaf)); - return; - } - - n = (size_t)1 << logn; - hn = n >> 1; - - /* - * Decompose G into LDL. We only need d00 (identical to g00), - * d11, and l10; we do that in place. - */ - Zf(poly_LDL_fft)(g00, g01, g11, logn); - - /* - * Split d00 and d11 and expand them into half-size quasi-cyclic - * Gram matrices. We also save l10 in tmp[]. - */ - Zf(poly_split_fft)(tmp, tmp + hn, g00, logn); - memcpy(g00, tmp, n * sizeof *tmp); - Zf(poly_split_fft)(tmp, tmp + hn, g11, logn); - memcpy(g11, tmp, n * sizeof *tmp); - memcpy(tmp, g01, n * sizeof *g01); - memcpy(g01, g00, hn * sizeof *g00); - memcpy(g01 + hn, g11, hn * sizeof *g00); - - /* - * The half-size Gram matrices for the recursive LDL tree - * building are now: - * - left sub-tree: g00, g00+hn, g01 - * - right sub-tree: g11, g11+hn, g01+hn - * l10 is in tmp[]. - */ - - /* - * We split t1 and use the first recursive call on the two - * halves, using the right sub-tree. The result is merged - * back into tmp + 2*n. - */ - z1 = tmp + n; - Zf(poly_split_fft)(z1, z1 + hn, t1, logn); - ffSampling_fft_dyntree(samp, samp_ctx, z1, z1 + hn, - g11, g11 + hn, g01 + hn, logn - 1, z1 + n); - Zf(poly_merge_fft)(tmp + (n << 1), z1, z1 + hn, logn); - - /* - * Compute tb0 = t0 + (t1 - z1) * l10. - * At that point, l10 is in tmp, t1 is unmodified, and z1 is - * in tmp + (n << 1). The buffer in z1 is free. - * - * In the end, z1 is written over t1, and tb0 is in t0. - */ - memcpy(z1, t1, n * sizeof *t1); - Zf(poly_sub)(z1, tmp + (n << 1), logn); - memcpy(t1, tmp + (n << 1), n * sizeof *tmp); - Zf(poly_mul_fft)(tmp, z1, logn); - Zf(poly_add)(t0, tmp, logn); - - /* - * Second recursive invocation, on the split tb0 (currently in t0) - * and the left sub-tree. 
- */ - z0 = tmp; - Zf(poly_split_fft)(z0, z0 + hn, t0, logn); - ffSampling_fft_dyntree(samp, samp_ctx, z0, z0 + hn, - g00, g00 + hn, g01, logn - 1, z0 + n); - Zf(poly_merge_fft)(t0, z0, z0 + hn, logn); -} - -/* - * Perform Fast Fourier Sampling for target vector t and LDL tree T. - * tmp[] must have size for at least two polynomials of size 2^logn. - */ -TARGET_AVX2 -static void -ffSampling_fft(samplerZ samp, void *samp_ctx, - fpr *restrict z0, fpr *restrict z1, - const fpr *restrict tree, - const fpr *restrict t0, const fpr *restrict t1, unsigned logn, - fpr *restrict tmp) -{ - size_t n, hn; - const fpr *tree0, *tree1; - - /* - * When logn == 2, we inline the last two recursion levels. - */ - if (logn == 2) { -#if FALCON_AVX2 // yyyAVX2+1 - fpr w0, w1, w2, w3, sigma; - __m128d ww0, ww1, wa, wb, wc, wd; - __m128d wy0, wy1, wz0, wz1; - __m128d half, invsqrt8, invsqrt2, neghi, neglo; - int si0, si1, si2, si3; - - tree0 = tree + 4; - tree1 = tree + 8; - - half = _mm_set1_pd(0.5); - invsqrt8 = _mm_set1_pd(0.353553390593273762200422181052); - invsqrt2 = _mm_set1_pd(0.707106781186547524400844362105); - neghi = _mm_set_pd(-0.0, 0.0); - neglo = _mm_set_pd(0.0, -0.0); - - /* - * We split t1 into w*, then do the recursive invocation, - * with output in w*. We finally merge back into z1. 
- */ - ww0 = _mm_loadu_pd(&t1[0].v); - ww1 = _mm_loadu_pd(&t1[2].v); - wa = _mm_unpacklo_pd(ww0, ww1); - wb = _mm_unpackhi_pd(ww0, ww1); - wc = _mm_add_pd(wa, wb); - ww0 = _mm_mul_pd(wc, half); - wc = _mm_sub_pd(wa, wb); - wd = _mm_xor_pd(_mm_permute_pd(wc, 1), neghi); - ww1 = _mm_mul_pd(_mm_add_pd(wc, wd), invsqrt8); - - w2.v = _mm_cvtsd_f64(ww1); - w3.v = _mm_cvtsd_f64(_mm_permute_pd(ww1, 1)); - wa = ww1; - sigma = tree1[3]; - si2 = samp(samp_ctx, w2, sigma); - si3 = samp(samp_ctx, w3, sigma); - ww1 = _mm_set_pd((double)si3, (double)si2); - wa = _mm_sub_pd(wa, ww1); - wb = _mm_loadu_pd(&tree1[0].v); - wc = _mm_mul_pd(wa, wb); - wd = _mm_mul_pd(wa, _mm_permute_pd(wb, 1)); - wa = _mm_unpacklo_pd(wc, wd); - wb = _mm_unpackhi_pd(wc, wd); - ww0 = _mm_add_pd(ww0, _mm_add_pd(wa, _mm_xor_pd(wb, neglo))); - w0.v = _mm_cvtsd_f64(ww0); - w1.v = _mm_cvtsd_f64(_mm_permute_pd(ww0, 1)); - sigma = tree1[2]; - si0 = samp(samp_ctx, w0, sigma); - si1 = samp(samp_ctx, w1, sigma); - ww0 = _mm_set_pd((double)si1, (double)si0); - - wc = _mm_mul_pd( - _mm_set_pd((double)(si2 + si3), (double)(si2 - si3)), - invsqrt2); - wa = _mm_add_pd(ww0, wc); - wb = _mm_sub_pd(ww0, wc); - ww0 = _mm_unpacklo_pd(wa, wb); - ww1 = _mm_unpackhi_pd(wa, wb); - _mm_storeu_pd(&z1[0].v, ww0); - _mm_storeu_pd(&z1[2].v, ww1); - - /* - * Compute tb0 = t0 + (t1 - z1) * L. Value tb0 ends up in w*. - */ - wy0 = _mm_sub_pd(_mm_loadu_pd(&t1[0].v), ww0); - wy1 = _mm_sub_pd(_mm_loadu_pd(&t1[2].v), ww1); - wz0 = _mm_loadu_pd(&tree[0].v); - wz1 = _mm_loadu_pd(&tree[2].v); - ww0 = _mm_sub_pd(_mm_mul_pd(wy0, wz0), _mm_mul_pd(wy1, wz1)); - ww1 = _mm_add_pd(_mm_mul_pd(wy0, wz1), _mm_mul_pd(wy1, wz0)); - ww0 = _mm_add_pd(ww0, _mm_loadu_pd(&t0[0].v)); - ww1 = _mm_add_pd(ww1, _mm_loadu_pd(&t0[2].v)); - - /* - * Second recursive invocation. 
- */ - wa = _mm_unpacklo_pd(ww0, ww1); - wb = _mm_unpackhi_pd(ww0, ww1); - wc = _mm_add_pd(wa, wb); - ww0 = _mm_mul_pd(wc, half); - wc = _mm_sub_pd(wa, wb); - wd = _mm_xor_pd(_mm_permute_pd(wc, 1), neghi); - ww1 = _mm_mul_pd(_mm_add_pd(wc, wd), invsqrt8); - - w2.v = _mm_cvtsd_f64(ww1); - w3.v = _mm_cvtsd_f64(_mm_permute_pd(ww1, 1)); - wa = ww1; - sigma = tree0[3]; - si2 = samp(samp_ctx, w2, sigma); - si3 = samp(samp_ctx, w3, sigma); - ww1 = _mm_set_pd((double)si3, (double)si2); - wa = _mm_sub_pd(wa, ww1); - wb = _mm_loadu_pd(&tree0[0].v); - wc = _mm_mul_pd(wa, wb); - wd = _mm_mul_pd(wa, _mm_permute_pd(wb, 1)); - wa = _mm_unpacklo_pd(wc, wd); - wb = _mm_unpackhi_pd(wc, wd); - ww0 = _mm_add_pd(ww0, _mm_add_pd(wa, _mm_xor_pd(wb, neglo))); - w0.v = _mm_cvtsd_f64(ww0); - w1.v = _mm_cvtsd_f64(_mm_permute_pd(ww0, 1)); - sigma = tree0[2]; - si0 = samp(samp_ctx, w0, sigma); - si1 = samp(samp_ctx, w1, sigma); - ww0 = _mm_set_pd((double)si1, (double)si0); - - wc = _mm_mul_pd( - _mm_set_pd((double)(si2 + si3), (double)(si2 - si3)), - invsqrt2); - wa = _mm_add_pd(ww0, wc); - wb = _mm_sub_pd(ww0, wc); - ww0 = _mm_unpacklo_pd(wa, wb); - ww1 = _mm_unpackhi_pd(wa, wb); - _mm_storeu_pd(&z0[0].v, ww0); - _mm_storeu_pd(&z0[2].v, ww1); - - return; -#else // yyyAVX2+0 - fpr x0, x1, y0, y1, w0, w1, w2, w3, sigma; - fpr a_re, a_im, b_re, b_im, c_re, c_im; - - tree0 = tree + 4; - tree1 = tree + 8; - - /* - * We split t1 into w*, then do the recursive invocation, - * with output in w*. We finally merge back into z1. 
- */ - a_re = t1[0]; - a_im = t1[2]; - b_re = t1[1]; - b_im = t1[3]; - c_re = fpr_add(a_re, b_re); - c_im = fpr_add(a_im, b_im); - w0 = fpr_half(c_re); - w1 = fpr_half(c_im); - c_re = fpr_sub(a_re, b_re); - c_im = fpr_sub(a_im, b_im); - w2 = fpr_mul(fpr_add(c_re, c_im), fpr_invsqrt8); - w3 = fpr_mul(fpr_sub(c_im, c_re), fpr_invsqrt8); - - x0 = w2; - x1 = w3; - sigma = tree1[3]; - w2 = fpr_of(samp(samp_ctx, x0, sigma)); - w3 = fpr_of(samp(samp_ctx, x1, sigma)); - a_re = fpr_sub(x0, w2); - a_im = fpr_sub(x1, w3); - b_re = tree1[0]; - b_im = tree1[1]; - c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im)); - c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re)); - x0 = fpr_add(c_re, w0); - x1 = fpr_add(c_im, w1); - sigma = tree1[2]; - w0 = fpr_of(samp(samp_ctx, x0, sigma)); - w1 = fpr_of(samp(samp_ctx, x1, sigma)); - - a_re = w0; - a_im = w1; - b_re = w2; - b_im = w3; - c_re = fpr_mul(fpr_sub(b_re, b_im), fpr_invsqrt2); - c_im = fpr_mul(fpr_add(b_re, b_im), fpr_invsqrt2); - z1[0] = w0 = fpr_add(a_re, c_re); - z1[2] = w2 = fpr_add(a_im, c_im); - z1[1] = w1 = fpr_sub(a_re, c_re); - z1[3] = w3 = fpr_sub(a_im, c_im); - - /* - * Compute tb0 = t0 + (t1 - z1) * L. Value tb0 ends up in w*. - */ - w0 = fpr_sub(t1[0], w0); - w1 = fpr_sub(t1[1], w1); - w2 = fpr_sub(t1[2], w2); - w3 = fpr_sub(t1[3], w3); - - a_re = w0; - a_im = w2; - b_re = tree[0]; - b_im = tree[2]; - w0 = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im)); - w2 = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re)); - a_re = w1; - a_im = w3; - b_re = tree[1]; - b_im = tree[3]; - w1 = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im)); - w3 = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re)); - - w0 = fpr_add(w0, t0[0]); - w1 = fpr_add(w1, t0[1]); - w2 = fpr_add(w2, t0[2]); - w3 = fpr_add(w3, t0[3]); - - /* - * Second recursive invocation. 
- */ - a_re = w0; - a_im = w2; - b_re = w1; - b_im = w3; - c_re = fpr_add(a_re, b_re); - c_im = fpr_add(a_im, b_im); - w0 = fpr_half(c_re); - w1 = fpr_half(c_im); - c_re = fpr_sub(a_re, b_re); - c_im = fpr_sub(a_im, b_im); - w2 = fpr_mul(fpr_add(c_re, c_im), fpr_invsqrt8); - w3 = fpr_mul(fpr_sub(c_im, c_re), fpr_invsqrt8); - - x0 = w2; - x1 = w3; - sigma = tree0[3]; - w2 = y0 = fpr_of(samp(samp_ctx, x0, sigma)); - w3 = y1 = fpr_of(samp(samp_ctx, x1, sigma)); - a_re = fpr_sub(x0, y0); - a_im = fpr_sub(x1, y1); - b_re = tree0[0]; - b_im = tree0[1]; - c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im)); - c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re)); - x0 = fpr_add(c_re, w0); - x1 = fpr_add(c_im, w1); - sigma = tree0[2]; - w0 = fpr_of(samp(samp_ctx, x0, sigma)); - w1 = fpr_of(samp(samp_ctx, x1, sigma)); - - a_re = w0; - a_im = w1; - b_re = w2; - b_im = w3; - c_re = fpr_mul(fpr_sub(b_re, b_im), fpr_invsqrt2); - c_im = fpr_mul(fpr_add(b_re, b_im), fpr_invsqrt2); - z0[0] = fpr_add(a_re, c_re); - z0[2] = fpr_add(a_im, c_im); - z0[1] = fpr_sub(a_re, c_re); - z0[3] = fpr_sub(a_im, c_im); - - return; -#endif // yyyAVX2- - } - - /* - * Case logn == 1 is reachable only when using Falcon-2 (the - * smallest size for which Falcon is mathematically defined, but - * of course way too insecure to be of any use). 
- */ - if (logn == 1) { - fpr x0, x1, y0, y1, sigma; - fpr a_re, a_im, b_re, b_im, c_re, c_im; - - x0 = t1[0]; - x1 = t1[1]; - sigma = tree[3]; - z1[0] = y0 = fpr_of(samp(samp_ctx, x0, sigma)); - z1[1] = y1 = fpr_of(samp(samp_ctx, x1, sigma)); - a_re = fpr_sub(x0, y0); - a_im = fpr_sub(x1, y1); - b_re = tree[0]; - b_im = tree[1]; - c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im)); - c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re)); - x0 = fpr_add(c_re, t0[0]); - x1 = fpr_add(c_im, t0[1]); - sigma = tree[2]; - z0[0] = fpr_of(samp(samp_ctx, x0, sigma)); - z0[1] = fpr_of(samp(samp_ctx, x1, sigma)); - - return; - } - - /* - * Normal end of recursion is for logn == 0. Since the last - * steps of the recursions were inlined in the blocks above - * (when logn == 1 or 2), this case is not reachable, and is - * retained here only for documentation purposes. - - if (logn == 0) { - fpr x0, x1, sigma; - - x0 = t0[0]; - x1 = t1[0]; - sigma = tree[0]; - z0[0] = fpr_of(samp(samp_ctx, x0, sigma)); - z1[0] = fpr_of(samp(samp_ctx, x1, sigma)); - return; - } - - */ - - /* - * General recursive case (logn >= 3). - */ - - n = (size_t)1 << logn; - hn = n >> 1; - tree0 = tree + n; - tree1 = tree + n + ffLDL_treesize(logn - 1); - - /* - * We split t1 into z1 (reused as temporary storage), then do - * the recursive invocation, with output in tmp. We finally - * merge back into z1. - */ - Zf(poly_split_fft)(z1, z1 + hn, t1, logn); - ffSampling_fft(samp, samp_ctx, tmp, tmp + hn, - tree1, z1, z1 + hn, logn - 1, tmp + n); - Zf(poly_merge_fft)(z1, tmp, tmp + hn, logn); - - /* - * Compute tb0 = t0 + (t1 - z1) * L. Value tb0 ends up in tmp[]. - */ - memcpy(tmp, t1, n * sizeof *t1); - Zf(poly_sub)(tmp, z1, logn); - Zf(poly_mul_fft)(tmp, tree, logn); - Zf(poly_add)(tmp, t0, logn); - - /* - * Second recursive invocation. 
- */ - Zf(poly_split_fft)(z0, z0 + hn, tmp, logn); - ffSampling_fft(samp, samp_ctx, tmp, tmp + hn, - tree0, z0, z0 + hn, logn - 1, tmp + n); - Zf(poly_merge_fft)(z0, tmp, tmp + hn, logn); -} - -/* - * Compute a signature: the signature contains two vectors, s1 and s2. - * The s1 vector is not returned. The squared norm of (s1,s2) is - * computed, and if it is short enough, then s2 is returned into the - * s2[] buffer, and 1 is returned; otherwise, s2[] is untouched and 0 is - * returned; the caller should then try again. This function uses an - * expanded key. - * - * tmp[] must have room for at least six polynomials. - */ -static int -do_sign_tree(samplerZ samp, void *samp_ctx, int16_t *s2, - const fpr *restrict expanded_key, - const uint16_t *hm, - unsigned logn, fpr *restrict tmp) -{ - size_t n, u; - fpr *t0, *t1, *tx, *ty; - const fpr *b00, *b01, *b10, *b11, *tree; - fpr ni; - uint32_t sqn, ng; - int16_t *s1tmp, *s2tmp; - - n = MKN(logn); - t0 = tmp; - t1 = t0 + n; - b00 = expanded_key + skoff_b00(logn); - b01 = expanded_key + skoff_b01(logn); - b10 = expanded_key + skoff_b10(logn); - b11 = expanded_key + skoff_b11(logn); - tree = expanded_key + skoff_tree(logn); - - /* - * Set the target vector to [hm, 0] (hm is the hashed message). - */ - for (u = 0; u < n; u ++) { - t0[u] = fpr_of(hm[u]); - /* This is implicit. - t1[u] = fpr_zero; - */ - } - - /* - * Apply the lattice basis to obtain the real target - * vector (after normalization with regards to modulus). - */ - Zf(FFT)(t0, logn); - ni = fpr_inverse_of_q; - memcpy(t1, t0, n * sizeof *t0); - Zf(poly_mul_fft)(t1, b01, logn); - Zf(poly_mulconst)(t1, fpr_neg(ni), logn); - Zf(poly_mul_fft)(t0, b11, logn); - Zf(poly_mulconst)(t0, ni, logn); - - tx = t1 + n; - ty = tx + n; - - /* - * Apply sampling. Output is written back in [tx, ty]. - */ - ffSampling_fft(samp, samp_ctx, tx, ty, tree, t0, t1, logn, ty + n); - - /* - * Get the lattice point corresponding to that tiny vector. 
- */ - memcpy(t0, tx, n * sizeof *tx); - memcpy(t1, ty, n * sizeof *ty); - Zf(poly_mul_fft)(tx, b00, logn); - Zf(poly_mul_fft)(ty, b10, logn); - Zf(poly_add)(tx, ty, logn); - memcpy(ty, t0, n * sizeof *t0); - Zf(poly_mul_fft)(ty, b01, logn); - - memcpy(t0, tx, n * sizeof *tx); - Zf(poly_mul_fft)(t1, b11, logn); - Zf(poly_add)(t1, ty, logn); - - Zf(iFFT)(t0, logn); - Zf(iFFT)(t1, logn); - - /* - * Compute the signature. - */ - s1tmp = (int16_t *)tx; - sqn = 0; - ng = 0; - for (u = 0; u < n; u ++) { - int32_t z; - - z = (int32_t)hm[u] - (int32_t)fpr_rint(t0[u]); - sqn += (uint32_t)(z * z); - ng |= sqn; - s1tmp[u] = (int16_t)z; - } - sqn |= -(ng >> 31); - - /* - * With "normal" degrees (e.g. 512 or 1024), it is very - * improbable that the computed vector is not short enough; - * however, it may happen in practice for the very reduced - * versions (e.g. degree 16 or below). In that case, the caller - * will loop, and we must not write anything into s2[] because - * s2[] may overlap with the hashed message hm[] and we need - * hm[] for the next iteration. - */ - s2tmp = (int16_t *)tmp; - for (u = 0; u < n; u ++) { - s2tmp[u] = (int16_t)-fpr_rint(t1[u]); - } - if (Zf(is_short_half)(sqn, s2tmp, logn)) { - memcpy(s2, s2tmp, n * sizeof *s2); - memcpy(tmp, s1tmp, n * sizeof *s1tmp); - return 1; - } - return 0; -} - -/* - * Compute a signature: the signature contains two vectors, s1 and s2. - * The s1 vector is not returned. The squared norm of (s1,s2) is - * computed, and if it is short enough, then s2 is returned into the - * s2[] buffer, and 1 is returned; otherwise, s2[] is untouched and 0 is - * returned; the caller should then try again. - * - * tmp[] must have room for at least nine polynomials. 
- */ -static int -do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2, - const int8_t *restrict f, const int8_t *restrict g, - const int8_t *restrict F, const int8_t *restrict G, - const uint16_t *hm, unsigned logn, fpr *restrict tmp) -{ - size_t n, u; - fpr *t0, *t1, *tx, *ty; - fpr *b00, *b01, *b10, *b11, *g00, *g01, *g11; - fpr ni; - uint32_t sqn, ng; - int16_t *s1tmp, *s2tmp; - - n = MKN(logn); - - /* - * Lattice basis is B = [[g, -f], [G, -F]]. We convert it to FFT. - */ - b00 = tmp; - b01 = b00 + n; - b10 = b01 + n; - b11 = b10 + n; - smallints_to_fpr(b01, f, logn); - smallints_to_fpr(b00, g, logn); - smallints_to_fpr(b11, F, logn); - smallints_to_fpr(b10, G, logn); - Zf(FFT)(b01, logn); - Zf(FFT)(b00, logn); - Zf(FFT)(b11, logn); - Zf(FFT)(b10, logn); - Zf(poly_neg)(b01, logn); - Zf(poly_neg)(b11, logn); - - /* - * Compute the Gram matrix G = B·B*. Formulas are: - * g00 = b00*adj(b00) + b01*adj(b01) - * g01 = b00*adj(b10) + b01*adj(b11) - * g10 = b10*adj(b00) + b11*adj(b01) - * g11 = b10*adj(b10) + b11*adj(b11) - * - * For historical reasons, this implementation uses - * g00, g01 and g11 (upper triangle). g10 is not kept - * since it is equal to adj(g01). - * - * We _replace_ the matrix B with the Gram matrix, but we - * must keep b01 and b11 for computing the target vector. 
- */ - t0 = b11 + n; - t1 = t0 + n; - - memcpy(t0, b01, n * sizeof *b01); - Zf(poly_mulselfadj_fft)(t0, logn); // t0 <- b01*adj(b01) - - memcpy(t1, b00, n * sizeof *b00); - Zf(poly_muladj_fft)(t1, b10, logn); // t1 <- b00*adj(b10) - Zf(poly_mulselfadj_fft)(b00, logn); // b00 <- b00*adj(b00) - Zf(poly_add)(b00, t0, logn); // b00 <- g00 - memcpy(t0, b01, n * sizeof *b01); - Zf(poly_muladj_fft)(b01, b11, logn); // b01 <- b01*adj(b11) - Zf(poly_add)(b01, t1, logn); // b01 <- g01 - - Zf(poly_mulselfadj_fft)(b10, logn); // b10 <- b10*adj(b10) - memcpy(t1, b11, n * sizeof *b11); - Zf(poly_mulselfadj_fft)(t1, logn); // t1 <- b11*adj(b11) - Zf(poly_add)(b10, t1, logn); // b10 <- g11 - - /* - * We rename variables to make things clearer. The three elements - * of the Gram matrix uses the first 3*n slots of tmp[], followed - * by b11 and b01 (in that order). - */ - g00 = b00; - g01 = b01; - g11 = b10; - b01 = t0; - t0 = b01 + n; - t1 = t0 + n; - - /* - * Memory layout at that point: - * g00 g01 g11 b11 b01 t0 t1 - */ - - /* - * Set the target vector to [hm, 0] (hm is the hashed message). - */ - for (u = 0; u < n; u ++) { - t0[u] = fpr_of(hm[u]); - /* This is implicit. - t1[u] = fpr_zero; - */ - } - - /* - * Apply the lattice basis to obtain the real target - * vector (after normalization with regards to modulus). - */ - Zf(FFT)(t0, logn); - ni = fpr_inverse_of_q; - memcpy(t1, t0, n * sizeof *t0); - Zf(poly_mul_fft)(t1, b01, logn); - Zf(poly_mulconst)(t1, fpr_neg(ni), logn); - Zf(poly_mul_fft)(t0, b11, logn); - Zf(poly_mulconst)(t0, ni, logn); - - /* - * b01 and b11 can be discarded, so we move back (t0,t1). - * Memory layout is now: - * g00 g01 g11 t0 t1 - */ - memcpy(b11, t0, n * 2 * sizeof *t0); - t0 = g11 + n; - t1 = t0 + n; - - /* - * Apply sampling; result is written over (t0,t1). 
- */ - ffSampling_fft_dyntree(samp, samp_ctx, - t0, t1, g00, g01, g11, logn, t1 + n); - - /* - * We arrange the layout back to: - * b00 b01 b10 b11 t0 t1 - * - * We did not conserve the matrix basis, so we must recompute - * it now. - */ - b00 = tmp; - b01 = b00 + n; - b10 = b01 + n; - b11 = b10 + n; - memmove(b11 + n, t0, n * 2 * sizeof *t0); - t0 = b11 + n; - t1 = t0 + n; - smallints_to_fpr(b01, f, logn); - smallints_to_fpr(b00, g, logn); - smallints_to_fpr(b11, F, logn); - smallints_to_fpr(b10, G, logn); - Zf(FFT)(b01, logn); - Zf(FFT)(b00, logn); - Zf(FFT)(b11, logn); - Zf(FFT)(b10, logn); - Zf(poly_neg)(b01, logn); - Zf(poly_neg)(b11, logn); - tx = t1 + n; - ty = tx + n; - - /* - * Get the lattice point corresponding to that tiny vector. - */ - memcpy(tx, t0, n * sizeof *t0); - memcpy(ty, t1, n * sizeof *t1); - Zf(poly_mul_fft)(tx, b00, logn); - Zf(poly_mul_fft)(ty, b10, logn); - Zf(poly_add)(tx, ty, logn); - memcpy(ty, t0, n * sizeof *t0); - Zf(poly_mul_fft)(ty, b01, logn); - - memcpy(t0, tx, n * sizeof *tx); - Zf(poly_mul_fft)(t1, b11, logn); - Zf(poly_add)(t1, ty, logn); - Zf(iFFT)(t0, logn); - Zf(iFFT)(t1, logn); - - s1tmp = (int16_t *)tx; - sqn = 0; - ng = 0; - for (u = 0; u < n; u ++) { - int32_t z; - - z = (int32_t)hm[u] - (int32_t)fpr_rint(t0[u]); - sqn += (uint32_t)(z * z); - ng |= sqn; - s1tmp[u] = (int16_t)z; - } - sqn |= -(ng >> 31); - - /* - * With "normal" degrees (e.g. 512 or 1024), it is very - * improbable that the computed vector is not short enough; - * however, it may happen in practice for the very reduced - * versions (e.g. degree 16 or below). In that case, the caller - * will loop, and we must not write anything into s2[] because - * s2[] may overlap with the hashed message hm[] and we need - * hm[] for the next iteration. 
- */ - s2tmp = (int16_t *)tmp; - for (u = 0; u < n; u ++) { - s2tmp[u] = (int16_t)-fpr_rint(t1[u]); - } - if (Zf(is_short_half)(sqn, s2tmp, logn)) { - memcpy(s2, s2tmp, n * sizeof *s2); - memcpy(tmp, s1tmp, n * sizeof *s1tmp); - return 1; - } - return 0; -} - -/* - * Sample an integer value along a half-gaussian distribution centered - * on zero and standard deviation 1.8205, with a precision of 72 bits. - */ -TARGET_AVX2 -int -Zf(gaussian0_sampler)(prng *p) -{ -#if FALCON_AVX2 // yyyAVX2+1 - - /* - * High words. - */ - static const union { - uint16_t u16[16]; - __m256i ymm[1]; - } rhi15 = { - { - 0x51FB, 0x2A69, 0x113E, 0x0568, - 0x014A, 0x003B, 0x0008, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000 - } - }; - - static const union { - uint64_t u64[20]; - __m256i ymm[5]; - } rlo57 = { - { - 0x1F42ED3AC391802, 0x12B181F3F7DDB82, - 0x1CDD0934829C1FF, 0x1754377C7994AE4, - 0x1846CAEF33F1F6F, 0x14AC754ED74BD5F, - 0x024DD542B776AE4, 0x1A1FFDC65AD63DA, - 0x01F80D88A7B6428, 0x001C3FDB2040C69, - 0x00012CF24D031FB, 0x00000949F8B091F, - 0x0000003665DA998, 0x00000000EBF6EBB, - 0x0000000002F5D7E, 0x000000000007098, - 0x0000000000000C6, 0x000000000000001, - 0x000000000000000, 0x000000000000000 - } - }; - - uint64_t lo; - unsigned hi; - __m256i xhi, rhi, gthi, eqhi, eqm; - __m256i xlo, gtlo0, gtlo1, gtlo2, gtlo3, gtlo4; - __m128i t, zt; - int r; - - /* - * Get a 72-bit random value and split it into a low part - * (57 bits) and a high part (15 bits) - */ - lo = prng_get_u64(p); - hi = prng_get_u8(p); - hi = (hi << 7) | (unsigned)(lo >> 57); - lo &= 0x1FFFFFFFFFFFFFF; - - /* - * Broadcast the high part and compare it with the relevant - * values. We need both a "greater than" and an "equal" - * comparisons. 
- */ - xhi = _mm256_broadcastw_epi16(_mm_cvtsi32_si128(hi)); - rhi = _mm256_loadu_si256(&rhi15.ymm[0]); - gthi = _mm256_cmpgt_epi16(rhi, xhi); - eqhi = _mm256_cmpeq_epi16(rhi, xhi); - - /* - * The result is the number of 72-bit values (among the list of 19) - * which are greater than the 72-bit random value. We first count - * all non-zero 16-bit elements in the first eight of gthi. Such - * elements have value -1 or 0, so we first negate them. - */ - t = _mm_srli_epi16(_mm256_castsi256_si128(gthi), 15); - zt = _mm_setzero_si128(); - t = _mm_hadd_epi16(t, zt); - t = _mm_hadd_epi16(t, zt); - t = _mm_hadd_epi16(t, zt); - r = _mm_cvtsi128_si32(t); - - /* - * We must look at the low bits for all values for which the - * high bits are an "equal" match; values 8-18 all have the - * same high bits (0). - * On 32-bit systems, 'lo' really is two registers, requiring - * some extra code. - */ -#if defined(__x86_64__) || defined(_M_X64) - xlo = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(*(int64_t *)&lo)); -#else - { - uint32_t e0, e1; - int32_t f0, f1; - - e0 = (uint32_t)lo; - e1 = (uint32_t)(lo >> 32); - f0 = *(int32_t *)&e0; - f1 = *(int32_t *)&e1; - xlo = _mm256_set_epi32(f1, f0, f1, f0, f1, f0, f1, f0); - } -#endif - gtlo0 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[0]), xlo); - gtlo1 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[1]), xlo); - gtlo2 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[2]), xlo); - gtlo3 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[3]), xlo); - gtlo4 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[4]), xlo); - - /* - * Keep only comparison results that correspond to the non-zero - * elements in eqhi. 
- */ - gtlo0 = _mm256_and_si256(gtlo0, _mm256_cvtepi16_epi64( - _mm256_castsi256_si128(eqhi))); - gtlo1 = _mm256_and_si256(gtlo1, _mm256_cvtepi16_epi64( - _mm256_castsi256_si128(_mm256_bsrli_epi128(eqhi, 8)))); - eqm = _mm256_permute4x64_epi64(eqhi, 0xFF); - gtlo2 = _mm256_and_si256(gtlo2, eqm); - gtlo3 = _mm256_and_si256(gtlo3, eqm); - gtlo4 = _mm256_and_si256(gtlo4, eqm); - - /* - * Add all values to count the total number of "-1" elements. - * Since the first eight "high" words are all different, only - * one element (at most) in gtlo0:gtlo1 can be non-zero; however, - * if the high word of the random value is zero, then many - * elements of gtlo2:gtlo3:gtlo4 can be non-zero. - */ - gtlo0 = _mm256_or_si256(gtlo0, gtlo1); - gtlo0 = _mm256_add_epi64( - _mm256_add_epi64(gtlo0, gtlo2), - _mm256_add_epi64(gtlo3, gtlo4)); - t = _mm_add_epi64( - _mm256_castsi256_si128(gtlo0), - _mm256_extracti128_si256(gtlo0, 1)); - t = _mm_add_epi64(t, _mm_srli_si128(t, 8)); - r -= _mm_cvtsi128_si32(t); - - return r; - -#else // yyyAVX2+0 - - static const uint32_t dist[] = { - 10745844u, 3068844u, 3741698u, - 5559083u, 1580863u, 8248194u, - 2260429u, 13669192u, 2736639u, - 708981u, 4421575u, 10046180u, - 169348u, 7122675u, 4136815u, - 30538u, 13063405u, 7650655u, - 4132u, 14505003u, 7826148u, - 417u, 16768101u, 11363290u, - 31u, 8444042u, 8086568u, - 1u, 12844466u, 265321u, - 0u, 1232676u, 13644283u, - 0u, 38047u, 9111839u, - 0u, 870u, 6138264u, - 0u, 14u, 12545723u, - 0u, 0u, 3104126u, - 0u, 0u, 28824u, - 0u, 0u, 198u, - 0u, 0u, 1u - }; - - uint32_t v0, v1, v2, hi; - uint64_t lo; - size_t u; - int z; - - /* - * Get a random 72-bit value, into three 24-bit limbs v0..v2. - */ - lo = prng_get_u64(p); - hi = prng_get_u8(p); - v0 = (uint32_t)lo & 0xFFFFFF; - v1 = (uint32_t)(lo >> 24) & 0xFFFFFF; - v2 = (uint32_t)(lo >> 48) | (hi << 16); - - /* - * Sampled value is z, such that v0..v2 is lower than the first - * z elements of the table. 
- */ - z = 0; - for (u = 0; u < (sizeof dist) / sizeof(dist[0]); u += 3) { - uint32_t w0, w1, w2, cc; - - w0 = dist[u + 2]; - w1 = dist[u + 1]; - w2 = dist[u + 0]; - cc = (v0 - w0) >> 31; - cc = (v1 - w1 - cc) >> 31; - cc = (v2 - w2 - cc) >> 31; - z += (int)cc; - } - return z; - -#endif // yyyAVX2- -} - -/* - * Sample a bit with probability exp(-x) for some x >= 0. - */ -TARGET_AVX2 -static int -BerExp(prng *p, fpr x, fpr ccs) -{ - int s, i; - fpr r; - uint32_t sw, w; - uint64_t z; - - /* - * Reduce x modulo log(2): x = s*log(2) + r, with s an integer, - * and 0 <= r < log(2). Since x >= 0, we can use fpr_trunc(). - */ - s = (int)fpr_trunc(fpr_mul(x, fpr_inv_log2)); - r = fpr_sub(x, fpr_mul(fpr_of(s), fpr_log2)); - - /* - * It may happen (quite rarely) that s >= 64; if sigma = 1.2 - * (the minimum value for sigma), r = 0 and b = 1, then we get - * s >= 64 if the half-Gaussian produced a z >= 13, which happens - * with probability about 0.000000000230383991, which is - * approximatively equal to 2^(-32). In any case, if s >= 64, - * then BerExp will be non-zero with probability less than - * 2^(-64), so we can simply saturate s at 63. - */ - sw = (uint32_t)s; - sw ^= (sw ^ 63) & -((63 - sw) >> 31); - s = (int)sw; - - /* - * Compute exp(-r); we know that 0 <= r < log(2) at this point, so - * we can use fpr_expm_p63(), which yields a result scaled to 2^63. - * We scale it up to 2^64, then right-shift it by s bits because - * we really want exp(-x) = 2^(-s)*exp(-r). - * - * The "-1" operation makes sure that the value fits on 64 bits - * (i.e. if r = 0, we may get 2^64, and we prefer 2^64-1 in that - * case). The bias is negligible since fpr_expm_p63() only computes - * with 51 bits of precision or so. - */ - z = ((fpr_expm_p63(r, ccs) << 1) - 1) >> s; - - /* - * Sample a bit with probability exp(-x). 
Since x = s*log(2) + r, - * exp(-x) = 2^-s * exp(-r), we compare lazily exp(-x) with the - * PRNG output to limit its consumption, the sign of the difference - * yields the expected result. - */ - i = 64; - do { - i -= 8; - w = prng_get_u8(p) - ((uint32_t)(z >> i) & 0xFF); - } while (!w && i > 0); - return (int)(w >> 31); -} - -/* - * The sampler produces a random integer that follows a discrete Gaussian - * distribution, centered on mu, and with standard deviation sigma. The - * provided parameter isigma is equal to 1/sigma. - * - * The value of sigma MUST lie between 1 and 2 (i.e. isigma lies between - * 0.5 and 1); in Falcon, sigma should always be between 1.2 and 1.9. - */ -TARGET_AVX2 -int -Zf(sampler)(void *ctx, fpr mu, fpr isigma) -{ - sampler_context *spc; - int s; - fpr r, dss, ccs; - - spc = ctx; - - /* - * Center is mu. We compute mu = s + r where s is an integer - * and 0 <= r < 1. - */ - s = (int)fpr_floor(mu); - r = fpr_sub(mu, fpr_of(s)); - - /* - * dss = 1/(2*sigma^2) = 0.5*(isigma^2). - */ - dss = fpr_half(fpr_sqr(isigma)); - - /* - * ccs = sigma_min / sigma = sigma_min * isigma. - */ - ccs = fpr_mul(isigma, spc->sigma_min); - - /* - * We now need to sample on center r. - */ - for (;;) { - int z0, z, b; - fpr x; - - /* - * Sample z for a Gaussian distribution. Then get a - * random bit b to turn the sampling into a bimodal - * distribution: if b = 1, we use z+1, otherwise we - * use -z. We thus have two situations: - * - * - b = 1: z >= 1 and sampled against a Gaussian - * centered on 1. - * - b = 0: z <= 0 and sampled against a Gaussian - * centered on 0. - */ - z0 = Zf(gaussian0_sampler)(&spc->p); - b = prng_get_u8(&spc->p) & 1; - z = b + ((b << 1) - 1) * z0; - - /* - * Rejection sampling. We want a Gaussian centered on r; - * but we sampled against a Gaussian centered on b (0 or - * 1). But we know that z is always in the range where - * our sampling distribution is greater than the Gaussian - * distribution, so rejection works. 
- * - * We got z with distribution: - * G(z) = exp(-((z-b)^2)/(2*sigma0^2)) - * We target distribution: - * S(z) = exp(-((z-r)^2)/(2*sigma^2)) - * Rejection sampling works by keeping the value z with - * probability S(z)/G(z), and starting again otherwise. - * This requires S(z) <= G(z), which is the case here. - * Thus, we simply need to keep our z with probability: - * P = exp(-x) - * where: - * x = ((z-r)^2)/(2*sigma^2) - ((z-b)^2)/(2*sigma0^2) - * - * Here, we scale up the Bernouilli distribution, which - * makes rejection more probable, but makes rejection - * rate sufficiently decorrelated from the Gaussian - * center and standard deviation that the whole sampler - * can be said to be constant-time. - */ - x = fpr_mul(fpr_sqr(fpr_sub(fpr_of(z), r)), dss); - x = fpr_sub(x, fpr_mul(fpr_of(z0 * z0), fpr_inv_2sqrsigma0)); - if (BerExp(&spc->p, x, ccs)) { - /* - * Rejection sampling was centered on r, but the - * actual center is mu = s + r. - */ - return s + z; - } - } -} - -/* see inner.h */ -void -Zf(sign_tree)(int16_t *sig, inner_shake256_context *rng, - const fpr *restrict expanded_key, - const uint16_t *hm, unsigned logn, uint8_t *tmp) -{ - fpr *ftmp; - - ftmp = (fpr *)tmp; - for (;;) { - /* - * Signature produces short vectors s1 and s2. The - * signature is acceptable only if the aggregate vector - * s1,s2 is short; we must use the same bound as the - * verifier. - * - * If the signature is acceptable, then we return only s2 - * (the verifier recomputes s1 from s2, the hashed message, - * and the public key). - */ - sampler_context spc; - samplerZ samp; - void *samp_ctx; - - /* - * Normal sampling. We use a fast PRNG seeded from our - * SHAKE context ('rng'). - */ - spc.sigma_min = (logn == 10) - ? fpr_sigma_min_10 - : fpr_sigma_min_9; - Zf(prng_init)(&spc.p, rng); - samp = Zf(sampler); - samp_ctx = &spc; - - /* - * Do the actual signature. 
- */ - if (do_sign_tree(samp, samp_ctx, sig, - expanded_key, hm, logn, ftmp)) - { - break; - } - } -} - -/* see inner.h */ -void -Zf(sign_dyn)(int16_t *sig, inner_shake256_context *rng, - const int8_t *restrict f, const int8_t *restrict g, - const int8_t *restrict F, const int8_t *restrict G, - const uint16_t *hm, unsigned logn, uint8_t *tmp) -{ - fpr *ftmp; - - ftmp = (fpr *)tmp; - for (;;) { - /* - * Signature produces short vectors s1 and s2. The - * signature is acceptable only if the aggregate vector - * s1,s2 is short; we must use the same bound as the - * verifier. - * - * If the signature is acceptable, then we return only s2 - * (the verifier recomputes s1 from s2, the hashed message, - * and the public key). - */ - sampler_context spc; - samplerZ samp; - void *samp_ctx; - - /* - * Normal sampling. We use a fast PRNG seeded from our - * SHAKE context ('rng'). - */ - spc.sigma_min = (logn == 10) - ? fpr_sigma_min_10 - : fpr_sigma_min_9; - Zf(prng_init)(&spc.p, rng); - samp = Zf(sampler); - samp_ctx = &spc; - - /* - * Do the actual signature. - */ - if (do_sign_dyn(samp, samp_ctx, sig, - f, g, F, G, hm, logn, ftmp)) - { - break; - } - } -} diff --git a/crypto_sign/falcon-512/m4-ct/vrfy.c b/crypto_sign/falcon-512/m4-ct/vrfy.c deleted file mode 100644 index c74a3dd3..00000000 --- a/crypto_sign/falcon-512/m4-ct/vrfy.c +++ /dev/null @@ -1,871 +0,0 @@ -/* - * Falcon signature verification. 
- * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include "inner.h" - -/* ===================================================================== */ -/* - * Constants for NTT. - * - * n = 2^logn (2 <= n <= 1024) - * phi = X^n + 1 - * q = 12289 - * q0i = -1/q mod 2^16 - * R = 2^16 mod q - * R2 = 2^32 mod q - */ - -#define Q 12289 -#define Q0I 12287 -#define R 4091 -#define R2 10952 - -/* - * Table for NTT, binary case: - * GMb[x] = R*(g^rev(x)) mod q - * where g = 7 (it is a 2048-th primitive root of 1 modulo q) - * and rev() is the bit-reversal function over 10 bits. 
- */ -static const uint16_t GMb[] = { - 4091, 7888, 11060, 11208, 6960, 4342, 6275, 9759, - 1591, 6399, 9477, 5266, 586, 5825, 7538, 9710, - 1134, 6407, 1711, 965, 7099, 7674, 3743, 6442, - 10414, 8100, 1885, 1688, 1364, 10329, 10164, 9180, - 12210, 6240, 997, 117, 4783, 4407, 1549, 7072, - 2829, 6458, 4431, 8877, 7144, 2564, 5664, 4042, - 12189, 432, 10751, 1237, 7610, 1534, 3983, 7863, - 2181, 6308, 8720, 6570, 4843, 1690, 14, 3872, - 5569, 9368, 12163, 2019, 7543, 2315, 4673, 7340, - 1553, 1156, 8401, 11389, 1020, 2967, 10772, 7045, - 3316, 11236, 5285, 11578, 10637, 10086, 9493, 6180, - 9277, 6130, 3323, 883, 10469, 489, 1502, 2851, - 11061, 9729, 2742, 12241, 4970, 10481, 10078, 1195, - 730, 1762, 3854, 2030, 5892, 10922, 9020, 5274, - 9179, 3604, 3782, 10206, 3180, 3467, 4668, 2446, - 7613, 9386, 834, 7703, 6836, 3403, 5351, 12276, - 3580, 1739, 10820, 9787, 10209, 4070, 12250, 8525, - 10401, 2749, 7338, 10574, 6040, 943, 9330, 1477, - 6865, 9668, 3585, 6633, 12145, 4063, 3684, 7680, - 8188, 6902, 3533, 9807, 6090, 727, 10099, 7003, - 6945, 1949, 9731, 10559, 6057, 378, 7871, 8763, - 8901, 9229, 8846, 4551, 9589, 11664, 7630, 8821, - 5680, 4956, 6251, 8388, 10156, 8723, 2341, 3159, - 1467, 5460, 8553, 7783, 2649, 2320, 9036, 6188, - 737, 3698, 4699, 5753, 9046, 3687, 16, 914, - 5186, 10531, 4552, 1964, 3509, 8436, 7516, 5381, - 10733, 3281, 7037, 1060, 2895, 7156, 8887, 5357, - 6409, 8197, 2962, 6375, 5064, 6634, 5625, 278, - 932, 10229, 8927, 7642, 351, 9298, 237, 5858, - 7692, 3146, 12126, 7586, 2053, 11285, 3802, 5204, - 4602, 1748, 11300, 340, 3711, 4614, 300, 10993, - 5070, 10049, 11616, 12247, 7421, 10707, 5746, 5654, - 3835, 5553, 1224, 8476, 9237, 3845, 250, 11209, - 4225, 6326, 9680, 12254, 4136, 2778, 692, 8808, - 6410, 6718, 10105, 10418, 3759, 7356, 11361, 8433, - 6437, 3652, 6342, 8978, 5391, 2272, 6476, 7416, - 8418, 10824, 11986, 5733, 876, 7030, 2167, 2436, - 3442, 9217, 8206, 4858, 5964, 2746, 7178, 1434, - 7389, 8879, 10661, 11457, 4220, 
1432, 10832, 4328, - 8557, 1867, 9454, 2416, 3816, 9076, 686, 5393, - 2523, 4339, 6115, 619, 937, 2834, 7775, 3279, - 2363, 7488, 6112, 5056, 824, 10204, 11690, 1113, - 2727, 9848, 896, 2028, 5075, 2654, 10464, 7884, - 12169, 5434, 3070, 6400, 9132, 11672, 12153, 4520, - 1273, 9739, 11468, 9937, 10039, 9720, 2262, 9399, - 11192, 315, 4511, 1158, 6061, 6751, 11865, 357, - 7367, 4550, 983, 8534, 8352, 10126, 7530, 9253, - 4367, 5221, 3999, 8777, 3161, 6990, 4130, 11652, - 3374, 11477, 1753, 292, 8681, 2806, 10378, 12188, - 5800, 11811, 3181, 1988, 1024, 9340, 2477, 10928, - 4582, 6750, 3619, 5503, 5233, 2463, 8470, 7650, - 7964, 6395, 1071, 1272, 3474, 11045, 3291, 11344, - 8502, 9478, 9837, 1253, 1857, 6233, 4720, 11561, - 6034, 9817, 3339, 1797, 2879, 6242, 5200, 2114, - 7962, 9353, 11363, 5475, 6084, 9601, 4108, 7323, - 10438, 9471, 1271, 408, 6911, 3079, 360, 8276, - 11535, 9156, 9049, 11539, 850, 8617, 784, 7919, - 8334, 12170, 1846, 10213, 12184, 7827, 11903, 5600, - 9779, 1012, 721, 2784, 6676, 6552, 5348, 4424, - 6816, 8405, 9959, 5150, 2356, 5552, 5267, 1333, - 8801, 9661, 7308, 5788, 4910, 909, 11613, 4395, - 8238, 6686, 4302, 3044, 2285, 12249, 1963, 9216, - 4296, 11918, 695, 4371, 9793, 4884, 2411, 10230, - 2650, 841, 3890, 10231, 7248, 8505, 11196, 6688, - 4059, 6060, 3686, 4722, 11853, 5816, 7058, 6868, - 11137, 7926, 4894, 12284, 4102, 3908, 3610, 6525, - 7938, 7982, 11977, 6755, 537, 4562, 1623, 8227, - 11453, 7544, 906, 11816, 9548, 10858, 9703, 2815, - 11736, 6813, 6979, 819, 8903, 6271, 10843, 348, - 7514, 8339, 6439, 694, 852, 5659, 2781, 3716, - 11589, 3024, 1523, 8659, 4114, 10738, 3303, 5885, - 2978, 7289, 11884, 9123, 9323, 11830, 98, 2526, - 2116, 4131, 11407, 1844, 3645, 3916, 8133, 2224, - 10871, 8092, 9651, 5989, 7140, 8480, 1670, 159, - 10923, 4918, 128, 7312, 725, 9157, 5006, 6393, - 3494, 6043, 10972, 6181, 11838, 3423, 10514, 7668, - 3693, 6658, 6905, 11953, 10212, 11922, 9101, 8365, - 5110, 45, 2400, 1921, 4377, 2720, 1695, 51, - 
2808, 650, 1896, 9997, 9971, 11980, 8098, 4833, - 4135, 4257, 5838, 4765, 10985, 11532, 590, 12198, - 482, 12173, 2006, 7064, 10018, 3912, 12016, 10519, - 11362, 6954, 2210, 284, 5413, 6601, 3865, 10339, - 11188, 6231, 517, 9564, 11281, 3863, 1210, 4604, - 8160, 11447, 153, 7204, 5763, 5089, 9248, 12154, - 11748, 1354, 6672, 179, 5532, 2646, 5941, 12185, - 862, 3158, 477, 7279, 5678, 7914, 4254, 302, - 2893, 10114, 6890, 9560, 9647, 11905, 4098, 9824, - 10269, 1353, 10715, 5325, 6254, 3951, 1807, 6449, - 5159, 1308, 8315, 3404, 1877, 1231, 112, 6398, - 11724, 12272, 7286, 1459, 12274, 9896, 3456, 800, - 1397, 10678, 103, 7420, 7976, 936, 764, 632, - 7996, 8223, 8445, 7758, 10870, 9571, 2508, 1946, - 6524, 10158, 1044, 4338, 2457, 3641, 1659, 4139, - 4688, 9733, 11148, 3946, 2082, 5261, 2036, 11850, - 7636, 12236, 5366, 2380, 1399, 7720, 2100, 3217, - 10912, 8898, 7578, 11995, 2791, 1215, 3355, 2711, - 2267, 2004, 8568, 10176, 3214, 2337, 1750, 4729, - 4997, 7415, 6315, 12044, 4374, 7157, 4844, 211, - 8003, 10159, 9290, 11481, 1735, 2336, 5793, 9875, - 8192, 986, 7527, 1401, 870, 3615, 8465, 2756, - 9770, 2034, 10168, 3264, 6132, 54, 2880, 4763, - 11805, 3074, 8286, 9428, 4881, 6933, 1090, 10038, - 2567, 708, 893, 6465, 4962, 10024, 2090, 5718, - 10743, 780, 4733, 4623, 2134, 2087, 4802, 884, - 5372, 5795, 5938, 4333, 6559, 7549, 5269, 10664, - 4252, 3260, 5917, 10814, 5768, 9983, 8096, 7791, - 6800, 7491, 6272, 1907, 10947, 6289, 11803, 6032, - 11449, 1171, 9201, 7933, 2479, 7970, 11337, 7062, - 8911, 6728, 6542, 8114, 8828, 6595, 3545, 4348, - 4610, 2205, 6999, 8106, 5560, 10390, 9321, 2499, - 2413, 7272, 6881, 10582, 9308, 9437, 3554, 3326, - 5991, 11969, 3415, 12283, 9838, 12063, 4332, 7830, - 11329, 6605, 12271, 2044, 11611, 7353, 11201, 11582, - 3733, 8943, 9978, 1627, 7168, 3935, 5050, 2762, - 7496, 10383, 755, 1654, 12053, 4952, 10134, 4394, - 6592, 7898, 7497, 8904, 12029, 3581, 10748, 5674, - 10358, 4901, 7414, 8771, 710, 6764, 8462, 7193, - 5371, 7274, 
11084, 290, 7864, 6827, 11822, 2509, - 6578, 4026, 5807, 1458, 5721, 5762, 4178, 2105, - 11621, 4852, 8897, 2856, 11510, 9264, 2520, 8776, - 7011, 2647, 1898, 7039, 5950, 11163, 5488, 6277, - 9182, 11456, 633, 10046, 11554, 5633, 9587, 2333, - 7008, 7084, 5047, 7199, 9865, 8997, 569, 6390, - 10845, 9679, 8268, 11472, 4203, 1997, 2, 9331, - 162, 6182, 2000, 3649, 9792, 6363, 7557, 6187, - 8510, 9935, 5536, 9019, 3706, 12009, 1452, 3067, - 5494, 9692, 4865, 6019, 7106, 9610, 4588, 10165, - 6261, 5887, 2652, 10172, 1580, 10379, 4638, 9949 -}; - -/* - * Table for inverse NTT, binary case: - * iGMb[x] = R*((1/g)^rev(x)) mod q - * Since g = 7, 1/g = 8778 mod 12289. - */ -static const uint16_t iGMb[] = { - 4091, 4401, 1081, 1229, 2530, 6014, 7947, 5329, - 2579, 4751, 6464, 11703, 7023, 2812, 5890, 10698, - 3109, 2125, 1960, 10925, 10601, 10404, 4189, 1875, - 5847, 8546, 4615, 5190, 11324, 10578, 5882, 11155, - 8417, 12275, 10599, 7446, 5719, 3569, 5981, 10108, - 4426, 8306, 10755, 4679, 11052, 1538, 11857, 100, - 8247, 6625, 9725, 5145, 3412, 7858, 5831, 9460, - 5217, 10740, 7882, 7506, 12172, 11292, 6049, 79, - 13, 6938, 8886, 5453, 4586, 11455, 2903, 4676, - 9843, 7621, 8822, 9109, 2083, 8507, 8685, 3110, - 7015, 3269, 1367, 6397, 10259, 8435, 10527, 11559, - 11094, 2211, 1808, 7319, 48, 9547, 2560, 1228, - 9438, 10787, 11800, 1820, 11406, 8966, 6159, 3012, - 6109, 2796, 2203, 1652, 711, 7004, 1053, 8973, - 5244, 1517, 9322, 11269, 900, 3888, 11133, 10736, - 4949, 7616, 9974, 4746, 10270, 126, 2921, 6720, - 6635, 6543, 1582, 4868, 42, 673, 2240, 7219, - 1296, 11989, 7675, 8578, 11949, 989, 10541, 7687, - 7085, 8487, 1004, 10236, 4703, 163, 9143, 4597, - 6431, 12052, 2991, 11938, 4647, 3362, 2060, 11357, - 12011, 6664, 5655, 7225, 5914, 9327, 4092, 5880, - 6932, 3402, 5133, 9394, 11229, 5252, 9008, 1556, - 6908, 4773, 3853, 8780, 10325, 7737, 1758, 7103, - 11375, 12273, 8602, 3243, 6536, 7590, 8591, 11552, - 6101, 3253, 9969, 9640, 4506, 3736, 6829, 10822, - 9130, 9948, 
3566, 2133, 3901, 6038, 7333, 6609, - 3468, 4659, 625, 2700, 7738, 3443, 3060, 3388, - 3526, 4418, 11911, 6232, 1730, 2558, 10340, 5344, - 5286, 2190, 11562, 6199, 2482, 8756, 5387, 4101, - 4609, 8605, 8226, 144, 5656, 8704, 2621, 5424, - 10812, 2959, 11346, 6249, 1715, 4951, 9540, 1888, - 3764, 39, 8219, 2080, 2502, 1469, 10550, 8709, - 5601, 1093, 3784, 5041, 2058, 8399, 11448, 9639, - 2059, 9878, 7405, 2496, 7918, 11594, 371, 7993, - 3073, 10326, 40, 10004, 9245, 7987, 5603, 4051, - 7894, 676, 11380, 7379, 6501, 4981, 2628, 3488, - 10956, 7022, 6737, 9933, 7139, 2330, 3884, 5473, - 7865, 6941, 5737, 5613, 9505, 11568, 11277, 2510, - 6689, 386, 4462, 105, 2076, 10443, 119, 3955, - 4370, 11505, 3672, 11439, 750, 3240, 3133, 754, - 4013, 11929, 9210, 5378, 11881, 11018, 2818, 1851, - 4966, 8181, 2688, 6205, 6814, 926, 2936, 4327, - 10175, 7089, 6047, 9410, 10492, 8950, 2472, 6255, - 728, 7569, 6056, 10432, 11036, 2452, 2811, 3787, - 945, 8998, 1244, 8815, 11017, 11218, 5894, 4325, - 4639, 3819, 9826, 7056, 6786, 8670, 5539, 7707, - 1361, 9812, 2949, 11265, 10301, 9108, 478, 6489, - 101, 1911, 9483, 3608, 11997, 10536, 812, 8915, - 637, 8159, 5299, 9128, 3512, 8290, 7068, 7922, - 3036, 4759, 2163, 3937, 3755, 11306, 7739, 4922, - 11932, 424, 5538, 6228, 11131, 7778, 11974, 1097, - 2890, 10027, 2569, 2250, 2352, 821, 2550, 11016, - 7769, 136, 617, 3157, 5889, 9219, 6855, 120, - 4405, 1825, 9635, 7214, 10261, 11393, 2441, 9562, - 11176, 599, 2085, 11465, 7233, 6177, 4801, 9926, - 9010, 4514, 9455, 11352, 11670, 6174, 7950, 9766, - 6896, 11603, 3213, 8473, 9873, 2835, 10422, 3732, - 7961, 1457, 10857, 8069, 832, 1628, 3410, 4900, - 10855, 5111, 9543, 6325, 7431, 4083, 3072, 8847, - 9853, 10122, 5259, 11413, 6556, 303, 1465, 3871, - 4873, 5813, 10017, 6898, 3311, 5947, 8637, 5852, - 3856, 928, 4933, 8530, 1871, 2184, 5571, 5879, - 3481, 11597, 9511, 8153, 35, 2609, 5963, 8064, - 1080, 12039, 8444, 3052, 3813, 11065, 6736, 8454, - 2340, 7651, 1910, 10709, 2117, 9637, 
6402, 6028, - 2124, 7701, 2679, 5183, 6270, 7424, 2597, 6795, - 9222, 10837, 280, 8583, 3270, 6753, 2354, 3779, - 6102, 4732, 5926, 2497, 8640, 10289, 6107, 12127, - 2958, 12287, 10292, 8086, 817, 4021, 2610, 1444, - 5899, 11720, 3292, 2424, 5090, 7242, 5205, 5281, - 9956, 2702, 6656, 735, 2243, 11656, 833, 3107, - 6012, 6801, 1126, 6339, 5250, 10391, 9642, 5278, - 3513, 9769, 3025, 779, 9433, 3392, 7437, 668, - 10184, 8111, 6527, 6568, 10831, 6482, 8263, 5711, - 9780, 467, 5462, 4425, 11999, 1205, 5015, 6918, - 5096, 3827, 5525, 11579, 3518, 4875, 7388, 1931, - 6615, 1541, 8708, 260, 3385, 4792, 4391, 5697, - 7895, 2155, 7337, 236, 10635, 11534, 1906, 4793, - 9527, 7239, 8354, 5121, 10662, 2311, 3346, 8556, - 707, 1088, 4936, 678, 10245, 18, 5684, 960, - 4459, 7957, 226, 2451, 6, 8874, 320, 6298, - 8963, 8735, 2852, 2981, 1707, 5408, 5017, 9876, - 9790, 2968, 1899, 6729, 4183, 5290, 10084, 7679, - 7941, 8744, 5694, 3461, 4175, 5747, 5561, 3378, - 5227, 952, 4319, 9810, 4356, 3088, 11118, 840, - 6257, 486, 6000, 1342, 10382, 6017, 4798, 5489, - 4498, 4193, 2306, 6521, 1475, 6372, 9029, 8037, - 1625, 7020, 4740, 5730, 7956, 6351, 6494, 6917, - 11405, 7487, 10202, 10155, 7666, 7556, 11509, 1546, - 6571, 10199, 2265, 7327, 5824, 11396, 11581, 9722, - 2251, 11199, 5356, 7408, 2861, 4003, 9215, 484, - 7526, 9409, 12235, 6157, 9025, 2121, 10255, 2519, - 9533, 3824, 8674, 11419, 10888, 4762, 11303, 4097, - 2414, 6496, 9953, 10554, 808, 2999, 2130, 4286, - 12078, 7445, 5132, 7915, 245, 5974, 4874, 7292, - 7560, 10539, 9952, 9075, 2113, 3721, 10285, 10022, - 9578, 8934, 11074, 9498, 294, 4711, 3391, 1377, - 9072, 10189, 4569, 10890, 9909, 6923, 53, 4653, - 439, 10253, 7028, 10207, 8343, 1141, 2556, 7601, - 8150, 10630, 8648, 9832, 7951, 11245, 2131, 5765, - 10343, 9781, 2718, 1419, 4531, 3844, 4066, 4293, - 11657, 11525, 11353, 4313, 4869, 12186, 1611, 10892, - 11489, 8833, 2393, 15, 10830, 5003, 17, 565, - 5891, 12177, 11058, 10412, 8885, 3974, 10981, 7130, - 5840, 10482, 
8338, 6035, 6964, 1574, 10936, 2020, - 2465, 8191, 384, 2642, 2729, 5399, 2175, 9396, - 11987, 8035, 4375, 6611, 5010, 11812, 9131, 11427, - 104, 6348, 9643, 6757, 12110, 5617, 10935, 541, - 135, 3041, 7200, 6526, 5085, 12136, 842, 4129, - 7685, 11079, 8426, 1008, 2725, 11772, 6058, 1101, - 1950, 8424, 5688, 6876, 12005, 10079, 5335, 927, - 1770, 273, 8377, 2271, 5225, 10283, 116, 11807, - 91, 11699, 757, 1304, 7524, 6451, 8032, 8154, - 7456, 4191, 309, 2318, 2292, 10393, 11639, 9481, - 12238, 10594, 9569, 7912, 10368, 9889, 12244, 7179, - 3924, 3188, 367, 2077, 336, 5384, 5631, 8596, - 4621, 1775, 8866, 451, 6108, 1317, 6246, 8795, - 5896, 7283, 3132, 11564, 4977, 12161, 7371, 1366, - 12130, 10619, 3809, 5149, 6300, 2638, 4197, 1418, - 10065, 4156, 8373, 8644, 10445, 882, 8158, 10173, - 9763, 12191, 459, 2966, 3166, 405, 5000, 9311, - 6404, 8986, 1551, 8175, 3630, 10766, 9265, 700, - 8573, 9508, 6630, 11437, 11595, 5850, 3950, 4775, - 11941, 1446, 6018, 3386, 11470, 5310, 5476, 553, - 9474, 2586, 1431, 2741, 473, 11383, 4745, 836, - 4062, 10666, 7727, 11752, 5534, 312, 4307, 4351, - 5764, 8679, 8381, 8187, 5, 7395, 4363, 1152, - 5421, 5231, 6473, 436, 7567, 8603, 6229, 8230 -}; - -/* - * Reduce a small signed integer modulo q. The source integer MUST - * be between -q/2 and +q/2. - */ -static inline uint32_t -mq_conv_small(int x) -{ - /* - * If x < 0, the cast to uint32_t will set the high bit to 1. - */ - uint32_t y; - - y = (uint32_t)x; - y += Q & -(y >> 31); - return y; -} - -/* - * Addition modulo q. Operands must be in the 0..q-1 range. - */ -static inline uint32_t -mq_add(uint32_t x, uint32_t y) -{ - /* - * We compute x + y - q. If the result is negative, then the - * high bit will be set, and 'd >> 31' will be equal to 1; - * thus '-(d >> 31)' will be an all-one pattern. Otherwise, - * it will be an all-zero pattern. In other words, this - * implements a conditional addition of q. 
- */ - uint32_t d; - - d = x + y - Q; - d += Q & -(d >> 31); - return d; -} - -/* - * Subtraction modulo q. Operands must be in the 0..q-1 range. - */ -static inline uint32_t -mq_sub(uint32_t x, uint32_t y) -{ - /* - * As in mq_add(), we use a conditional addition to ensure the - * result is in the 0..q-1 range. - */ - uint32_t d; - - d = x - y; - d += Q & -(d >> 31); - return d; -} - -/* - * Division by 2 modulo q. Operand must be in the 0..q-1 range. - */ -static inline uint32_t -mq_rshift1(uint32_t x) -{ - x += Q & -(x & 1); - return (x >> 1); -} - -/* - * Montgomery multiplication modulo q. If we set R = 2^16 mod q, then - * this function computes: x * y / R mod q - * Operands must be in the 0..q-1 range. - */ -static inline uint32_t -mq_montymul(uint32_t x, uint32_t y) -{ - uint32_t z, w; - - /* - * We compute x*y + k*q with a value of k chosen so that the 16 - * low bits of the result are 0. We can then shift the value. - * After the shift, result may still be larger than q, but it - * will be lower than 2*q, so a conditional subtraction works. - */ - - z = x * y; - w = ((z * Q0I) & 0xFFFF) * Q; - - /* - * When adding z and w, the result will have its low 16 bits - * equal to 0. Since x, y and z are lower than q, the sum will - * be no more than (2^15 - 1) * q + (q - 1)^2, which will - * fit on 29 bits. - */ - z = (z + w) >> 16; - - /* - * After the shift, analysis shows that the value will be less - * than 2q. We do a subtraction then conditional subtraction to - * ensure the result is in the expected range. - */ - z -= Q; - z += Q & -(z >> 31); - return z; -} - -/* - * Montgomery squaring (computes (x^2)/R). - */ -static inline uint32_t -mq_montysqr(uint32_t x) -{ - return mq_montymul(x, x); -} - -/* - * Divide x by y modulo q = 12289. - */ -static inline uint32_t -mq_div_12289(uint32_t x, uint32_t y) -{ - /* - * We invert y by computing y^(q-2) mod q. 
- * - * We use the following addition chain for exponent e = 12287: - * - * e0 = 1 - * e1 = 2 * e0 = 2 - * e2 = e1 + e0 = 3 - * e3 = e2 + e1 = 5 - * e4 = 2 * e3 = 10 - * e5 = 2 * e4 = 20 - * e6 = 2 * e5 = 40 - * e7 = 2 * e6 = 80 - * e8 = 2 * e7 = 160 - * e9 = e8 + e2 = 163 - * e10 = e9 + e8 = 323 - * e11 = 2 * e10 = 646 - * e12 = 2 * e11 = 1292 - * e13 = e12 + e9 = 1455 - * e14 = 2 * e13 = 2910 - * e15 = 2 * e14 = 5820 - * e16 = e15 + e10 = 6143 - * e17 = 2 * e16 = 12286 - * e18 = e17 + e0 = 12287 - * - * Additions on exponents are converted to Montgomery - * multiplications. We define all intermediate results as so - * many local variables, and let the C compiler work out which - * must be kept around. - */ - uint32_t y0, y1, y2, y3, y4, y5, y6, y7, y8, y9; - uint32_t y10, y11, y12, y13, y14, y15, y16, y17, y18; - - y0 = mq_montymul(y, R2); - y1 = mq_montysqr(y0); - y2 = mq_montymul(y1, y0); - y3 = mq_montymul(y2, y1); - y4 = mq_montysqr(y3); - y5 = mq_montysqr(y4); - y6 = mq_montysqr(y5); - y7 = mq_montysqr(y6); - y8 = mq_montysqr(y7); - y9 = mq_montymul(y8, y2); - y10 = mq_montymul(y9, y8); - y11 = mq_montysqr(y10); - y12 = mq_montysqr(y11); - y13 = mq_montymul(y12, y9); - y14 = mq_montysqr(y13); - y15 = mq_montysqr(y14); - y16 = mq_montymul(y15, y10); - y17 = mq_montysqr(y16); - y18 = mq_montymul(y17, y0); - - /* - * Final multiplication with x, which is not in Montgomery - * representation, computes the correct division result. - */ - return mq_montymul(y18, x); -} - -/* - * Compute NTT on a ring element. 
- */ -static void -mq_NTT(uint16_t *a, unsigned logn) -{ - size_t n, t, m; - - n = (size_t)1 << logn; - t = n; - for (m = 1; m < n; m <<= 1) { - size_t ht, i, j1; - - ht = t >> 1; - for (i = 0, j1 = 0; i < m; i ++, j1 += t) { - size_t j, j2; - uint32_t s; - - s = GMb[m + i]; - j2 = j1 + ht; - for (j = j1; j < j2; j ++) { - uint32_t u, v; - - u = a[j]; - v = mq_montymul(a[j + ht], s); - a[j] = (uint16_t)mq_add(u, v); - a[j + ht] = (uint16_t)mq_sub(u, v); - } - } - t = ht; - } -} - -/* - * Compute the inverse NTT on a ring element, binary case. - */ -static void -mq_iNTT(uint16_t *a, unsigned logn) -{ - size_t n, t, m; - uint32_t ni; - - n = (size_t)1 << logn; - t = 1; - m = n; - while (m > 1) { - size_t hm, dt, i, j1; - - hm = m >> 1; - dt = t << 1; - for (i = 0, j1 = 0; i < hm; i ++, j1 += dt) { - size_t j, j2; - uint32_t s; - - j2 = j1 + t; - s = iGMb[hm + i]; - for (j = j1; j < j2; j ++) { - uint32_t u, v, w; - - u = a[j]; - v = a[j + t]; - a[j] = (uint16_t)mq_add(u, v); - w = mq_sub(u, v); - a[j + t] = (uint16_t) - mq_montymul(w, s); - } - } - t = dt; - m = hm; - } - - /* - * To complete the inverse NTT, we must now divide all values by - * n (the vector size). We thus need the inverse of n, i.e. we - * need to divide 1 by 2 logn times. But we also want it in - * Montgomery representation, i.e. we also want to multiply it - * by R = 2^16. In the common case, this should be a simple right - * shift. The loop below is generic and works also in corner cases; - * its computation time is negligible. - */ - ni = R; - for (m = n; m > 1; m >>= 1) { - ni = mq_rshift1(ni); - } - for (m = 0; m < n; m ++) { - a[m] = (uint16_t)mq_montymul(a[m], ni); - } -} - -/* - * Convert a polynomial (mod q) to Montgomery representation. 
- */ -static void -mq_poly_tomonty(uint16_t *f, unsigned logn) -{ - size_t u, n; - - n = (size_t)1 << logn; - for (u = 0; u < n; u ++) { - f[u] = (uint16_t)mq_montymul(f[u], R2); - } -} - -/* - * Multiply two polynomials together (NTT representation, and using - * a Montgomery multiplication). Result f*g is written over f. - */ -static void -mq_poly_montymul_ntt(uint16_t *f, const uint16_t *g, unsigned logn) -{ - size_t u, n; - - n = (size_t)1 << logn; - for (u = 0; u < n; u ++) { - f[u] = (uint16_t)mq_montymul(f[u], g[u]); - } -} - -/* - * Subtract polynomial g from polynomial f. - */ -static void -mq_poly_sub(uint16_t *f, const uint16_t *g, unsigned logn) -{ - size_t u, n; - - n = (size_t)1 << logn; - for (u = 0; u < n; u ++) { - f[u] = (uint16_t)mq_sub(f[u], g[u]); - } -} - -/* ===================================================================== */ - -/* see inner.h */ -void -Zf(to_ntt_monty)(uint16_t *h, unsigned logn) -{ - mq_NTT(h, logn); - mq_poly_tomonty(h, logn); -} - -/* see inner.h */ -int -Zf(verify_raw)(const uint16_t *c0, const int16_t *s2, - const uint16_t *h, unsigned logn, uint8_t *tmp) -{ - size_t u, n; - uint16_t *tt; - - n = (size_t)1 << logn; - tt = (uint16_t *)tmp; - - /* - * Reduce s2 elements modulo q ([0..q-1] range). - */ - for (u = 0; u < n; u ++) { - uint32_t w; - - w = (uint32_t)s2[u]; - w += Q & -(w >> 31); - tt[u] = (uint16_t)w; - } - - /* - * Compute -s1 = s2*h - c0 mod phi mod q (in tt[]). - */ - mq_NTT(tt, logn); - mq_poly_montymul_ntt(tt, h, logn); - mq_iNTT(tt, logn); - mq_poly_sub(tt, c0, logn); - - /* - * Normalize -s1 elements into the [-q/2..q/2] range. - */ - for (u = 0; u < n; u ++) { - int32_t w; - - w = (int32_t)tt[u]; - w -= (int32_t)(Q & -(((Q >> 1) - (uint32_t)w) >> 31)); - ((int16_t *)tt)[u] = (int16_t)w; - } - - /* - * Signature is valid if and only if the aggregate (-s1,s2) vector - * is short enough. 
- */ - return Zf(is_short)((int16_t *)tt, s2, logn); -} - -/* see inner.h */ -int -Zf(compute_public)(uint16_t *h, - const int8_t *f, const int8_t *g, unsigned logn, uint8_t *tmp) -{ - size_t u, n; - uint16_t *tt; - - n = (size_t)1 << logn; - tt = (uint16_t *)tmp; - for (u = 0; u < n; u ++) { - tt[u] = (uint16_t)mq_conv_small(f[u]); - h[u] = (uint16_t)mq_conv_small(g[u]); - } - mq_NTT(h, logn); - mq_NTT(tt, logn); - for (u = 0; u < n; u ++) { - if (tt[u] == 0) { - return 0; - } - h[u] = (uint16_t)mq_div_12289(h[u], tt[u]); - } - mq_iNTT(h, logn); - return 1; -} - -/* see inner.h */ -int -Zf(complete_private)(int8_t *G, - const int8_t *f, const int8_t *g, const int8_t *F, - unsigned logn, uint8_t *tmp) -{ - size_t u, n; - uint16_t *t1, *t2; - - n = (size_t)1 << logn; - t1 = (uint16_t *)tmp; - t2 = t1 + n; - for (u = 0; u < n; u ++) { - t1[u] = (uint16_t)mq_conv_small(g[u]); - t2[u] = (uint16_t)mq_conv_small(F[u]); - } - mq_NTT(t1, logn); - mq_NTT(t2, logn); - mq_poly_tomonty(t1, logn); - mq_poly_montymul_ntt(t1, t2, logn); - for (u = 0; u < n; u ++) { - t2[u] = (uint16_t)mq_conv_small(f[u]); - } - mq_NTT(t2, logn); - for (u = 0; u < n; u ++) { - if (t2[u] == 0) { - return 0; - } - t1[u] = (uint16_t)mq_div_12289(t1[u], t2[u]); - } - mq_iNTT(t1, logn); - for (u = 0; u < n; u ++) { - uint32_t w; - int32_t gi; - - w = t1[u]; - w -= (Q & ~-((w - (Q >> 1)) >> 31)); - gi = *(int32_t *)&w; - if (gi < -127 || gi > +127) { - return 0; - } - G[u] = (int8_t)gi; - } - return 1; -} - -/* see inner.h */ -int -Zf(is_invertible)( - const int16_t *s2, unsigned logn, uint8_t *tmp) -{ - size_t u, n; - uint16_t *tt; - uint32_t r; - - n = (size_t)1 << logn; - tt = (uint16_t *)tmp; - for (u = 0; u < n; u ++) { - uint32_t w; - - w = (uint32_t)s2[u]; - w += Q & -(w >> 31); - tt[u] = (uint16_t)w; - } - mq_NTT(tt, logn); - r = 0; - for (u = 0; u < n; u ++) { - r |= (uint32_t)(tt[u] - 1); - } - return (int)(1u - (r >> 31)); -} - -/* see inner.h */ -int -Zf(verify_recover)(uint16_t *h, - const 
uint16_t *c0, const int16_t *s1, const int16_t *s2, - unsigned logn, uint8_t *tmp) -{ - size_t u, n; - uint16_t *tt; - uint32_t r; - - n = (size_t)1 << logn; - - /* - * Reduce elements of s1 and s2 modulo q; then write s2 into tt[] - * and c0 - s1 into h[]. - */ - tt = (uint16_t *)tmp; - for (u = 0; u < n; u ++) { - uint32_t w; - - w = (uint32_t)s2[u]; - w += Q & -(w >> 31); - tt[u] = (uint16_t)w; - - w = (uint32_t)s1[u]; - w += Q & -(w >> 31); - w = mq_sub(c0[u], w); - h[u] = (uint16_t)w; - } - - /* - * Compute h = (c0 - s1) / s2. If one of the coefficients of s2 - * is zero (in NTT representation) then the operation fails. We - * keep that information into a flag so that we do not deviate - * from strict constant-time processing; if all coefficients of - * s2 are non-zero, then the high bit of r will be zero. - */ - mq_NTT(tt, logn); - mq_NTT(h, logn); - r = 0; - for (u = 0; u < n; u ++) { - r |= (uint32_t)(tt[u] - 1); - h[u] = (uint16_t)mq_div_12289(h[u], tt[u]); - } - mq_iNTT(h, logn); - - /* - * Signature is acceptable if and only if it is short enough, - * and s2 was invertible mod phi mod q. The caller must still - * check that the rebuilt public key matches the expected - * value (e.g. through a hash). 
- */ - r = ~r & (uint32_t)-Zf(is_short)(s1, s2, logn); - return (int)(r >> 31); -} - -/* see inner.h */ -int -Zf(count_nttzero)(const int16_t *sig, unsigned logn, uint8_t *tmp) -{ - uint16_t *s2; - size_t u, n; - uint32_t r; - - n = (size_t)1 << logn; - s2 = (uint16_t *)tmp; - for (u = 0; u < n; u ++) { - uint32_t w; - - w = (uint32_t)sig[u]; - w += Q & -(w >> 31); - s2[u] = (uint16_t)w; - } - mq_NTT(s2, logn); - r = 0; - for (u = 0; u < n; u ++) { - uint32_t w; - - w = (uint32_t)s2[u] - 1u; - r += (w >> 31); - } - return (int)r; -} diff --git a/crypto_sign/fndsa_provisional-1024/m4f/LICENSE b/crypto_sign/fndsa_provisional-1024/m4f/LICENSE new file mode 120000 index 00000000..a8b0b647 --- /dev/null +++ b/crypto_sign/fndsa_provisional-1024/m4f/LICENSE @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/LICENSE \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/api.c b/crypto_sign/fndsa_provisional-1024/m4f/api.c new file mode 120000 index 00000000..925a7669 --- /dev/null +++ b/crypto_sign/fndsa_provisional-1024/m4f/api.c @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/api.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/api.h b/crypto_sign/fndsa_provisional-1024/m4f/api.h new file mode 120000 index 00000000..a128187e --- /dev/null +++ b/crypto_sign/fndsa_provisional-1024/m4f/api.h @@ -0,0 +1 @@ +../../../mupq/crypto_sign/fndsa_provisional-1024/ref/api.h \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/archflags.h b/crypto_sign/fndsa_provisional-1024/m4f/archflags.h new file mode 120000 index 00000000..5995cb74 --- /dev/null +++ b/crypto_sign/fndsa_provisional-1024/m4f/archflags.h @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/archflags.h \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/codec.c b/crypto_sign/fndsa_provisional-1024/m4f/codec.c new file mode 120000 index 00000000..f24de227 --- /dev/null +++ 
b/crypto_sign/fndsa_provisional-1024/m4f/codec.c @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/codec.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/codec_cm4.s b/crypto_sign/fndsa_provisional-1024/m4f/codec_cm4.s new file mode 120000 index 00000000..0c44b8f4 --- /dev/null +++ b/crypto_sign/fndsa_provisional-1024/m4f/codec_cm4.s @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/codec_cm4.s \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/fndsa.h b/crypto_sign/fndsa_provisional-1024/m4f/fndsa.h new file mode 120000 index 00000000..9b987bbd --- /dev/null +++ b/crypto_sign/fndsa_provisional-1024/m4f/fndsa.h @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/fndsa.h \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/inner.h b/crypto_sign/fndsa_provisional-1024/m4f/inner.h new file mode 120000 index 00000000..ccf790d2 --- /dev/null +++ b/crypto_sign/fndsa_provisional-1024/m4f/inner.h @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/inner.h \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/kgen.c b/crypto_sign/fndsa_provisional-1024/m4f/kgen.c new file mode 120000 index 00000000..905593b2 --- /dev/null +++ b/crypto_sign/fndsa_provisional-1024/m4f/kgen.c @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/kgen.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/kgen_fxp.c b/crypto_sign/fndsa_provisional-1024/m4f/kgen_fxp.c new file mode 120000 index 00000000..5a42ccfd --- /dev/null +++ b/crypto_sign/fndsa_provisional-1024/m4f/kgen_fxp.c @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/kgen_fxp.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/kgen_gauss.c b/crypto_sign/fndsa_provisional-1024/m4f/kgen_gauss.c new file mode 120000 index 00000000..56f96573 --- /dev/null +++ b/crypto_sign/fndsa_provisional-1024/m4f/kgen_gauss.c @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/kgen_gauss.c \ No newline at end 
of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/kgen_inner.h b/crypto_sign/fndsa_provisional-1024/m4f/kgen_inner.h new file mode 120000 index 00000000..1fc8ceef --- /dev/null +++ b/crypto_sign/fndsa_provisional-1024/m4f/kgen_inner.h @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/kgen_inner.h \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/kgen_mp31.c b/crypto_sign/fndsa_provisional-1024/m4f/kgen_mp31.c new file mode 120000 index 00000000..f8c33ba5 --- /dev/null +++ b/crypto_sign/fndsa_provisional-1024/m4f/kgen_mp31.c @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/kgen_mp31.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/kgen_ntru.c b/crypto_sign/fndsa_provisional-1024/m4f/kgen_ntru.c new file mode 120000 index 00000000..ba0388c9 --- /dev/null +++ b/crypto_sign/fndsa_provisional-1024/m4f/kgen_ntru.c @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/kgen_ntru.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/kgen_poly.c b/crypto_sign/fndsa_provisional-1024/m4f/kgen_poly.c new file mode 120000 index 00000000..05fd362f --- /dev/null +++ b/crypto_sign/fndsa_provisional-1024/m4f/kgen_poly.c @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/kgen_poly.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/kgen_zint31.c b/crypto_sign/fndsa_provisional-1024/m4f/kgen_zint31.c new file mode 120000 index 00000000..3cb38053 --- /dev/null +++ b/crypto_sign/fndsa_provisional-1024/m4f/kgen_zint31.c @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/kgen_zint31.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/mq.c b/crypto_sign/fndsa_provisional-1024/m4f/mq.c new file mode 120000 index 00000000..a351025f --- /dev/null +++ b/crypto_sign/fndsa_provisional-1024/m4f/mq.c @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/mq.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/mq_cm4.s 
b/crypto_sign/fndsa_provisional-1024/m4f/mq_cm4.s new file mode 120000 index 00000000..f0f60a44 --- /dev/null +++ b/crypto_sign/fndsa_provisional-1024/m4f/mq_cm4.s @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/mq_cm4.s \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/sha3.c b/crypto_sign/fndsa_provisional-1024/m4f/sha3.c new file mode 120000 index 00000000..933b2bda --- /dev/null +++ b/crypto_sign/fndsa_provisional-1024/m4f/sha3.c @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/sha3.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/sha3_cm4.s b/crypto_sign/fndsa_provisional-1024/m4f/sha3_cm4.s new file mode 120000 index 00000000..d1d08be8 --- /dev/null +++ b/crypto_sign/fndsa_provisional-1024/m4f/sha3_cm4.s @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/sha3_cm4.s \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/sign.c b/crypto_sign/fndsa_provisional-1024/m4f/sign.c new file mode 120000 index 00000000..3dbf03a2 --- /dev/null +++ b/crypto_sign/fndsa_provisional-1024/m4f/sign.c @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/sign.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/sign_core.c b/crypto_sign/fndsa_provisional-1024/m4f/sign_core.c new file mode 120000 index 00000000..5bb17b4b --- /dev/null +++ b/crypto_sign/fndsa_provisional-1024/m4f/sign_core.c @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/sign_core.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/sign_fpoly.c b/crypto_sign/fndsa_provisional-1024/m4f/sign_fpoly.c new file mode 120000 index 00000000..a0b083ce --- /dev/null +++ b/crypto_sign/fndsa_provisional-1024/m4f/sign_fpoly.c @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/sign_fpoly.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/sign_fpr.c b/crypto_sign/fndsa_provisional-1024/m4f/sign_fpr.c new file mode 120000 index 00000000..dc3b5cd0 --- /dev/null +++ 
b/crypto_sign/fndsa_provisional-1024/m4f/sign_fpr.c @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/sign_fpr.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/sign_fpr_cm4.s b/crypto_sign/fndsa_provisional-1024/m4f/sign_fpr_cm4.s new file mode 120000 index 00000000..4cecbb3b --- /dev/null +++ b/crypto_sign/fndsa_provisional-1024/m4f/sign_fpr_cm4.s @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/sign_fpr_cm4.s \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/sign_inner.h b/crypto_sign/fndsa_provisional-1024/m4f/sign_inner.h new file mode 120000 index 00000000..3a34addb --- /dev/null +++ b/crypto_sign/fndsa_provisional-1024/m4f/sign_inner.h @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/sign_inner.h \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/sign_sampler.c b/crypto_sign/fndsa_provisional-1024/m4f/sign_sampler.c new file mode 120000 index 00000000..f402644f --- /dev/null +++ b/crypto_sign/fndsa_provisional-1024/m4f/sign_sampler.c @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/sign_sampler.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/sign_sampler_cm4.s b/crypto_sign/fndsa_provisional-1024/m4f/sign_sampler_cm4.s new file mode 120000 index 00000000..bfde9bed --- /dev/null +++ b/crypto_sign/fndsa_provisional-1024/m4f/sign_sampler_cm4.s @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/sign_sampler_cm4.s \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/sysrng.c b/crypto_sign/fndsa_provisional-1024/m4f/sysrng.c new file mode 120000 index 00000000..dfb4edcb --- /dev/null +++ b/crypto_sign/fndsa_provisional-1024/m4f/sysrng.c @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/sysrng.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/util.c b/crypto_sign/fndsa_provisional-1024/m4f/util.c new file mode 120000 index 00000000..bcc36e58 --- /dev/null +++ 
b/crypto_sign/fndsa_provisional-1024/m4f/util.c @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/util.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-1024/m4f/vrfy.c b/crypto_sign/fndsa_provisional-1024/m4f/vrfy.c new file mode 120000 index 00000000..9b515e7e --- /dev/null +++ b/crypto_sign/fndsa_provisional-1024/m4f/vrfy.c @@ -0,0 +1 @@ +../../fndsa_provisional-512/m4f/vrfy.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-512/m4f/LICENSE b/crypto_sign/fndsa_provisional-512/m4f/LICENSE new file mode 120000 index 00000000..53ea2086 --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/LICENSE @@ -0,0 +1 @@ +../../../mupq/crypto_sign/fndsa_provisional-512/ref/LICENSE \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-512/m4f/api.c b/crypto_sign/fndsa_provisional-512/m4f/api.c new file mode 120000 index 00000000..42027e55 --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/api.c @@ -0,0 +1 @@ +../../../mupq/crypto_sign/fndsa_provisional-512/ref/api.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-512/m4f/api.h b/crypto_sign/fndsa_provisional-512/m4f/api.h new file mode 120000 index 00000000..7377f660 --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/api.h @@ -0,0 +1 @@ +../../../mupq/crypto_sign/fndsa_provisional-512/ref/api.h \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-512/m4f/archflags.h b/crypto_sign/fndsa_provisional-512/m4f/archflags.h new file mode 100644 index 00000000..99a4b205 --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/archflags.h @@ -0,0 +1,2 @@ +/* Architecture-specific flags (if any). 
*/ +#define FNDSA_ASM_CORTEXM4 1 diff --git a/crypto_sign/fndsa_provisional-512/m4f/codec.c b/crypto_sign/fndsa_provisional-512/m4f/codec.c new file mode 120000 index 00000000..706495e7 --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/codec.c @@ -0,0 +1 @@ +../../../mupq/crypto_sign/fndsa_provisional-512/ref/codec.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-512/m4f/codec_cm4.s b/crypto_sign/fndsa_provisional-512/m4f/codec_cm4.s new file mode 100644 index 00000000..cb80767d --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/codec_cm4.s @@ -0,0 +1,203 @@ + .syntax unified + .cpu cortex-m4 + .file "mq_cm4.s" + .text + + .equ Q, 12289 + +@ ======================================================================= +@ size_t fndsa_mqpoly_decode(unsigned logn, const uint8_t *f, uint16_t *h) +@ ======================================================================= + + .align 2 + .global fndsa_mqpoly_decode + .thumb + .thumb_func + .type fndsa_mqpoly_decode, %function +fndsa_mqpoly_decode: + push { r4, r5, r6, r7, r8, r10, r11 } + + @ ASSUMPTIONS: + @ - logn >= 2 (hence, n is a multiple of 4) + @ - output buffer is 32-bit aligned + @ We process input by chunks of 7 bytes, to produce 4 values. + @ TODO: try using chunks of 28 bytes when source is aligned; it + @ would avoid most unaligned penalties and save 1/8 of reads. + + @ r0 <- n = 2^logn + movs r3, #1 + lsl r0, r3, r0 + @ r11 <- original source pointer + mov r11, r1 + + @ r3 <- 0x3FFF:0x3FFF + movw r3, 0x3FFF + movt r3, 0x3FFF + @ r10 <- q:q + movw r10, #Q + movt r10, #Q + @ r12 <- 0xFFFFFFFF + @ If any value overflows, then bit 15 or 31 of r12 will be cleared. + mov r12, #0xFFFFFFFF + +fndsa_mqpoly_decode__L1: + @ Get next 7-byte value as integer r7:r5 with big-endian + @ interpretation. 
+ ldr r5, [r1], #3 + ldr r4, [r1], #4 + lsls r5, #8 + rev r5, r5 + rev r4, r4 + @ We assemble the 4 values in r6:r7 (packed 16-bit): + @ x0: r6<0,13> <- r5<10,23> + @ x1: r6<16,19> <- r4<28,31>, r6<20,29> <- r5<0,9> + @ x2: r7<0,13> <- r4<14,27> + @ x3: r7<16,29> <- r4<0,13> + ubfx r6, r5, #10, #14 + bfi r6, r5, #20, #10 + lsrs r7, r4, #28 + orr r6, r6, r7, lsl #16 + lsrs r7, r4, #14 + bfi r7, r4, #16, #14 + ands r6, r3 + ands r7, r3 + @ Update the overflow mask. + usub16 r8, r6, r10 + and r12, r12, r8 + usub16 r8, r7, r10 + and r12, r12, r8 + @ Store the extracted values. + strd r6, r7, [r2], #8 + @ Loop until all values have been decoded. + subs r0, #4 + bne fndsa_mqpoly_decode__L1 + + @ Get output value (number of consumed bytes). + @ Clamp it to 0 on overflow. + sub r0, r1, r11 + and r12, r12, r12, lsl #16 + and r0, r0, r12, asr #31 + + pop { r4, r5, r6, r7, r8, r10, r11 } + bx lr + .size fndsa_mqpoly_decode,.-fndsa_mqpoly_decode + +@ ======================================================================= +@ size_t fndsa_comp_decode(unsigned logn, +@ const uint8_t *d, size_t dlen, int16_t *s) +@ ======================================================================= + + .align 2 + .global fndsa_comp_decode + .thumb + .thumb_func + .type fndsa_comp_decode, %function +fndsa_comp_decode: + push { r4, r5, r6, r7 } + + @ r0 <- n = 2^logn + movs r4, #1 + lsl r0, r4, r0 + @ r2 <- upper bound for d + adds r2, r1 + + @ r4 acc + @ r5 acc_ptr + @ Unprocessed bits are in the top bits of acc. First unprocessed bit + @ is at index acc_ptr + 8. + eors r4, r4 + movs r5, #24 + +fndsa_comp_decode__L1: + @ Invariant: acc_ptr >= 17 (i.e. there are at most 7 unprocessed bits). + + @ Get next 8 bits. + cmp r1, r2 + beq fndsa_comp_decode__Lerr + ldrb r6, [r1], #1 + lsls r6, r5 + orrs r4, r6 + + @ r6 <- low 7 absolute value bits + @ r12 <- sign (word-extended) + ubfx r6, r4, #24, #7 + asr r12, r4, #31 + lsls r4, #8 + + @ We injected 8 bits then consumed 8 bits: acc_ptr is unmodified. 
+ + @ Locate next bit of value 1. If necessary, read one or two + @ extra bytes. Heuristically, values are small, so the fast + @ path is that the extra bit is already there. + cbz r4, fndsa_comp_decode__Lzb1 + clz r7, r4 +fndsa_comp_decode__L2: + @ There are r7 zeros, then a one. r7 <= 15. + add r6, r6, r7, lsl #7 + @ Consume the zeros and the final one. + adds r7, #1 + lsls r4, r7 + adds r5, r7 + @ Mantissa is in r6, sign in r12. Reject "minus zero" encoding, + @ i.e. r6 = 0 and r12 = -1 + orn r7, r6, r12 + cbz r7, fndsa_comp_decode__Lerr + @ We assemble the value in r6 + eor r6, r6, r12 + sub r6, r6, r12 + strh r6, [r3], #2 + + @ Loop until all values have been obtained. + subs r0, #1 + bne fndsa_comp_decode__L1 + + @ Check that remaining unused bits are zero (accumulator and + @ all unused bytes). + movs r0, #1 + cbnz r4, fndsa_comp_decode__Lerr + cmp r1, r2 + beq fndsa_comp_decode__Lexit +fndsa_comp_decode__L3: + ldrb r6, [r1], #1 + cbnz r6, fndsa_comp_decode__Lerr + cmp r1, r2 + bne fndsa_comp_decode__L3 +fndsa_comp_decode__Lexit: + pop { r4, r5, r6, r7 } + bx lr + +fndsa_comp_decode__Lzb1: + @ All currently buffered bits are zero, we must get an extra byte. + @ Get next byte. + cmp r1, r2 + beq fndsa_comp_decode__Lerr + ldrb r7, [r1], #1 + lsls r7, r5 + orrs r4, r7 + cbz r4, fndsa_comp_decode__Lzb2 + subs r5, #8 + clz r7, r4 + b fndsa_comp_decode__L2 + +fndsa_comp_decode__Lzb2: + @ All currently buffered bits are zero, and the next byte was + @ all-zeros too; we must get another byte. + cmp r1, r2 + beq fndsa_comp_decode__Lerr + ldrb r7, [r1], #1 + subs r5, #8 + lsls r7, r5 + orrs r4, r7 + cbz r4, fndsa_comp_decode__Lerr + subs r5, #8 + clz r7, r4 + @ Since we added two bytes and the accumulator already contained + @ up to 7 bits, then we may have up to 23 bits at this point, + @ hence r7 can be up to 22. Values greater than 15 are invalid. + cmp r7, #15 + bls fndsa_comp_decode__L2 + @ Fall through to error sequence. 
+fndsa_comp_decode__Lerr: + eors r0, r0 + b fndsa_comp_decode__Lexit + .size fndsa_comp_decode,.-fndsa_comp_decode diff --git a/crypto_sign/fndsa_provisional-512/m4f/fndsa.h b/crypto_sign/fndsa_provisional-512/m4f/fndsa.h new file mode 120000 index 00000000..52ec0735 --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/fndsa.h @@ -0,0 +1 @@ +../../../mupq/crypto_sign/fndsa_provisional-512/ref/fndsa.h \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-512/m4f/inner.h b/crypto_sign/fndsa_provisional-512/m4f/inner.h new file mode 120000 index 00000000..85121904 --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/inner.h @@ -0,0 +1 @@ +../../../mupq/crypto_sign/fndsa_provisional-512/ref/inner.h \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-512/m4f/kgen.c b/crypto_sign/fndsa_provisional-512/m4f/kgen.c new file mode 120000 index 00000000..74038217 --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/kgen.c @@ -0,0 +1 @@ +../../../mupq/crypto_sign/fndsa_provisional-512/ref/kgen.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-512/m4f/kgen_fxp.c b/crypto_sign/fndsa_provisional-512/m4f/kgen_fxp.c new file mode 120000 index 00000000..aba35701 --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/kgen_fxp.c @@ -0,0 +1 @@ +../../../mupq/crypto_sign/fndsa_provisional-512/ref/kgen_fxp.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-512/m4f/kgen_gauss.c b/crypto_sign/fndsa_provisional-512/m4f/kgen_gauss.c new file mode 120000 index 00000000..50228b17 --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/kgen_gauss.c @@ -0,0 +1 @@ +../../../mupq/crypto_sign/fndsa_provisional-512/ref/kgen_gauss.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-512/m4f/kgen_inner.h b/crypto_sign/fndsa_provisional-512/m4f/kgen_inner.h new file mode 120000 index 00000000..3359c80c --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/kgen_inner.h @@ -0,0 +1 
@@ +../../../mupq/crypto_sign/fndsa_provisional-512/ref/kgen_inner.h \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-512/m4f/kgen_mp31.c b/crypto_sign/fndsa_provisional-512/m4f/kgen_mp31.c new file mode 120000 index 00000000..0b7b5e55 --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/kgen_mp31.c @@ -0,0 +1 @@ +../../../mupq/crypto_sign/fndsa_provisional-512/ref/kgen_mp31.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-512/m4f/kgen_ntru.c b/crypto_sign/fndsa_provisional-512/m4f/kgen_ntru.c new file mode 120000 index 00000000..d58fdd5a --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/kgen_ntru.c @@ -0,0 +1 @@ +../../../mupq/crypto_sign/fndsa_provisional-512/ref/kgen_ntru.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-512/m4f/kgen_poly.c b/crypto_sign/fndsa_provisional-512/m4f/kgen_poly.c new file mode 120000 index 00000000..7af990e2 --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/kgen_poly.c @@ -0,0 +1 @@ +../../../mupq/crypto_sign/fndsa_provisional-512/ref/kgen_poly.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-512/m4f/kgen_zint31.c b/crypto_sign/fndsa_provisional-512/m4f/kgen_zint31.c new file mode 120000 index 00000000..b8f3907a --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/kgen_zint31.c @@ -0,0 +1 @@ +../../../mupq/crypto_sign/fndsa_provisional-512/ref/kgen_zint31.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-512/m4f/mq.c b/crypto_sign/fndsa_provisional-512/m4f/mq.c new file mode 120000 index 00000000..b881f59c --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/mq.c @@ -0,0 +1 @@ +../../../mupq/crypto_sign/fndsa_provisional-512/ref/mq.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-512/m4f/mq_cm4.s b/crypto_sign/fndsa_provisional-512/m4f/mq_cm4.s new file mode 100644 index 00000000..e9ba11df --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/mq_cm4.s @@ -0,0 
+1,758 @@ + .syntax unified + .cpu cortex-m4 + .file "mq_cm4.s" + .text + + .equ Q, 12289 + .equ Q1I, 4143984639 + .equ R, 10952 + .equ R2, 5664 + +@ ======================================================================= +@ void fndsa_mqpoly_small_to_int(unsigned logn, const int8_t *f, uint16_t *d) +@ ======================================================================= + + .align 2 + .global fndsa_mqpoly_small_to_int + .thumb + .thumb_func + .type fndsa_mqpoly_small_to_int, %function +fndsa_mqpoly_small_to_int: + @ This implementation actually normalizes to [1,q] (strict internal + @ representation). + push { r4, r5, r6, r7 } + @ Set r3 to n = 2^logn + movs r3, #1 + lsls r3, r0 + @ Set both halves of r0 to Q + movw r0, #Q + movt r0, #Q + @ Set both halves of r7 to 1 + mov r7, #0x00010001 +fndsa_mqpoly_small_to_int__L1: + @ Get next four source bytes. + ldr r5, [r1], #4 + @ Expand bytes to 16-bit each; for each byte whose value is negative + @ or zero, we need to add Q. + sxtb16 r4, r5 + sadd16 r6, r4, r0 + ssub16 r12, r4, r7 + sel r4, r4, r6 + sxtb16 r5, r5, ror #8 + sadd16 r6, r5, r0 + ssub16 r12, r5, r7 + sel r5, r5, r6 + @ We need to interleave the values to get them in the right order. + pkhtb r6, r5, r4, asr #16 + pkhbt r5, r4, r5, lsl #16 + @ We can use strd because the caller ensured that the output is + @ aligned. + strd r5, r6, [r2], #8 + subs r3, #4 + bne fndsa_mqpoly_small_to_int__L1 + + pop { r4, r5, r6, r7 } + bx lr + .size fndsa_mqpoly_small_to_int,.-fndsa_mqpoly_small_to_int + +@ ======================================================================= +@ void fndsa_mqpoly_signed_to_int(unsigned logn, uint16_t *d) +@ ======================================================================= + + .align 2 + .global fndsa_mqpoly_signed_to_int + .thumb + .thumb_func + .type fndsa_mqpoly_signed_to_int, %function +fndsa_mqpoly_signed_to_int: + @ This implementation actually normalizes to [1,q] (strict internal + @ representation). 
+ push { r4, r5, r6 } + movs r3, #1 + lsls r3, r0 + @ Set both halves of r0 to Q + movw r0, #Q + movt r0, #Q + @ Set both halves of r2 to 1 + mov r2, #0x00010001 +fndsa_mqpoly_signed_to_int__L1: + @ We can use ldrd because the caller ensured that the input is + @ aligned. + ldrd r4, r5, [r1] + @ For each word half, we want to add q if the value is negative or 0. + sadd16 r6, r4, r0 + ssub16 r12, r4, r2 + sel r4, r4, r6 + sadd16 r6, r5, r0 + ssub16 r12, r5, r2 + sel r5, r5, r6 + strd r4, r5, [r1], #8 + subs r3, #4 + bne fndsa_mqpoly_signed_to_int__L1 + + pop { r4, r5, r6 } + bx lr + .size fndsa_mqpoly_signed_to_int,.-fndsa_mqpoly_signed_to_int + +@ ======================================================================= +@ void fndsa_mqpoly_int_to_ext(unsigned logn, uint16_t *d) +@ ======================================================================= + + .align 2 + .global fndsa_mqpoly_int_to_ext + .thumb + .thumb_func + .type fndsa_mqpoly_int_to_ext, %function +fndsa_mqpoly_int_to_ext: + push.w { r4, r5, r6 } + movs r3, #1 + lsls r3, r0 + @ Set both halves of r0 to Q + movw r0, #Q + movt r0, #Q + @ Set both halves of r2 to 0xFFFF + mov r2, #0xFFFFFFFF + @ Set r6 to zero + movw r6, #0 +fndsa_mqpoly_int_to_ext__L1: + @ We can use ldrd because the caller ensured that the input is + @ aligned. + ldrd r4, r5, [r1] + @ Each word half equal to q must be set to 0; others are untouched. 
+ ssub16 r12, r4, r0 + sel r4, r6, r4 + ssub16 r12, r5, r0 + sel r5, r6, r5 + strd r4, r5, [r1], #8 + subs r3, #4 + bne fndsa_mqpoly_int_to_ext__L1 + + pop { r4, r5, r6 } + bx lr + .size fndsa_mqpoly_int_to_ext,.-fndsa_mqpoly_int_to_ext + +@ ======================================================================= +@ void fndsa_mqpoly_mul_ntt(unsigned logn, uint16_t *a, const uint16_t *b) +@ ======================================================================= + + .align 2 + .global fndsa_mqpoly_mul_ntt + .thumb + .thumb_func + .type fndsa_mqpoly_mul_ntt, %function +fndsa_mqpoly_mul_ntt: + push.w { r4, r5, r6, r7, r8, r10, lr } + movs r3, #1 + lsls r3, r0 + + @ r10 <- q + movw r10, #Q + @ r14 <- -1/q mod 2^32 + movw r14, #(Q1I & 0xFFFF) + movt r14, #(Q1I >> 16) + @ r0 <- 2^64 mod q + movw r0, #R2 + + @ r12 is a temporary + +fndsa_mqpoly_mul_ntt__L1: + @ A sequence of four ldr is faster than two ldrd or two ldm. + ldr r5, [r1] + ldr r6, [r1, #4] + ldr.w r7, [r2], #4 + ldr.w r8, [r2], #4 + + @ First pair of words (r5 and r7) + @ Products over integers. + smulbb r4, r5, r7 + smultt r5, r5, r7 + @ Montgomery reduction. + mul r12, r4, r14 + umaal r14, r4, r12, r10 + mul r12, r5, r14 + umaal r14, r5, r12, r10 + @ Conversion Montgomery -> normal + muls r4, r0 + muls r5, r0 + mul r12, r4, r14 + umaal r14, r4, r12, r10 + mul r12, r5, r14 + umaal r14, r5, r12, r10 + @ Repack the two output values and write word + pkhbt r7, r4, r5, lsl #16 + str.w r7, [r1], #4 + + @ Second pair of words (r6 and r8) + @ Products over integers. + smulbb r4, r6, r8 + smultt r5, r6, r8 + @ Montgomery reduction. + mul r12, r4, r14 + umaal r14, r4, r12, r10 + mul r12, r5, r14 + umaal r14, r5, r12, r10 + @ Conversion Montgomery -> normal + muls r4, r0 + muls r5, r0 + mul r12, r4, r14 + umaal r14, r4, r12, r10 + mul r12, r5, r14 + umaal r14, r5, r12, r10 + @ Repack the two output values and write word + pkhbt r8, r4, r5, lsl #16 + str.w r8, [r1], #4 + + @ Store the four output values. 
+ subs r3, #4 + bne fndsa_mqpoly_mul_ntt__L1 + + pop { r4, r5, r6, r7, r8, r10, pc } + .size fndsa_mqpoly_mul_ntt,.-fndsa_mqpoly_mul_ntt + +@ ======================================================================= +@ void fndsa_mqpoly_sub(unsigned logn, uint16_t *a, const uint16_t *b) +@ ======================================================================= + + .align 2 + .global fndsa_mqpoly_sub + .thumb + .thumb_func + .type fndsa_mqpoly_sub, %function +fndsa_mqpoly_sub: + push.w { r4, r5, r6, r7, lr } + movs r3, #1 + lsls r3, r0 + + @ r0 <- 0 + movw r0, #0 + @ r14 <- q (both halves) + movw r14, #Q + movt r14, #Q + +fndsa_mqpoly_sub__L1: + @ Four ldr are faster than two ldrd or two ldm. + ldr r4, [r1] + ldr r5, [r1, #4] + ldr.w r6, [r2], #4 + ldr.w r7, [r2], #4 + + @ We do the subtraction over the integers, then add q back if + @ the result is negative. + ssub16 r4, r4, r6 + sel r12, r0, r14 + sadd16 r4, r4, r12 + str.w r4, [r1], #4 + ssub16 r5, r5, r7 + sel r12, r0, r14 + sadd16 r5, r5, r12 + str.w r5, [r1], #4 + + subs r3, #4 + bne fndsa_mqpoly_sub__L1 + + pop { r4, r5, r6, r7, pc } + .size fndsa_mqpoly_sub,.-fndsa_mqpoly_sub + +@ ======================================================================= +@ uint32_t fndsa_mqpoly_sqnorm_signed(unsigned logn, const uint16_t *a) +@ ======================================================================= + + .align 2 + .global fndsa_mqpoly_sqnorm_signed + .thumb + .thumb_func + .type fndsa_mqpoly_sqnorm_signed, %function +fndsa_mqpoly_sqnorm_signed: + movs r3, #1 + lsls r3, r0 + + movw r0, #0 +fndsa_mqpoly_sqnorm_signed__L1: + @ We can use ldrd because the caller ensured that the input is + @ aligned. 
+ ldrd r2, r12, [r1], #8 + smlad r0, r2, r2, r0 + smlad r0, r12, r12, r0 + @ The whole operation cannot overflow in unsigned convention, + @ since signed values are at most 2047 (in absolute value) and + @ there are at most 1024 of them, hence a maximum squared norm + @ of 1024*2047*2047 = 4290774016, which fits on 32 bits. + subs r3, #4 + bne fndsa_mqpoly_sqnorm_signed__L1 + + bx lr + .size fndsa_mqpoly_sqnorm_signed,.-fndsa_mqpoly_sqnorm_signed + +@ ======================================================================= +@ uint32_t fndsa_mqpoly_sqnorm_ext(unsigned logn, const uint16_t *a) +@ ======================================================================= + + .align 2 + .global fndsa_mqpoly_sqnorm_ext + .thumb + .thumb_func + .type fndsa_mqpoly_sqnorm_ext, %function +fndsa_mqpoly_sqnorm_ext: + push.w { r4, r5, r6, r7 } + movs r3, #1 + lsls r3, r0 + + @ r5 <- q (in both halves) + movw r5, #Q + movt r5, #Q + @ r6 <- ceil(q/2) (in both halves) + movw r6, #((Q + 1) >> 1) + movt r6, #((Q + 1) >> 1) + + movw r0, #0 + @ We clear the Q flag, which we will use to detect overflows. + msr APSR_nzcvq, r0 +fndsa_mqpoly_sqnorm_ext__L1: + @ We can use ldrd because the caller ensured that the input is + @ aligned. + ldrd r2, r4, [r1], #8 + + @ Normalize values to [-q/2,+q/2] + ssub16 r7, r2, r5 + ssub16 r12, r2, r6 + sel r2, r7, r2 + ssub16 r7, r4, r5 + ssub16 r12, r4, r6 + sel r4, r7, r4 + @ If any addition overflows (signed interpretation), then the Q + @ flag will be set. 
+ smlad r0, r2, r2, r0 + smlad r0, r4, r4, r0 + subs r3, #4 + bne fndsa_mqpoly_sqnorm_ext__L1 + + @ If the Q flag is set, saturate the returned value to 0xFFFFFFFF + mrs r1, APSR + sbfx r1, r1, #27, #1 + orrs r0, r1 + + pop { r4, r5, r6, r7 } + bx lr + .size fndsa_mqpoly_sqnorm_ext,.-fndsa_mqpoly_sqnorm_ext + +@ ======================================================================= +@ void fndsa_mqpoly_int_to_ntt(unsigned logn, uint16_t *d) +@ ======================================================================= + + .align 2 + .global fndsa_mqpoly_int_to_ntt + .thumb + .thumb_func + .type fndsa_mqpoly_int_to_ntt, %function +fndsa_mqpoly_int_to_ntt: + push.w { r4, r5, r6, r7, r8, r10, r11, lr } + + @ ASSUMPTION: logn >= 2 + + @ State: + @ r0 0 + @ r1 &d[j1] + @ r2 t = ht*2 + @ r3 middle loop counter + @ r6 s + @ r7 innermost loop counter + @ r8 &mq_GM[i + m] + @ r10 q + @ r11 q:q + @ r12 scratch + @ r14 -1/q mod 2^32 + @ + @ s2 d + @ s3 m + + vmov s2, r1 @ original &d[0] + movs r2, #1 + lsls r2, r0 @ r2 <- t = ht*2 (initially equal to n) + movw r0, #1 @ m <- 1 + vmov s3, r0 + + @ Constants. + @ r0 <- 0 + movw r0, #0 + @ r10 <- q + movw r10, #Q + @ r11 <- q (both halves) + orr r11, r10, r10, lsl #16 + @ r14 <- -1/q mod 2^32 + movw r14, #(Q1I & 0xFFFF) + movt r14, #(Q1I >> 16) + + @ r8 <- &mq_GM[1] + adr r8, fndsa_mqpoly_int_to_ntt__gmaddr_plus1 + ldr r8, [r8] + + @ If n = 4, then skip directly to the specialized code for the + @ last two iterations. + cmp r2, #4 + beq fndsa_mqpoly_int_to_ntt__L4 + +fndsa_mqpoly_int_to_ntt__L1: + @ Middle loop has m iterations. + vmov r3, s3 + lsl r6, r3, #1 @ prepare m for next iteration + vmov s3, r6 +fndsa_mqpoly_int_to_ntt__L2: + ldrh r6, [r8], #2 @ s <- mq_GM[i + m] + lsr r7, r2, #1 @ r7 <- ht +fndsa_mqpoly_int_to_ntt__L3: + @ Each inner loop iteration processes two pairs (x1,x2) and (y1,y2). 
+ ldr.w r4, [r1, r2] @ r4 <- x2:y2 + + @ r5 <- mmul(y2, s) + smultb r5, r4, r6 + mul r12, r5, r14 + umaal r14, r5, r12, r10 + @ r4 <- mmul(x2, s) + smulbb r4, r4, r6 + mul r12, r4, r14 + umaal r14, r4, r12, r10 + @ r5 <- x2:y2 + pkhbt r5, r4, r5, lsl #16 + + @ r4 <- x1:y1 + ldr.w r4, [r1] @ r4 <- x1:y1 + + @ d[j1] <- x1+x2 : y1+y2 + @ d[j2] <- x1-x2 : y1-y2 + sadd16 r12, r4, r5 + ssub16 r5, r4, r5 + sel r4, r0, r11 + sadd16 r5, r5, r4 + str.w r5, [r1, r2] + ssub16 r4, r12, r11 + sel r4, r4, r12 + str.w r4, [r1], #4 + + @ loop ht/2 times + subs r7, #2 + bne fndsa_mqpoly_int_to_ntt__L3 + + @ --------------------------- + + @ j0 <- j0 + t + @ j0 is implicit in r1, which has been increased for ht elements, + @ hence we add ht here (ht*2, since elements are 2-byte values) + add.w r1, r1, r2 + @ We loop m times + subs r3, #1 + bne fndsa_mqpoly_int_to_ntt__L2 + + @ r1 now contains &d[n], we must reset it to &d[0] for the next + @ iteration. + vmov r1, s2 + + @ replace t with ht + @ Loop until t reaches 2 + lsr r2, r2, #1 + cmp r2, #4 + bne fndsa_mqpoly_int_to_ntt__L1 + +fndsa_mqpoly_int_to_ntt__L4: + @ Last two outer iterations use specialized code. + @ m = n/4 + @ t = 4 + @ We do n/4 inner iterations, each processing four consecutive values. + + @ Loop counter (m = n/4). + vmov r3, s3 + + @ We need two pointers to read s values; we use r8 and r7. + @ At this point, r8 is correct (&mq_GM[m]) and we set r7 to + @ &mq_GM[2*m] by adding 2*m (in bytes) to r8. + add r7, r8, r3, lsl #1 + + @ r2 is free, since we know it contains 4. 
+ +fndsa_mqpoly_int_to_ntt__L5: + @ Next-to-last outer iteration: the four values are, in RAM order: + @ x1 y1 x2 y2 + @ We load x2:y2 (into r5) and s (into r6) + ldr.w r5, [r1, #4] + ldrh r6, [r8], #2 + + @ r4 <- mmul(x2, s) + smulbb r4, r5, r6 + mul r12, r4, r14 + umaal r14, r4, r12, r10 + @ r5 <- mmul(y2, s) + smultb r5, r5, r6 + mul r12, r5, r14 + umaal r14, r5, r12, r10 + @ r5 <- mmul(x2, s) : mmul(y2, s) + pkhbt r5, r4, r5, lsl #16 + + @ Load x1:y1 (into r4) + ldr.w r4, [r1] + + @ r4 <- (x1+mmul(x2,s)):(y1+mmul(y2,s)) + @ r5 <- (x1-mmul(x2,s)):(y1-mmul(y2,s)) + sadd16 r12, r4, r5 + ssub16 r5, r4, r5 + sel r4, r0, r11 + sadd16 r5, r5, r4 + ssub16 r4, r12, r11 + sel r4, r4, r12 + + @ Last iteration: the four values are, in RAM order: x1 x2 y1 y2 + @ The values have not been really written to RAM, though; they + @ are in r4 (x1:x2) and r5 (y1:y2). + @ Get the two relevant s values into r6. + ldr r6, [r7], #4 + + @ r2 <- x1:y1 + pkhbt r2, r4, r5, lsl #16 + @ r5 <- mmul(x2,s):mmul(y2,s) + smultb r4, r4, r6 + mul r12, r4, r14 + umaal r14, r4, r12, r10 + smultt r5, r5, r6 + mul r12, r5, r14 + umaal r14, r5, r12, r10 + pkhbt r5, r4, r5, lsl #16 + @ r4 <- (x1+mmul(x2,s):(y1+mmul(y2,s)) + sadd16 r4, r2, r5 + ssub16 r12, r4, r11 + sel r4, r12, r4 + @ r5 <- (x1-mmul(x2,s):(y1-mmul(y2,s)) + ssub16 r5, r2, r5 + sel r12, r0, r11 + sadd16 r5, r5, r12 + + @ We write the four final values in x1 x2 y1 y2 order. 
+ pkhbt r12, r4, r5, lsl #16 + str.w r12, [r1], #4 + pkhtb r12, r5, r4, asr #16 + str.w r12, [r1], #4 + + subs r3, #1 + bne fndsa_mqpoly_int_to_ntt__L5 + +fndsa_mqpoly_int_to_ntt__Lend: + pop { r4, r5, r6, r7, r8, r10, r11, pc } + .align 2 +fndsa_mqpoly_int_to_ntt__gmaddr_plus1: + .word fndsa_mq_GM + 2 + .size fndsa_mqpoly_int_to_ntt,.-fndsa_mqpoly_int_to_ntt + +@ ======================================================================= +@ void fndsa_mqpoly_ntt_to_int(unsigned logn, uint16_t *d) +@ ======================================================================= + + .align 2 + .global fndsa_mqpoly_ntt_to_int + .thumb + .thumb_func + .type fndsa_mqpoly_ntt_to_int, %function +fndsa_mqpoly_ntt_to_int: + push.w { r4, r5, r6, r7, r8, r10, r11, lr } + + @ ASSUMPTION: logn >= 2 + + @ State: + @ r0 scratch + @ r1 &d[j1] + @ r2 dt = 2*t + @ r3 middle loop counter + @ r4 x1 + @ r5 x2 + @ r6 s + @ r7 innermost loop counter + @ r8 &mq_GM[i + hm] + @ r10 q + @ r11 q:q + @ r12 scratch + @ r14 -1/q mod 2^32 + @ + @ s2 d + @ s3 m + + @ We save the original d in s2. + vmov s2, r1 + @ m = n initially; we save m/2 to s3, and set r8 to &mq_iGM[m/2] + adr r8, fndsa_mqpoly_ntt_to_int__igmaddr + ldr r8, [r8] + movs r3, #1 + subs r0, #1 + lsl r0, r3, r0 @ r0 <- n/2 = 2^(logn-1) + add.w r8, r8, r0 @ r8 <- &mq_iGM[n/4] + lsr r3, r0, #1 + vmov s3, r3 @ s3 <- n/4 + + @ r0 <- 0 + movw r0, #0 + @ r10 <- q + movw r10, #Q + @ r11 <- q:q + orr r11, r10, r10, lsl #16 + @ r14 <- -1/q mod 2^32 + movw r14, #(Q1I & 0xFFFF) + movt r14, #(Q1I >> 16) + + @ r8 is the pointer into mq_GM[] for the second outer iteration. + @ r7 is the pointer into mq_GM[] for the first outer iteration. + add r7, r8, r3, lsl #1 + + @ r3 is the loop counter. r10, r11 and r14 are constants used for + @ modular reduction. r2, r4, r5 and r12 are scratch. + + @ First two iterations are specialized. +fndsa_mqpoly_ntt_to_int__L0: + @ First iteration: values are in x1 x2 y1 y2 order. 
+ ldr r2, [r1] @ r2 <- x1:x2 + ldr r5, [r1, #4] @ r5 <- y1:y2 + ldr.w r6, [r7], #4 @ r6 <- s1:s2 + + @ r4 <- x1:y1 + pkhbt r4, r2, r5, lsl #16 + @ r5 <- x2:y2 + pkhtb r5, r5, r2, asr #16 + @ r2 <- (x1+x2)/2:(y1+y2)/2 + sadd16 r2, r4, r5 + ssub16 r12, r2, r11 + sel r2, r12, r2 + and r12, r2, #0x00010001 + umlal r2, r12, r12, r10 + lsr.w r2, r2, #1 + @ r5 <- (x1-x2):(y1-y2) + ssub16 r5, r4, r5 + sel r12, r0, r11 + sadd16 r5, r5, r12 + @ r4 <- mmul(x1-x2,s) + smulbb r4, r5, r6 + mul r12, r4, r14 + umaal r14, r4, r12, r10 + @ r5 <- mmul(y1-y2,s) + smultt r5, r5, r6 + mul r12, r5, r14 + umaal r14, r5, r12, r10 + + @ Second iteration. Normally we get x1 y1 x2 y2 from RAM; here, + @ we have x1:x2 in r2, y1 in r4 and y2 in r5. + @ Reorganize the values: + pkhbt r4, r2, r4, lsl #16 @ r4 <- x1:y1 + lsl.w r5, r5, #16 + orr r5, r5, r2, lsr #16 @ r5 <- x2:y2 + @ Read s for the second iteration. + ldrh r6, [r8], #2 + + @ r2 <- (x1+x2)/2:(y1+y2)/2 + sadd16 r2, r4, r5 + ssub16 r12, r2, r11 + sel r2, r12, r2 + and r12, r2, #0x00010001 + umlal r2, r12, r12, r10 + lsr.w r2, r2, #1 + @ r5 <- (x1-x2):(y1-y2) + ssub16 r5, r4, r5 + sel r12, r0, r11 + sadd16 r5, r5, r12 + @ r4 <- mmul(x1-x2,s) + smulbb r4, r5, r6 + mul r12, r4, r14 + umaal r14, r4, r12, r10 + @ r5 <- mmul(y1-y2,s) + smultb r5, r5, r6 + mul r12, r5, r14 + umaal r14, r5, r12, r10 + + @ Repack values, to write them in x1 y1 x2 y2 order. + str.w r2, [r1], #4 + pkhbt r4, r4, r5, lsl #16 + str.w r4, [r1], #4 + + @ Loop n/4 times. + subs r3, #1 + bne fndsa_mqpoly_ntt_to_int__L0 + + @ Prepare for remaining iterations. + @ r2 <- -2*t = -8 + movs r2, #8 + rsbs r2, #0 + @ r3 <- m (for next iteration) + vmov r3, s3 + + @ If logn=2 then m=1 and we are finished. + cmp r3, #1 + beq fndsa_mqpoly_ntt_to_int__Lend + +fndsa_mqpoly_ntt_to_int__L1: + @ Rewind r1 to start of array. + vmov r1, s2 + + @ m is in r3. r8 was left at &mq_iGM[2*m]; we need to adjust it + @ to &mq_iGM[m/2], by subtracting 3*m (each element is two bytes). 
+ sub r8, r8, r3, lsl #1 + sub.w r8, r8, r3 + + @ Middle loop has m/2 iterations (r3 is used as counter). +fndsa_mqpoly_ntt_to_int__L2: + ldrh r6, [r8], #2 @ s <- mq_iGM[i + m/2] + + asrs r7, r2, #1 @ r7 <- -t + @ We use r1 to point to the second pair (x2:y2); r2 is negative. + @ The inner loop will inherently adjust r1 to point to the start + @ of the next chunk for the next middle loop iteration. + subs r1, r1, r2 + +fndsa_mqpoly_ntt_to_int__L3: + @ Each inner loop iteration processes two pairs (x1,x2) and (y1,y2). + ldr r4, [r1, r2] @ r4 <- x1:y1 + ldr r5, [r1] @ r5 <- x2:y2 + + @ r4 <- (x1+x2):(y1+y2) + @ r5 <- (x1-x2):(y1-x2) + sadd16 r12, r4, r5 + ssub16 r5, r4, r5 + sel r4, r0, r11 + sadd16 r5, r5, r4 + ssub16 r4, r12, r11 + sel r4, r4, r12 + @ r4 <- (x1+x2)/2:(y1+y2)/2 + and r12, r4, #0x00010001 + umlal r4, r12, r12, r10 + lsr.w r4, r4, #1 + @ Write first output word + str.w r4, [r1, r2] + + @ r5 <- mmul(x1-x2,s):mmul(y1-y2,s) + smulbb r4, r5, r6 + mul r12, r4, r14 + umaal r14, r4, r12, r10 + smultb r5, r5, r6 + mul r12, r5, r14 + umaal r14, r5, r12, r10 + pkhbt r5, r4, r5, lsl #16 + @ Write second output word + str.w r5, [r1], #4 + + @ We should do t iterations, but since we process a pair of elements + @ each time, we only do t/2 iterations. Take care that the r7 counter + @ is negative. + adds r7, #2 + bne fndsa_mqpoly_ntt_to_int__L3 + + @ We loop m/2 times + subs r3, #2 + bne fndsa_mqpoly_ntt_to_int__L2 + + @ Replace -t with -dt = 2*(-t) + lsl.w r2, r2, #1 + + @ Replace m with m/2. We are finished when m becomes 1. 
+ vmov r3, s3 + lsr.w r3, r3, #1 + vmov s3, r3 + cmp r3, #1 + bne fndsa_mqpoly_ntt_to_int__L1 + +fndsa_mqpoly_ntt_to_int__Lend: + pop { r4, r5, r6, r7, r8, r10, r11, pc } + .align 2 +fndsa_mqpoly_ntt_to_int__igmaddr: + .word fndsa_mq_iGM + .size fndsa_mqpoly_ntt_to_int,.-fndsa_mqpoly_ntt_to_int diff --git a/crypto_sign/fndsa_provisional-512/m4f/sha3.c b/crypto_sign/fndsa_provisional-512/m4f/sha3.c new file mode 120000 index 00000000..07f53388 --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/sha3.c @@ -0,0 +1 @@ +../../../mupq/crypto_sign/fndsa_provisional-512/ref/sha3.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-512/m4f/sha3_cm4.s b/crypto_sign/fndsa_provisional-512/m4f/sha3_cm4.s new file mode 100644 index 00000000..fce3fd00 --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/sha3_cm4.s @@ -0,0 +1,1061 @@ + .syntax unified + .cpu cortex-m4 + .file "sha3_cm4.s" + .text + +@ ======================================================================= +@ void fndsa_sha3_inject_chunk(void *dst, const void *src, size_t len) +@ ======================================================================= + + .align 2 + .global fndsa_sha3_inject_chunk + .thumb + .thumb_func + .type fndsa_sha3_inject_chunk, %function +fndsa_sha3_inject_chunk: + push { r4, r5 } + + @ If less than 8 bytes to inject, do it byte-by-byte. + cmp r2, #8 + blo fndsa_sha3_inject_chunk__L4 + + @ Process some bytes until the destination is aligned. + rsbs r5, r0, #0 + ands r5, r5, #3 + beq fndsa_sha3_inject_chunk__L2 + subs r2, r5 +fndsa_sha3_inject_chunk__L1: + ldrb.w r3, [r0] + ldrb r4, [r1], #1 + eor r3, r3, r4 + strb r3, [r0], #1 + subs r5, #1 + bne fndsa_sha3_inject_chunk__L1 + +fndsa_sha3_inject_chunk__L2: + @ Destination is aligned. Source might be unaligned, but the + @ Cortex-M4 tolerates unaligns accesses with a penalty which is + @ lower than doing word reassembly in software. 
+ lsr r5, r2, #2 +fndsa_sha3_inject_chunk__L3: + ldr.w r3, [r0] + ldr r4, [r1], #4 + eor r3, r3, r4 + str r3, [r0], #4 + subs r5, #1 + bne fndsa_sha3_inject_chunk__L3 + + @ We may have a remaining tail of up to 3 bytes. + ands r2, r2, #3 + beq.w fndsa_sha3_inject_chunk__L5 + +fndsa_sha3_inject_chunk__L4: + @ Byte-by-byte processing for the data tail. + ldrb.w r3, [r0] + ldrb r4, [r1], #1 + eor r3, r3, r4 + strb r3, [r0], #1 + subs r2, #1 + bne fndsa_sha3_inject_chunk__L4 + +fndsa_sha3_inject_chunk__L5: + pop { r4, r5 } + bx lr + .size fndsa_sha3_inject_chunk,.-fndsa_sha3_inject_chunk + +@ ======================================================================= +@ bit_split_5(uint64_t x0, uint64_t x1, uint64_t x2, uint64_t x3, uint64_t x4) +@ Split inputs x0 to x4 into even-indexed and odd-indexed bits. +@ Internal function only; non-standard ABI: +@ input: +@ r0:r1 x0 +@ r2:r3 x1 +@ r4:r5 x2 +@ r6:r7 x3 +@ r10:r11 x4 +@ ASPR.GE flags must have pattern 0110. +@ +@ output: +@ r0 even-indexed bits of x0 +@ r1 odd-indexed bits of x0 +@ r2 even-indexed bits of x1 +@ r3 odd-indexed bits of x1 +@ r4 even-indexed bits of x2 +@ r5 odd-indexed bits of x2 +@ r6 even-indexed bits of x3 +@ r7 odd-indexed bits of x3 +@ r10 even-indexed bits of x4 +@ r11 odd-indexed bits of x4 +@ clobbers: +@ r8, r14 +@ +@ bit_split_1, bit_split_2, bit_split_3 and bit_split_4 are alternate +@ entry points that process only the first 1, 2, 3 or 4 words. +@ ======================================================================= + + @ This macro splits a word (input register xx) into its + @ even-indexed bits (into the low half of output register dd) + @ and odd-indexed bits (high half of dd). + @ This macro assumes that the ASPR.GE flags have the 0110 pattern. + @ dd and xx cannot be the same register. xx is consumed. 
+.macro BIT_SPLIT_32 xx, dd + eor \dd, \xx, \xx, lsr #1 + and \dd, \dd, #0x22222222 + eor \xx, \xx, \dd + eor \xx, \xx, \dd, lsl #1 + eor \dd, \xx, \xx, lsr #2 + and \dd, \dd, #0x0C0C0C0C + eor \xx, \xx, \dd + eor \xx, \xx, \dd, lsl #2 + eor \dd, \xx, \xx, lsr #4 + and \dd, \dd, #0x00F000F0 + eor \xx, \xx, \dd + eor \xx, \xx, \dd, lsl #4 + rev \dd, \xx + sel \dd, \dd, \xx +.endm + + @ Split a 64-bit value x0:x1 into its even-indexed bits (into x0) + @ and high-indexed bits (into x1). xt is a scratch register. + @ This macro assumes that the ASPR.GE flags have the 0110 pattern. +.macro BIT_SPLIT_64 x0, x1, xt + BIT_SPLIT_32 \x0, \xt + BIT_SPLIT_32 \x1, \x0 + pkhtb \x1, \x0, \xt, asr #16 + pkhbt \x0, \xt, \x0, lsl #16 +.endm + + .align 2 + .thumb + .thumb_func + .type bit_split_5, %function +bit_split_5: + BIT_SPLIT_64 r10, r11, r8 +bit_split_4: + BIT_SPLIT_64 r6, r7, r8 +bit_split_3: + BIT_SPLIT_64 r4, r5, r8 +bit_split_2: + BIT_SPLIT_64 r2, r3, r8 +bit_split_1: + BIT_SPLIT_64 r0, r1, r8 + bx lr + .size bit_split_5, .-bit_split_5 + +@ ======================================================================= +@ bit_merge_5(uint64_t x0, uint64_t x1, uint64_t x2, uint64_t x3, uint64_t x4) +@ Merge inputs x0 to x4 with bit interleaving. For i = 0 to 4, the +@ low word of x_i contains the even-indexed bits, and the high word +@ contains the odd-indexed bits. +@ Internal function only; non-standard ABI: +@ input: +@ r0:r1 x0 +@ r2:r3 x1 +@ r4:r5 x2 +@ r6:r7 x3 +@ r10:r11 x4 +@ ASPR.GE flags must have pattern 0110. +@ +@ output: +@ r0:r1 merged x0 +@ r2:r3 merged x1 +@ r4:r5 merged x2 +@ r6:r7 merged x3 +@ r10:r11 merged x4 +@ clobbers: +@ r8, r14 +@ +@ bit_merge_1, bit_merge_2, bit_merge_3 and bit_merge_4 are alternate +@ entry points that process only the first 1, 2, 3 or 4 words. 
+@ ======================================================================= + + @ This macro merges a word (input register xx): low half yields + @ the even-indexed bits, and hight half provides the odd-indexed + @ bits. Output is written into register dd. + @ This macro assumes that the ASPR.GE flags have the 0110 pattern. + @ dd and xx cannot be the same register. xx is consumed. +.macro BIT_MERGE_32 xx, dd + rev \dd, \xx + sel \xx, \dd, \xx + eor \dd, \xx, \xx, lsr #4 + and \dd, \dd, #0x00F000F0 + eor \xx, \xx, \dd + eor \xx, \xx, \dd, lsl #4 + eor \dd, \xx, \xx, lsr #2 + and \dd, \dd, #0x0C0C0C0C + eor \xx, \xx, \dd + eor \xx, \xx, \dd, lsl #2 + eor \dd, \xx, \xx, lsr #1 + and \dd, \dd, #0x22222222 + eor \xx, \xx, \dd + eor \dd, \xx, \dd, lsl #1 +.endm + + @ BIT_MERGE_64 interleaves the bits from x0 and from x1, result + @ is written back to x0:x1. xt is a scratch register. + @ This macro assumes that the ASPR.GE flags have the 0110 pattern. +.macro BIT_MERGE_64 x0, x1, xt + pkhtb \xt, \x1, \x0, asr #16 + pkhbt \x1, \x0, \x1, lsl #16 + BIT_MERGE_32 \x1, \x0 + BIT_MERGE_32 \xt, \x1 +.endm + + .align 2 + .thumb + .thumb_func + .type bit_merge_5, %function +bit_merge_5: + BIT_MERGE_64 r10, r11, r8 +bit_merge_4: + BIT_MERGE_64 r6, r7, r8 +bit_merge_3: + BIT_MERGE_64 r4, r5, r8 +bit_merge_2: + BIT_MERGE_64 r2, r3, r8 +bit_merge_1: + BIT_MERGE_64 r0, r1, r8 + bx lr + .size bit_merge_5, .-bit_merge_5 + +@ ======================================================================= +@ void fndsa_sha3_process_block(uint64_t *A, unsigned r) +@ ======================================================================= + + .align 2 + .global fndsa_sha3_process_block + .thumb + .thumb_func + .type fndsa_sha3_process_block, %function +fndsa_sha3_process_block: + push.w { r4, r5, r6, r7, r8, r10, r11, lr } + vpush.64 { d8, d9, d10, d11, d12, d13, d14, d15 } + + @ Source state is read from the provided buffer. 
The first r + @ words (with "word" being a 64-bit state element) are split + @ into even-indexed and odd-indexed bits. The state is loaded + @ into FP registers and a stack buffer: + @ d0 to d15 receive A[0] to A[15] + @ sp[] receives A[i] at offset 8*(i-16) for i >= 16 + + @ TODO: with the split, most of each round is really two separate + @ sequences, each working on 25 32-bit values; they communicate + @ with each other only through the rotation of the XOR of the + @ lanes (step 2 of theta, section 3.2.1 in FIPS 202) and when + @ further lane rotations use an odd count (rho function, section + @ 3.2.2). We might be able to leverage that to improve locality, + @ i.e. keep more values in integer registers and reduce traffic + @ with storage (FP registers and stack). The state layout in FP + @ registers and the stack would have to change so that accesses + @ to FP can still be done with the double-width vmov most of + @ the time. + + @ Stack: + @ off size + @ 0 72 state words 16 to 24 + @ 72 4 pointer to state array + @ 76 4 number of data words + @ 80 8 temporary for one state word + sub sp, #88 + str r0, [sp, #72] @ Save state pointer + str r1, [sp, #76] @ Save rate (in 64-bit words) + mov.n r14, r1 + + @ Read word A[idx] into the specified register pair. 
+.macro A_LD x0, x1, idx + .if ((\idx) == 0) + vmov \x0, \x1, s0, s1 + .elseif ((\idx) == 1) + vmov \x0, \x1, s2, s3 + .elseif ((\idx) == 2) + vmov \x0, \x1, s4, s5 + .elseif ((\idx) == 3) + vmov \x0, \x1, s6, s7 + .elseif ((\idx) == 4) + vmov \x0, \x1, s8, s9 + .elseif ((\idx) == 5) + vmov \x0, \x1, s10, s11 + .elseif ((\idx) == 6) + vmov \x0, \x1, s12, s13 + .elseif ((\idx) == 7) + vmov \x0, \x1, s14, s15 + .elseif ((\idx) == 8) + vmov \x0, \x1, s16, s17 + .elseif ((\idx) == 9) + vmov \x0, \x1, s18, s19 + .elseif ((\idx) == 10) + vmov \x0, \x1, s20, s21 + .elseif ((\idx) == 11) + vmov \x0, \x1, s22, s23 + .elseif ((\idx) == 12) + vmov \x0, \x1, s24, s25 + .elseif ((\idx) == 13) + vmov \x0, \x1, s26, s27 + .elseif ((\idx) == 14) + vmov \x0, \x1, s28, s29 + .elseif ((\idx) == 15) + vmov \x0, \x1, s30, s31 + .else + ldrd \x0, \x1, [sp, #(8 * ((\idx) - 16))] + .endif +.endm + + @ Like A_LD, except that it uses two ldr opcodes instead of one + @ ldrd for the words which are on the stack. This allows that + @ load to pipeline with a previous load. + @ WARNING: the two destination registers shall be both low + @ (r0 to r7) or both high (r8 to r14), otherwise misalignment + @ may occur. When the two registers are high, the footprint is + @ 8 bytes, while A_LD would use 4 bytes. +.macro A_LDX x0, x1, idx + .if (\idx) <= 15 + A_LD \x0, \x1, \idx + .else + ldr \x0, [sp, #(8 * ((\idx) - 16))] + ldr \x1, [sp, #(8 * ((\idx) - 16) + 4)] + .endif +.endm + + @ Write into word A[idx] from the specified register pair. + @ WARNING: the two destination registers shall be both low + @ (r0 to r7) or both high (r8 to r14), otherwise misalignment + @ may occur. 
+.macro A_ST x0, x1, idx + .if ((\idx) == 0) + vmov s0, s1, \x0, \x1 + .elseif ((\idx) == 1) + vmov s2, s3, \x0, \x1 + .elseif ((\idx) == 2) + vmov s4, s5, \x0, \x1 + .elseif ((\idx) == 3) + vmov s6, s7, \x0, \x1 + .elseif ((\idx) == 4) + vmov s8, s9, \x0, \x1 + .elseif ((\idx) == 5) + vmov s10, s11, \x0, \x1 + .elseif ((\idx) == 6) + vmov s12, s13, \x0, \x1 + .elseif ((\idx) == 7) + vmov s14, s15, \x0, \x1 + .elseif ((\idx) == 8) + vmov s16, s17, \x0, \x1 + .elseif ((\idx) == 9) + vmov s18, s19, \x0, \x1 + .elseif ((\idx) == 10) + vmov s20, s21, \x0, \x1 + .elseif ((\idx) == 11) + vmov s22, s23, \x0, \x1 + .elseif ((\idx) == 12) + vmov s24, s25, \x0, \x1 + .elseif ((\idx) == 13) + vmov s26, s27, \x0, \x1 + .elseif ((\idx) == 14) + vmov s28, s29, \x0, \x1 + .elseif ((\idx) == 15) + vmov s30, s31, \x0, \x1 + .else + @ Two str opcodes will pair and run in 2 cycles (as long as there + @ is no stall from another memory access immediately before or + @ after); strd would be shorter (one instruction) but use 3 cycles. + str \x0, [sp, #(8 * ((\idx) - 16))] + str \x1, [sp, #(8 * ((\idx) - 16) + 4)] + .endif +.endm + + @ Rotate-right registers x0 and x1 by e0 and e1 bits, respectively; + @ rotation counts must be in [0,31]. Rotation is skipped when the + @ rotation count is zero. +.macro ROR_WORD x0, x1, e0, e1 + .if (\e0) != 0 + ror \x0, \x0, #(\e0) + .endif + .if (\e1) != 0 + ror \x1, \x1, #(\e1) + .endif +.endm + + @ XOR right-rotated registers xa into registers xd. Rotation count + @ must be in [0,31]. +.macro XOR_ROR_WORD xd0, xd1, xa0, xa1, e0, e1 + .if (\e0) == 0 + eor \xd0, \xd0, \xa0 + .else + eor \xd0, \xd0, \xa0, ror #(\e0) + .endif + .if (\e1) == 0 + eor \xd1, \xd1, \xa1 + .else + eor \xd1, \xd1, \xa1, ror #(\e1) + .endif +.endm + + @ Prepare the ASPR.GE flags with pattern 0110. + @ All operations in the complete routine preserve these flags. This + @ flag pattern is used in bit_split_5 and bit_merge_5. 
+ movw r7, #0xFF00 + movt r7, #0x00FF + uadd8 r7, r7, r7 + + @ For the initial input, we must split even/odd bits from data word + @ (the non-data words are assumed to be already split), and + @ pre-rotate all words in the way the loop expects. + @ + @ Possible rate values (in 64-bit words): + @ SHAKE128 21 + @ SHA3-224 18 + @ SHAKE256, SHA3-256 17 + @ SHA3-384 13 + @ SHA3-512 9 + @ + @ We fast-path the case r = 17, which corresponds to SHAKE256 and + @ SHA3-256. + + @ A[0] to A[5] + mov.w r12, r0 @ move state pointer to r12 + ldm r12!, { r0, r1, r2, r3, r4, r5, r6, r7, r10, r11 } + bl bit_split_5 + ROR_WORD r0, r1, 0, 0 + ROR_WORD r2, r3, 22, 22 + ROR_WORD r4, r5, 22, 21 + ROR_WORD r6, r7, 11, 10 + ROR_WORD r10, r11, 7, 7 + A_ST r0, r1, 0 + A_ST r2, r3, 1 + A_ST r4, r5, 2 + A_ST r6, r7, 3 + A_ST r10, r11, 4 + @ A[5] to A[10] + ldm r12!, { r0, r1, r2, r3, r4, r5, r6, r7, r10, r11 } + @ If r = 9 then we split only four words; otherwise, r >= 13 + @ and we split 5 words. + cmp r14, #9 + bhi.w fndsa_sha3_process_block__L1 + bl bit_split_4 + b.w fndsa_sha3_process_block__L2 +fndsa_sha3_process_block__L1: + bl bit_split_5 +fndsa_sha3_process_block__L2: + ROR_WORD r0, r1, 14, 14 + ROR_WORD r2, r3, 10, 10 + ROR_WORD r4, r5, 2, 1 + ROR_WORD r6, r7, 23, 22 + ROR_WORD r10, r11, 31, 30 + A_ST r0, r1, 5 + A_ST r2, r3, 6 + A_ST r4, r5, 7 + A_ST r6, r7, 8 + A_ST r10, r11, 9 + @ A[10] to A[14] + ldm r12!, { r0, r1, r2, r3, r4, r5, r6, r7, r10, r11 } + @ If r = 9 then we do not split any word. + @ If r = 13 then we split three words. + @ Otherwise, r >= 17 and we split 5 words. 
+ cmp r14, #13 + bhi.w fndsa_sha3_process_block__L3 + cmp r14, #10 + bls.w fndsa_sha3_process_block__L4 + bl bit_split_3 + b.w fndsa_sha3_process_block__L4 +fndsa_sha3_process_block__L3: + bl bit_split_5 +fndsa_sha3_process_block__L4: + ROR_WORD r0, r1, 1, 0 + ROR_WORD r2, r3, 3, 3 + ROR_WORD r4, r5, 13, 12 + ROR_WORD r6, r7, 4, 4 + ROR_WORD r10, r11, 9, 9 + A_ST r0, r1, 10 + A_ST r2, r3, 11 + A_ST r4, r5, 12 + A_ST r6, r7, 13 + A_ST r10, r11, 14 + @ A[15] to A[19] + ldm r12!, { r0, r1, r2, r3, r4, r5, r6, r7, r10, r11 } + @ If r = 9 or 13, then we do not split any word. + @ If r = 17 then we split two words. + @ If r = 18 then we split three words. + @ Otherwise, r = 21 and we split 5 words. + cmp r14, #17 + beq.w fndsa_sha3_process_block__L6 + cmp r14, #18 + beq.w fndsa_sha3_process_block__L5 + cmp r14, #15 + bls.w fndsa_sha3_process_block__L7 + bl bit_split_5 + b.w fndsa_sha3_process_block__L7 +fndsa_sha3_process_block__L5: + bl bit_split_3 + b.w fndsa_sha3_process_block__L7 +fndsa_sha3_process_block__L6: + bl bit_split_2 +fndsa_sha3_process_block__L7: + ROR_WORD r0, r1, 14, 13 + ROR_WORD r2, r3, 18, 18 + ROR_WORD r4, r5, 5, 5 + ROR_WORD r6, r7, 8, 7 + ROR_WORD r10, r11, 28, 28 + A_ST r0, r1, 15 + A_ST r2, r3, 16 + A_ST r4, r5, 17 + A_ST r6, r7, 18 + A_ST r10, r11, 19 + @ A[20] to A[24] + ldm r12!, { r0, r1, r2, r3, r4, r5, r6, r7, r10, r11 } + @ If r = 9, 13, 17 or 18, then no split happens here. + @ Otherwise, r = 21 and we split word A[20]. + cmp r14, #20 + bls.w fndsa_sha3_process_block__L8 + bl bit_split_1 +fndsa_sha3_process_block__L8: + @ We have the split but not rotated words in the registers; we + @ want to keep them that way, hence we have to copy through r8:r12 + @ for the pre-rotation. 
+ mov r8, r0 + mov r12, r1 + ROR_WORD r8, r12, 31, 31 + A_ST r8, r12, 20 + mov r8, r2 + mov r12, r3 + ROR_WORD r8, r12, 28, 27 + A_ST r8, r12, 21 + mov r8, r4 + mov r12, r5 + ROR_WORD r8, r12, 20, 19 + A_ST r8, r12, 22 + mov r8, r6 + mov r12, r7 + ROR_WORD r8, r12, 21, 20 + A_ST r8, r12, 23 + mov r8, r10 + mov r12, r11 + ROR_WORD r8, r12, 1, 1 + A_ST r8, r12, 24 + + @ Here begins the preamble for the first iteration (XORing the + @ words into t0..t4). Afterwards, that operation is done at the + @ end of each iteration (in preparation for the next one) so + @ this sequence is done only once. + + @ xor(A[5*i+0]) -> r0:r1 + @ xor(A[5*i+1]) -> r2:r3 + @ xor(A[5*i+2]) -> r4:r5 + @ xor(A[5*i+3]) -> r6:r7 + @ xor(A[5*i+4]) -> r10:r11 + + @ Previous code left A[20..24] into the registers, we do not have + @ to read them again. + @add r12, sp, #32 + @ldm r12, { r0, r1, r2, r3, r4, r5, r6, r7, r10, r11 } + A_LD r8, r12, 0 + XOR_ROR_WORD r0, r1, r8, r12, 0, 0 + A_LD r8, r12, 5 + XOR_ROR_WORD r0, r1, r8, r12, 18, 18 + A_LD r8, r12, 10 + XOR_ROR_WORD r0, r1, r8, r12, 31, 0 + A_LD r8, r12, 15 + XOR_ROR_WORD r0, r1, r8, r12, 18, 19 + + A_LD r8, r12, 1 + XOR_ROR_WORD r2, r3, r8, r12, 10, 10 + A_LD r8, r12, 6 + XOR_ROR_WORD r2, r3, r8, r12, 22, 22 + A_LD r8, r12, 11 + XOR_ROR_WORD r2, r3, r8, r12, 29, 29 + A_LD r8, r12, 16 + XOR_ROR_WORD r2, r3, r8, r12, 14, 14 + + A_LD r8, r12, 2 + XOR_ROR_WORD r4, r5, r8, r12, 10, 11 + A_LD r8, r12, 7 + XOR_ROR_WORD r4, r5, r8, r12, 30, 31 + A_LD r8, r12, 12 + XOR_ROR_WORD r4, r5, r8, r12, 19, 20 + A_LD r8, r12, 17 + XOR_ROR_WORD r4, r5, r8, r12, 27, 27 + + A_LD r8, r12, 3 + XOR_ROR_WORD r6, r7, r8, r12, 21, 22 + A_LD r8, r12, 8 + XOR_ROR_WORD r6, r7, r8, r12, 9, 10 + A_LD r8, r12, 13 + XOR_ROR_WORD r6, r7, r8, r12, 28, 28 + A_LD r8, r12, 18 + XOR_ROR_WORD r6, r7, r8, r12, 24, 25 + + A_LD r8, r12, 4 + XOR_ROR_WORD r10, r11, r8, r12, 25, 25 + A_LD r8, r12, 9 + XOR_ROR_WORD r10, r11, r8, r12, 1, 2 + A_LD r8, r12, 14 + XOR_ROR_WORD r10, r11, r8, 
r12, 23, 23 + A_LD r8, r12, 19 + XOR_ROR_WORD r10, r11, r8, r12, 4, 4 + + @ We will perform 24 rounds. Each loop iteration performs one round. + @ We keep minus eight times the current round counter in r14 (i.e. a + @ multiple of 8, from -192 to -8). + mvn r14, #0xBF + +fndsa_sha3_process_block__loop_step2: + @ The A[] words have delayed rotations from the previous round: + @ A[ 0] er0: 0 er1: 0 + @ A[ 1] er0: 10 er1: 10 + @ A[ 2] er0: 10 er1: 11 + @ A[ 3] er0: 21 er1: 22 + @ A[ 4] er0: 25 er1: 25 + @ A[ 5] er0: 18 er1: 18 + @ A[ 6] er0: 22 er1: 22 + @ A[ 7] er0: 30 er1: 31 + @ A[ 8] er0: 9 er1: 10 + @ A[ 9] er0: 1 er1: 2 + @ A[10] er0: 31 er1: 0 + @ A[11] er0: 29 er1: 29 + @ A[12] er0: 19 er1: 20 + @ A[13] er0: 28 er1: 28 + @ A[14] er0: 23 er1: 23 + @ A[15] er0: 18 er1: 19 + @ A[16] er0: 14 er1: 14 + @ A[17] er0: 27 er1: 27 + @ A[18] er0: 24 er1: 25 + @ A[19] er0: 4 er1: 4 + @ A[20] er0: 1 er1: 1 + @ A[21] er0: 4 er1: 5 + @ A[22] er0: 12 er1: 13 + @ A[23] er0: 11 er1: 12 + @ A[24] er0: 31 er1: 31 + + @ t0 = xor(A[5*i+4]) ^ rotl1(xor(A[5*i+1])) -> r8:r12 + @ t1 = xor(A[5*i+0]) ^ rotl1(xor(A[5*i+2])) -> r0:r1 + @ t2 = xor(A[5*i+1]) ^ rotl1(xor(A[5*i+3])) -> r2:r3 + @ t3 = xor(A[5*i+2]) ^ rotl1(xor(A[5*i+4])) -> r4:r5 + @ t4 = xor(A[5*i+3]) ^ rotl1(xor(A[5*i+0])) -> r6:r7 + + eor r12, r11, r2 + eor r8, r10, r3, ror #31 + eor r3, r3, r6 + eor r2, r2, r7, ror #31 + eor r7, r7, r0 + eor r6, r6, r1, ror #31 + eor r1, r1, r4 + eor r0, r0, r5, ror #31 + eor r5, r5, r10 + eor r4, r4, r11, ror #31 + + @ XOR each t_i into A[5*j+i] (for j = 0 to 4). + @ t0:t1 value t_i (register pair) + @ idx index of A[] word + @ e0, e1 delayed rotations for the A[] word + @ swap non-zero for a register swap + @ The delayed rotations (from the previous round) are absorbed here. + @ New delayed rotations are created here; only the register swap + @ is performed (if the new rotation count, over 64 bits, is odd). 
+ @ Clobbers: r10, r11 + +.macro XOR_T t0, t1, idx, e0, e1, swap + A_LD r10, r11, \idx + .if (\e0) == 0 + eor r10, \t0, r10 + .else + eor r10, \t0, r10, ror #(\e0) + .endif + .if (\e1) == 0 + eor r11, \t1, r11 + .else + eor r11, \t1, r11, ror #(\e1) + .endif + .if (\swap) != 0 + A_ST r11, r10, \idx + .else + A_ST r10, r11, \idx + .endif +.endm + + @ We process all words except 0, 6, 12, 18 and 24, which come last + @ with a special sequence. + @ We also interleave "high" (16+) and "low" (0 to 15) words, so as + @ to avoid str + ldrd sequences which create memory stalls. + + XOR_T r0, r1, 1, 10, 10, 1 + XOR_T r0, r1, 16, 14, 14, 1 + XOR_T r2, r3, 2, 10, 11, 0 + XOR_T r2, r3, 17, 27, 27, 1 + XOR_T r4, r5, 3, 21, 22, 0 + XOR_T r6, r7, 19, 4, 4, 0 + XOR_T r6, r7, 4, 25, 25, 1 + XOR_T r8, r12, 20, 1, 1, 0 + XOR_T r8, r12, 5, 18, 18, 0 + XOR_T r0, r1, 21, 4, 5, 0 + XOR_T r2, r3, 7, 30, 31, 0 + XOR_T r2, r3, 22, 12, 13, 1 + XOR_T r4, r5, 8, 9, 10, 1 + XOR_T r4, r5, 23, 11, 12, 0 + XOR_T r6, r7, 9, 1, 2, 0 + XOR_T r8, r12, 10, 31, 0, 1 + XOR_T r0, r1, 11, 29, 29, 0 + XOR_T r4, r5, 13, 28, 28, 1 + XOR_T r6, r7, 14, 23, 23, 1 + XOR_T r8, r12, 15, 18, 19, 1 + + @ For words 0, 6, 12, 18 and 24, we omit writing back to storage + @ because we'll need them right away in the first KHI_STEP. 
+ A_LD r10, r11, 24 + eor r10, r6, r10, ror #31 + eor r11, r7, r11, ror #31 + A_LD r7, r6, 18 + eor r7, r4, r7, ror #24 + eor r6, r5, r6, ror #25 + A_LD r5, r4, 12 + eor r5, r2, r5, ror #19 + eor r4, r3, r4, ror #20 + A_LD r2, r3, 6 + eor r2, r0, r2, ror #22 + eor r3, r1, r3, ror #22 + A_LD r0, r1, 0 + eor r0, r8, r0 + eor r1, r12, r1 + + @ Delayed right-rotations on low (even) and high (odd) words: + @ A[ 0] dr0: 0 dr1: 0 + @ A[ 1] dr0: 31 dr1: 0 + @ A[ 2] dr0: 1 dr1: 1 + @ A[ 3] dr0: 18 dr1: 18 + @ A[ 4] dr0: 18 dr1: 19 + @ A[ 5] dr0: 14 dr1: 14 + @ A[ 6] dr0: 10 dr1: 10 + @ A[ 7] dr0: 29 dr1: 29 + @ A[ 8] dr0: 4 dr1: 5 + @ A[ 9] dr0: 22 dr1: 22 + @ A[10] dr0: 30 dr1: 31 + @ A[11] dr0: 27 dr1: 27 + @ A[12] dr0: 10 dr1: 11 + @ A[13] dr0: 19 dr1: 20 + @ A[14] dr0: 12 dr1: 13 + @ A[15] dr0: 11 dr1: 12 + @ A[16] dr0: 9 dr1: 10 + @ A[17] dr0: 24 dr1: 25 + @ A[18] dr0: 21 dr1: 22 + @ A[19] dr0: 28 dr1: 28 + @ A[20] dr0: 23 dr1: 23 + @ A[21] dr0: 31 dr1: 31 + @ A[22] dr0: 1 dr1: 2 + @ A[23] dr0: 4 dr1: 4 + @ A[24] dr0: 25 dr1: 25 + +@ Apply operation 'op' (Boolean bitwise opcode) on values xa0:xa1 +@ and xb0:xb1 (register pairs), then XOR with xd0:xd1 and write +@ result at index j. Each register comes with a "delayed rotation" count +@ which is applied here. 
+.macro KHI_OP xa0, da0, xa1, da1, xb0, db0, xb1, db1, xd0, dd0, xd1, dd1, j
+ @ 'op' on xa and xb
+ .if (\da0) == (\db0)
+ bic r8, \xb0, \xa0
+ .else
+ bic r8, \xb0, \xa0, ror #((32 + (\da0) - (\db0)) & 31)
+ .endif
+ .if (\da1) == (\db1)
+ bic r12, \xb1, \xa1
+ .else
+ bic r12, \xb1, \xa1, ror #((32 + (\da1) - (\db1)) & 31)
+ .endif
+ @ XOR with xd, result back in r8:r12 (xd itself is unmodified)
+ @ r8 and r12 have delayed rotations by da0 and da1, respectively
+ .if (\db0) == (\dd0)
+ eor r8, \xd0, r8
+ .else
+ eor r8, \xd0, r8, ror #((32 + (\db0) - (\dd0)) & 31)
+ .endif
+ .if (\db1) == (\dd1)
+ eor r12, \xd1, r12
+ .else
+ eor r12, \xd1, r12, ror #((32 + (\db1) - (\dd1)) & 31)
+ .endif
+ @ Store back XOR result
+ A_ST r8, r12, \j
+.endm
+
+@ Apply Khi on five words. Word indexes are i0 to i4. Each word comes with
+@ its two "delayed rotation" counts (e.g. e00 and e01 for word i0: the
+@ delayed rotations of its low/even and high/odd halves, respectively),
+@ which are taken into account when the words are combined.
+.macro KHI_STEP i0, e00, e01, i1, e10, e11, i2, e20, e21, i3, e30, e31, i4, e40, e41
+ @ Load all five state words.
+ A_LDX r0, r1, \i0
+ A_LDX r2, r3, \i1
+ A_LDX r4, r5, \i2
+ A_LDX r6, r7, \i3
+ A_LDX r10, r11, \i4
+ @ Apply operations.
+ KHI_OP r2, \e10, r3, \e11, r4, \e20, r5, \e21, r0, \e00, r1, \e01, \i0
+ KHI_OP r4, \e20, r5, \e21, r6, \e30, r7, \e31, r2, \e10, r3, \e11, \i1
+ KHI_OP r6, \e30, r7, \e31, r10, \e40, r11, \e41, r4, \e20, r5, \e21, \i2
+ KHI_OP r10, \e40, r11, \e41, r0, \e00, r1, \e01, r6, \e30, r7, \e31, \i3
+ KHI_OP r0, \e00, r1, \e01, r2, \e10, r3, \e11, r10, \e40, r11, \e41, \i4
+.endm
+
+@ Special case for first KHI_STEP:
+@ Words are received already loaded in registers (permuted).
+.macro KHI_STEP_1 i0, e00, e01, i1, e10, e11, i2, e20, e21, i3, e30, e31, i4, e40, e41, op0, op1, op2, op3, op4 + KHI_OP r2, \e10, r3, \e11, r4, \e20, r5, \e21, r0, \e00, r1, \e01, \i0 + KHI_OP r4, \e20, r5, \e21, r6, \e30, r7, \e31, r2, \e10, r3, \e11, \i1 + KHI_OP r6, \e30, r7, \e31, r10, \e40, r11, \e41, r4, \e20, r5, \e21, \i2 + KHI_OP r10, \e40, r11, \e41, r0, \e00, r1, \e01, r6, \e30, r7, \e31, \i3 + KHI_OP r0, \e00, r1, \e01, r2, \e10, r3, \e11, r10, \e40, r11, \e41, \i4 +.endm + + @ 0, 6, 12, 18, 24 + KHI_STEP_1 0, 0, 0, 6, 10, 10, 12, 10, 11, 18, 21, 22, 24, 25, 25 + + @ 3, 9, 10, 16, 22 + KHI_STEP 3, 18, 18, 9, 22, 22, 10, 30, 31, 16, 9, 10, 22, 1, 2 + + @ 1, 7, 13, 19, 20 + KHI_STEP 1, 31, 0, 7, 29, 29, 13, 19, 20, 19, 28, 28, 20, 23, 23 + + @ 4, 5, 11, 17, 23 + KHI_STEP 4, 18, 19, 5, 14, 14, 11, 27, 27, 17, 24, 25, 23, 4, 4 + + @ 2, 8, 14, 15, 21 + KHI_STEP 2, 1, 1, 8, 4, 5, 14, 12, 13, 15, 11, 12, 21, 31, 31 + + @ XOR next round constant into A[0] + adr.w r5, process_block_RC__end + add.w r5, r14 + ldrd r2, r3, [r5] + A_LD r0, r1, 0 + eors r0, r2 + eors r1, r3 + @ Increment counter for next iteration. Since the counter starts at + @ -192, it reaches 0 when 24 rounds have been completed. + adds r14, #8 + beq.w fndsa_sha3_process_block__final + + @ We store back the modified A[0] only if looping (exit sequence + @ uses the r0:r1 registers directly). + A_ST r0, r1, 0 + + @ Permute the state words for next round. + @ 6 -> 1 + @ 1 -> 10 + @ 10 -> 7 + @ 7 -> 11 + @ 11 -> 17 + @ 17 -> 18 + @ 18 -> 3 + @ 3 -> 5 + @ 5 -> 16 + @ 16 -> 8 + @ 8 -> 21 + @ 21 -> 24 + @ 24 -> 4 + @ 4 -> 15 + @ 15 -> 23 + @ 23 -> 19 + @ 19 -> 13 + @ 13 -> 12 + @ 12 -> 2 + @ 2 -> 20 + @ 20 -> 14 + @ 14 -> 22 + @ 22 -> 9 + @ 9 -> 6 + @ Word 0 is not permuted. + @ We compute the XOR of the permuted words, as would normally + @ be done at the start of the next iteration. For that computation, + @ we need to take the delayed rotations into account. 
+
+ @ Load word i, store in j, and also rotate the in-register copy
+ @ to absorb the specified delayed rotations.
+.macro A_LD_ST i, j, e0, e1
+ .if (\j) % 5 == 0
+ A_LD r0, r1, \i
+ A_ST r0, r1, \j
+ ROR_WORD r0, r1, \e0, \e1
+ .elseif (\j) % 5 == 1
+ A_LD r2, r3, \i
+ A_ST r2, r3, \j
+ ROR_WORD r2, r3, \e0, \e1
+ .elseif (\j) % 5 == 2
+ A_LD r4, r5, \i
+ A_ST r4, r5, \j
+ ROR_WORD r4, r5, \e0, \e1
+ .elseif (\j) % 5 == 3
+ A_LD r6, r7, \i
+ A_ST r6, r7, \j
+ ROR_WORD r6, r7, \e0, \e1
+ .else
+ A_LD r10, r11, \i
+ A_ST r10, r11, \j
+ ROR_WORD r10, r11, \e0, \e1
+ .endif
+.endm
+
+ @ Load word i, store in j, and also XOR that word into the
+ @ appropriate registers (based on j mod 5), applying the specified
+ @ delayed rotations.
+.macro A_LD_XOR_ST i, j, e0, e1
+ A_LD r8, r12, \i
+ .if (\j) % 5 == 0
+ XOR_ROR_WORD r0, r1, r8, r12, \e0, \e1
+ .elseif (\j) % 5 == 1
+ XOR_ROR_WORD r2, r3, r8, r12, \e0, \e1
+ .elseif (\j) % 5 == 2
+ XOR_ROR_WORD r4, r5, r8, r12, \e0, \e1
+ .elseif (\j) % 5 == 3
+ XOR_ROR_WORD r6, r7, r8, r12, \e0, \e1
+ .else
+ XOR_ROR_WORD r10, r11, r8, r12, \e0, \e1
+ .endif
+ A_ST r8, r12, \j
+.endm
+
+ @ r0:r1 still contains A[0], for whom the delayed rotations are zero.
+ A_LD r2, r3, 6
+ str r2, [sp, #80]
+ str r3, [sp, #84]
+ ror r2, r2, #10
+ ror r3, r3, #10
+ A_LD_XOR_ST 9, 6, 22, 22
+ A_LD_ST 22, 9, 1, 2
+ A_LD_ST 14, 22, 12, 13
+ A_LD_XOR_ST 20, 14, 23, 23
+ A_LD_XOR_ST 2, 20, 1, 1
+ A_LD_XOR_ST 12, 2, 10, 11
+ A_LD_XOR_ST 13, 12, 19, 20
+ A_LD_ST 19, 13, 28, 28
+ A_LD_XOR_ST 23, 19, 4, 4
+ A_LD_XOR_ST 15, 23, 11, 12
+ A_LD_XOR_ST 4, 15, 18, 19
+ A_LD_XOR_ST 24, 4, 25, 25
+ A_LD_XOR_ST 21, 24, 31, 31
+ A_LD_XOR_ST 8, 21, 4, 5
+ A_LD_XOR_ST 16, 8, 9, 10
+ A_LD_XOR_ST 5, 16, 14, 14
+ A_LD_XOR_ST 3, 5, 18, 18
+ A_LD_XOR_ST 18, 3, 21, 22
+ A_LD_XOR_ST 17, 18, 24, 25
+ A_LD_XOR_ST 11, 17, 27, 27
+ A_LD_XOR_ST 7, 11, 29, 29
+ A_LD_XOR_ST 10, 7, 30, 31
+ A_LD_XOR_ST 1, 10, 31, 0
+ ldrd r8, r12, [sp, #80]
+ A_ST r8, r12, 1
+
+ b.w fndsa_sha3_process_block__loop_step2
+
+fndsa_sha3_process_block__final:
+ @ Recombine even-indexed and odd-indexed bits.
+ @ Everything is written back into the original state array.
+ @ Words are still in permuted state, and have delayed rotations
+ @ that should be applied here.
+ ldr r12, [sp, #72] @ Pointer to state array
+ ldr r14, [sp, #76] @ Rate (9, 13, 17, 18 or 21)
+
+ @ Load word i into the given registers, and apply the specified
+ @ rotations.
+.macro A_LDROR x0, x1, i, e0, e1 + A_LD \x0, \x1, \i + ROR_WORD \x0, \x1, \e0, \e1 +.endm + + @ r0:r1 was set in the last iteration and has no delayed rotation + @A_LD r0, r1, 0 + A_LDROR r2, r3, 6, 10, 10 + A_LDROR r4, r5, 12, 10, 11 + A_LDROR r6, r7, 18, 21, 22 + A_LDROR r10, r11, 24, 25, 25 + bl bit_merge_5 + stm r12!, { r0, r1, r2, r3, r4, r5, r6, r7, r10, r11 } + A_LDROR r0, r1, 3, 18, 18 + A_LDROR r2, r3, 9, 22, 22 + A_LDROR r4, r5, 10, 30, 31 + A_LDROR r6, r7, 16, 9, 10 + A_LDROR r10, r11, 22, 1, 2 + cmp r14, #9 + bhi.w fndsa_sha3_process_block__L10 + bl bit_merge_4 + b.w fndsa_sha3_process_block__L11 +fndsa_sha3_process_block__L10: + bl bit_merge_5 +fndsa_sha3_process_block__L11: + stm r12!, { r0, r1, r2, r3, r4, r5, r6, r7, r10, r11 } + A_LDROR r0, r1, 1, 31, 0 + A_LDROR r2, r3, 7, 29, 29 + A_LDROR r4, r5, 13, 19, 20 + A_LDROR r6, r7, 19, 28, 28 + A_LDROR r10, r11, 20, 23, 23 + cmp r14, #13 + bhi.w fndsa_sha3_process_block__L12 + cmp r14, #10 + bls.w fndsa_sha3_process_block__L13 + bl bit_merge_3 + b.w fndsa_sha3_process_block__L13 +fndsa_sha3_process_block__L12: + bl bit_merge_5 +fndsa_sha3_process_block__L13: + stm r12!, { r0, r1, r2, r3, r4, r5, r6, r7, r10, r11 } + A_LDROR r0, r1, 4, 18, 19 + A_LDROR r2, r3, 5, 14, 14 + A_LDROR r4, r5, 11, 27, 27 + A_LDROR r6, r7, 17, 24, 25 + A_LDROR r10, r11, 23, 4, 4 + cmp r14, #17 + beq.w fndsa_sha3_process_block__L15 + cmp r14, #18 + beq.w fndsa_sha3_process_block__L14 + cmp r14, #15 + bls.w fndsa_sha3_process_block__L16 + bl bit_merge_5 + b.w fndsa_sha3_process_block__L16 +fndsa_sha3_process_block__L14: + bl bit_merge_3 + b.w fndsa_sha3_process_block__L16 +fndsa_sha3_process_block__L15: + bl bit_merge_2 +fndsa_sha3_process_block__L16: + stm r12!, { r0, r1, r2, r3, r4, r5, r6, r7, r10, r11 } + A_LDROR r0, r1, 2, 1, 1 + A_LDROR r2, r3, 8, 4, 5 + A_LDROR r4, r5, 14, 12, 13 + A_LDROR r6, r7, 15, 11, 12 + A_LDROR r10, r11, 21, 31, 31 + cmp r14, #20 + bls.w fndsa_sha3_process_block__L17 + bl bit_merge_1 
+fndsa_sha3_process_block__L17: + stm r12!, { r0, r1, r2, r3, r4, r5, r6, r7, r10, r11 } + + add sp, #88 + vpop.64 { d8, d9, d10, d11, d12, d13, d14, d15 } + pop { r4, r5, r6, r7, r8, r10, r11, pc } + + .align 2 +process_block_RC: + .word 0x00000001, 0x00000000 + .word 0x00000000, 0x00000089 + .word 0x00000000, 0x8000008B + .word 0x00000000, 0x80008080 + .word 0x00000001, 0x0000008B + .word 0x00000001, 0x00008000 + .word 0x00000001, 0x80008088 + .word 0x00000001, 0x80000082 + .word 0x00000000, 0x0000000B + .word 0x00000000, 0x0000000A + .word 0x00000001, 0x00008082 + .word 0x00000000, 0x00008003 + .word 0x00000001, 0x0000808B + .word 0x00000001, 0x8000000B + .word 0x00000001, 0x8000008A + .word 0x00000001, 0x80000081 + .word 0x00000000, 0x80000081 + .word 0x00000000, 0x80000008 + .word 0x00000000, 0x00000083 + .word 0x00000000, 0x80008003 + .word 0x00000001, 0x80008088 + .word 0x00000000, 0x80000088 + .word 0x00000001, 0x00008000 + .word 0x00000000, 0x80008082 +process_block_RC__end: + + .size fndsa_sha3_process_block,.-fndsa_sha3_process_block diff --git a/crypto_sign/fndsa_provisional-512/m4f/sign.c b/crypto_sign/fndsa_provisional-512/m4f/sign.c new file mode 120000 index 00000000..c0dd0533 --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/sign.c @@ -0,0 +1 @@ +../../../mupq/crypto_sign/fndsa_provisional-512/ref/sign.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-512/m4f/sign_core.c b/crypto_sign/fndsa_provisional-512/m4f/sign_core.c new file mode 120000 index 00000000..879d614a --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/sign_core.c @@ -0,0 +1 @@ +../../../mupq/crypto_sign/fndsa_provisional-512/ref/sign_core.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-512/m4f/sign_fpoly.c b/crypto_sign/fndsa_provisional-512/m4f/sign_fpoly.c new file mode 120000 index 00000000..b5d1b080 --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/sign_fpoly.c @@ -0,0 +1 @@ 
+../../../mupq/crypto_sign/fndsa_provisional-512/ref/sign_fpoly.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-512/m4f/sign_fpr.c b/crypto_sign/fndsa_provisional-512/m4f/sign_fpr.c new file mode 120000 index 00000000..26216c14 --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/sign_fpr.c @@ -0,0 +1 @@ +../../../mupq/crypto_sign/fndsa_provisional-512/ref/sign_fpr.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-512/m4f/sign_fpr_cm4.s b/crypto_sign/fndsa_provisional-512/m4f/sign_fpr_cm4.s new file mode 100644 index 00000000..d8e85a83 --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/sign_fpr_cm4.s @@ -0,0 +1,1020 @@ + .syntax unified + .cpu cortex-m4 + .file "sign_fpr_cm4.s" + .text + +@ ======================================================================= +@ fpr fndsa_fpr_scaled(int64_t i, int sc) +@ ======================================================================= + + .align 2 + .global fndsa_fpr_scaled + .thumb + .thumb_func + .type fndsa_fpr_scaled, %function +fndsa_fpr_scaled: + @push { r4, r5 } + vmov s0, s1, r4, r5 + + @ Get absolute value into r0:r5. + eor r0, r0, r1, asr #31 + eor r5, r1, r1, asr #31 + subs r0, r0, r1, asr #31 + sbc r5, r5, r1, asr #31 + + @ Count leading zeros of r0:r5 (into r3). + @ r12 = 1 if r3 >= 32, 0 otherwise. + clz r3, r5 + clz r4, r0 + lsrs r12, r3, #5 + umlal r3, r4, r4, r12 + + @ Normalize absolute value to [2^63,2^64-1]: we shift-left the value + @ by r3 bits. We also adjust the scaling (sc, in r2) accordingly. + subs r2, r2, r3 + + @ At this point, r12 = 1 if r3 >= 32, 0 otherwise. + rsbs r12, #0 @ r1 <- -1 if r12 = 1, 0 otherwise + umlal r0, r5, r0, r12 @ if r5 = 0 then r0:r5 <- 0:r0 + and r12, r3, #31 + movs r4, #1 + lsls r4, r12 + umull r0, r12, r0, r4 + umlal r12, r4, r5, r4 + + @ Normalized absolute value is now in r0:r12. + @ If the source integer was zero, then r0:r12 = 0 at this point. 
+ @ Since the pre-normalized absolute value was at most 2^63-1, the + @ lowest bit of r0 is necessarily zero. + + @ Adjust exponent. The mantissa will spill an extra 1 into the + @ exponent. + addw r2, r2, #1085 + + @ Shrink mantissa to [2^52,2^53-1] with rounding. + @ See fpr_add() for details. Since we can only guarantee that the + @ lowest bit is 0, the method involves adding 0x7FE00000, which + @ cannot fit in a representable constant for add; we have to + @ use movw and a shift. + lsls r2, r2, #20 @ exponent + lsls r4, r0, #21 + lsrs r0, r0, #11 + bfi r4, r0, #21, #1 + movw r5, #0x7FE0 + adds r4, r4, r5, lsl #16 + adcs r0, r0, r12, lsl #21 + adcs r12, r2, r12, lsr #11 + + @ If the source value was zero then the mantissa is still zero, + @ but the exponent field is wrong and must be adjusted. We still + @ have the count of leading zeros in r3; source was 0 if and only + @ if r3 = 64. + sbfx r3, r3, #6, #1 + bics r12, r3 + + @ Insert back the sign. + bfi r1, r12, #0, #31 + + @pop { r4, r5 } + vmov r4, r5, s0, s1 + bx lr + .size fndsa_fpr_scaled,.-fndsa_fpr_scaled + +@ ======================================================================= +@ fpr fndsa_fpr_add(fpr x, fpr y) +@ ======================================================================= + + .align 2 + .global fndsa_fpr_add + .thumb + .thumb_func + .type fndsa_fpr_add, %function +fndsa_fpr_add: + @push { r4, r5, r6, r7, r8 } + vmov s0, s1, r4, r5 + vmov s2, s3, r6, r7 + vmov s4, r8 + + @ Operands are in r0:r1 and r2:r3. We want to conditionally swap + @ them, so that x (r0:r1) has the greater absolute value of the two; + @ if both have the same absolute value and different signs, then + @ x should be positive. This ensures that the exponent of y is not + @ greater than that of x, and the result has the sign of x. + @ + @ To ignore the sign bit in the comparison, we left-shift the high + @ word of both operands by 1 bit (this does not change the order of + @ the absolute values). 
To cover the case of two equal absolute + @ values, we inject the sign of x as an initial borrow (thus, if + @ the absolute values are equal but x is negative, then the + @ comparison will decide that x is "lower" and do the swap). We + @ leverage the fact that r1 cannot be 0xFFFFFFFF (it would mean that + @ x is a NaN), and therefore subtracting the word-extended sign bit + @ will produce the expected borrow. + lsls r7, r1, #1 @ Left-shift high word of x + subs r6, r1, r1, asr #31 @ Initial borrow if x is negative + sbcs r6, r0, r2 @ Sub: low words + sbcs r6, r7, r3, lsl #1 @ Sub: high words (with shift of y) + sbcs r4, r4 @ r4 is set to 0xFFFFFFFF for a swap + uadd8 r4, r4, r4 + sel r6, r2, r0 + sel r7, r3, r1 + sel r2, r0, r2 + sel r3, r1, r3 + + @ Now x is in r6:r7, and y is in r2:r3. + + @ Extract mantissa of x into r6:r7, exponent in r4, sign in r5. + @ For the mantissa, we must set bit 52 to 1, except if the (encoded) + @ exponent is zero; in the latter case, the whole value must be zero + @ or minus zero (we do not support subnormals). + asrs r5, r7, #31 @ Sign bit (extended to whole word) + ubfx r4, r7, #20, #11 @ Exponent in r4 (without sign) + addw r8, r4, #2047 @ r8 >= 2048 if and only if r4 != 0 + lsrs r8, r8, #11 @ r8 = 1 except if r4 = 0 + bfi r7, r8, #20, #12 @ Set high mantissa bits + + @ Extract mantissa of y into r2:r3, exponent in r0. + @ r1 receives the xor of the signs of x and y (extended). + eor r1, r5, r3, asr #31 + ubfx r0, r3, #20, #11 @ Exponent in r0 (without sign) + addw r8, r0, #2047 @ r8 >= 2048 if and only if r0 != 0 + lsrs r8, r8, #11 @ r8 = 1 except if r0 = 0 + bfi r3, r8, #20, #12 @ Set high mantissa bits + + @ Scale mantissas up by three bits (i.e. multiply both by 8). 
+ mov r8, #7 + lsls r7, #3 + umlal r6, r7, r6, r8 + lsls r3, #3 + umlal r2, r3, r2, r8 + + @ x: exponent=r4, sign=r5, mantissa=r6:r7 (scaled up 3 bits) + @ y: exponent=r0, sign-xor=r1, mantissa=r2:r3 (scaled up 3 bits) + + @ At that point, the exponent of x (in r4) is larger than that + @ of y (in r0). The difference is the amount of shifting that + @ should be done on y. If that amount is larger than 59 then + @ we clamp y to 0. We won't need y's exponent beyond that point, + @ so we store that shift count in r0. + subs r0, r4, r0 + subs r8, r0, #60 + ands r2, r2, r8, asr #31 + ands r3, r3, r8, asr #31 + + @ Shift right r2:r3 by r0 bits (with result in r3:r0). The + @ shift count is in the 0..59 range. r12 will be non-zero if and + @ only if some non-zero bits were dropped. + + @ If r0 >= 32, then right-shift by 32 bits; r12 is set to the + @ dropped bits (or 0 if r0 < 32). + sbfx r8, r0, #5, #1 + and r12, r2, r8 + bic r2, r2, r8 + umlal r3, r2, r3, r8 + @ Right-shift by r0 mod 32 bits; dropped bits (from r3) are + @ accumulated into r12 (with OR). + and r0, r0, #31 + mov r8, #0xFFFFFFFF + lsr r8, r0 @ r8 <- 2^(32-sc) - 1 + eors r0, r0 + umlal r3, r0, r3, r8 + umlal r2, r3, r2, r8 + orr r12, r12, r2 + + @ If r12 is non-zero then some non-zero bit was dropped and the + @ low bit of r2 must be forced to 1 ('sticky bit'). + rsbs r2, r12, #0 + orrs r2, r2, r12 + orrs r3, r3, r2, lsr #31 + + @ x: exponent=r4, sign=r5, mantissa=r6:r7 (scaled up 3 bits) + @ y: sign=r1, value=r3:r0 (scaled to same exponent as x) + + @ If x and y have the same sign (r1 = 0), then we add r3:r0 to r6:r7. + @ Otherwise (r1 = -1), we subtract r3:r0 from r6:r7. Both values are + @ less than 2^56, and output cannot be negative. + orr r2, r1, #1 @ r2 = 1 if r1 = 0, or -1 if r1 = -1 + umlal r6, r7, r3, r2 + muls r3, r1 + umaal r7, r3, r0, r2 + + @ result: exponent=r4, sign=r5, mantissa=r6:r7 (scaled up 3 bits) + @ Value in r6:r7 is necessarily less than 2^57. 
+ + @ Normalize the result with some left-shifting to full 64-bit + @ width. Shift count goes to r2, and exponent (r4) is adjusted. + clz r2, r7 + clz r3, r6 + sbfx r0, r2, #5, #1 + umlal r3, r2, r3, r0 + subs r4, r4, r2 + + @ Shift r6:r7 to the left by r2 bits. + @ If r2 >= 32, then r7 = 0 and r0 = -1, and we set: r6:r7 <- 0:r6 + umlal r6, r7, r6, r0 + @ Left-shift by r2 mod 32 + and r2, #31 + movs r1, #1 + lsls r1, r2 + umull r6, r12, r6, r1 + umlal r12, r7, r7, r1 + + @ Normalized mantissa is now in r6:r12 + @ Since the mantissa was at most 57-bit pre-normalization, the low + @ 7 bits of r6 must be zero. + + @ The exponent of x was in r4. The left-shift operation has + @ subtracted some value from it, 8 in case the result has the + @ same exponent as x. However, the high bit of the mantissa will + @ add 1 to the exponent, so we only add back 7 (the exponent is + @ added in because rounding might have produced a carry, which + @ should then spill into the exponent). + adds r4, #7 + + @ If the new mantissa is non-zero, then its bit 63 is non-zero + @ (thanks to the normalizing shift). Otherwise, that bit is + @ zero, and we should then set the exponent to zero as well. + ands r4, r4, r12, asr #31 + + @ We have a 64-bit value which we must shrink down to 53 bits, i.e. + @ removing the low 11 bits. Rounding must be applied. The low 12 + @ bits of r6 (in high-to-low order) are: + @ b4 b3 b2 b1 b0 0000000 + @ (as mentioned earlier, the lowest 7 bits must be zero) + @ After a strict right shift, b4 is the lowest bit. Rounding will + @ add +1 to the value if and only if: + @ - b4 = 0 and b3:b2:b1:b0 >= 1001 + @ - b4 = 1 and b3:b2:b1:b0 >= 1000 + @ Equivalently, we must add +1 after the shift if and only if: + @ b3:b2:b1:b0:b4 + 01111 >= 100000 + lsls r5, #31 @ sign of output is sign of x + orr r1, r5, r4, lsl #20 @ exponent and sign + lsls r3, r6, #21 @ top(r3) = b3:b2:b1:b0:00... + lsrs r0, r6, #11 + bfi r3, r0, #27, #1 @ top(r3) = b3:b2:b1:b0:b4:00... 
+ adds r3, r3, #0x78000000 @ add 01111 to top bits, carry is adjust + adcs r0, r0, r12, lsl #21 + adcs r1, r1, r12, lsr #11 + + @ If the mantissa in r6:r7 was zero, then r0:r1 contains zero at + @ this point, and the exponent r4 was cleared before, so there is + @ not need for further correcting actions. + + @pop { r4, r5, r6, r7, r8 } + vmov r4, r5, s0, s1 + vmov r6, r7, s2, s3 + vmov r8, s4 + bx lr + .size fndsa_fpr_add,.-fndsa_fpr_add + +@ ======================================================================= +@ fpr*2 fndsa_fpr_add_sub(fpr x, fpr y) +@ This function returns two 64-bit values: x+y in r0:r1, and x-y in r2:r3 +@ +@ This does not follow the AAPCS, hence the caller must be custom (inline) +@ assembly that specifies clobbers and dispatches the two results +@ appropriately. +@ Clobbers: r4, r5, r6, r7, r8, r10, r11, r12, r14, s15, flags +@ ======================================================================= + + .align 2 + .global fndsa_fpr_add_sub + .thumb + .thumb_func + .type fndsa_fpr_add_sub, %function +fndsa_fpr_add_sub: + @ Operands are in r0:r1 and r2:r3. We want to conditionally swap + @ them, so that x (r0:r1) has the greater absolute value of the two; + @ if both have the same absolute value and different signs, then + @ x should be positive. This ensures that the exponent of y is not + @ greater than that of x, and the result of the addition has the + @ sign of x. We must still remember whether a swap occurred, because + @ in that case the subtraction will compute y-x instead of x-y, + @ and we will have to negate the second output. + @ + @ Signs for zeros: for any z, z + (-z) and z - z should be +0, + @ never -0. 
The exact process is: + @ + @ swap <- false + @ if abs(x) < abs(y): + @ swap <- true + @ elif abs(x) == abs(y): + @ if is_neg(x): + @ swap <- true + @ if swap: + @ (x, y) <- (y, x) + @ a <- abs(x) + abs(y) + @ b <- abs(x) - abs(y) + @ sign(a) <- sign(x) + @ if swap: + @ sign(b) <- sign(x) + @ else: + @ sign(b) <- sign(-x) + @ + @ Indeed, if abs(x) = abs(y): + @ x y x+y x-y + @ + + + + no swap + @ + - + + no swap + @ - + + - swap + @ - - - + swap + @ + @ To ignore the sign bit in the comparison, we left-shift the high + @ word of both operands by 1 bit (this does not change the order of + @ the absolute values). To cover the case of two equal absolute + @ values, we inject the sign of x as an initial borrow (thus, if + @ the absolute values are equal but x is negative, then the + @ comparison will decide that x is "lower" and do the swap). We + @ leverage the fact that r1 cannot be 0xFFFFFFFF (it would mean that + @ x is a NaN), and therefore subtracting the word-extended sign bit + @ will produce the expected borrow. + lsls r7, r1, #1 @ Left-shift high word of x + subs r6, r1, r1, asr #31 @ Initial borrow if x is negative + sbcs r6, r0, r2 @ Sub: low words + sbcs r6, r7, r3, lsl #1 @ Sub: high words (with shift of y) + sbc r11, r11 @ r11 is set to 0xFFFFFFFF for a swap + uadd8 r4, r11, r11 + sel r6, r2, r0 + sel r7, r3, r1 + sel r2, r0, r2 + sel r3, r1, r3 + + @ Now x is in r6:r7, and y is in r2:r3. + + @ Extract mantissa of x into r6:r7, exponent in r4, sign in r5. + @ For the mantissa, we must set bit 52 to 1, except if the (encoded) + @ exponent is zero; in the latter case, the whole value must be zero + @ or minus zero (we do not support subnormals). + asrs r5, r7, #31 @ Sign bit (extended to whole word) + ubfx r4, r7, #20, #11 @ Exponent in r4 (without sign) + addw r8, r4, #2047 @ r8 >= 2048 if and only if r4 != 0 + lsrs r8, r8, #11 @ r8 = 1 except if r4 = 0 + bfi r7, r8, #20, #12 @ Set high mantissa bits + + @ Extract mantissa of y into r2:r3, exponent in r0. 
+ @ r1 receives the xor of the signs of x and y (extended). + eor r1, r5, r3, asr #31 + ubfx r0, r3, #20, #11 @ Exponent in r0 (without sign) + addw r8, r0, #2047 @ r8 >= 2048 if and only if r0 != 0 + lsrs r8, r8, #11 @ r8 = 1 except if r0 = 0 + bfi r3, r8, #20, #12 @ Set high mantissa bits + + @ Scale mantissas up by three bits (i.e. multiply both by 8). + mov r8, #7 + lsls r7, #3 + umlal r6, r7, r6, r8 + lsls r3, #3 + umlal r2, r3, r2, r8 + + @ x: exponent=r4, sign=r5, mantissa=r6:r7 (scaled up 3 bits) + @ y: exponent=r0, sign-xor=r1, mantissa=r2:r3 (scaled up 3 bits) + + @ At that point, the exponent of x (in r4) is larger than that + @ of y (in r0). The difference is the amount of shifting that + @ should be done on y. If that amount is larger than 59 then + @ we clamp y to 0. We won't need y's exponent beyond that point, + @ so we store that shift count in r0. + subs r0, r4, r0 + subs r8, r0, #60 + ands r2, r2, r8, asr #31 + ands r3, r3, r8, asr #31 + + @ Shift right r2:r3 by r0 bits (with result in r3:r0). The + @ shift count is in the 0..59 range. r12 will be non-zero if and + @ only if some non-zero bits were dropped. + + @ If r0 >= 32, then right-shift by 32 bits; r12 is set to the + @ dropped bits (or 0 if r0 < 32). + sbfx r8, r0, #5, #1 + and r12, r2, r8 + bic r2, r2, r8 + umlal r3, r2, r3, r8 + @ Right-shift by r0 mod 32 bits; dropped bits (from r3) are + @ accumulated into r12 (with OR). + and r0, r0, #31 + mov r8, #0xFFFFFFFF + lsr r8, r0 @ r8 <- 2^(32-sc) - 1 + eors r0, r0 + umlal r3, r0, r3, r8 + umlal r2, r3, r2, r8 + orr r12, r12, r2 + + @ If r12 is non-zero then some non-zero bit was dropped and the + @ low bit of r2 must be forced to 1 ('sticky bit'). + rsbs r2, r12, #0 + orrs r2, r2, r12 + orrs r3, r3, r2, lsr #31 + + @ x: exponent=r4, sign=r5, mantissa=r6:r7 (scaled up 3 bits) + @ y: sign=r1, value=r3:r0 (scaled to same exponent as x) + + @ Compute the sum (into r6:r7) and the difference (into r12:r8). 
+ subs r12, r6, r3 + sbcs r8, r7, r0 + adds r6, r6, r3 + adcs r7, r7, r0 + + @ Swap the values if r1 = -1. Second output goes to: r10:r12 + uadd8 r10, r1, r1 + sel r10, r6, r12 + sel r6, r12, r6 + sel r12, r7, r8 + sel r7, r8, r7 + + @ Save high word of second output (low word is kept in r10). + vmov s15, r12 + + @ Post-processing for first output + @ -------------------------------- + + @ result: exponent=r4, sign=r5, mantissa=r6:r7 (scaled up 3 bits) + @ Value in r6:r7 is necessarily less than 2^57. + + @ Normalize the result with some left-shifting to full 64-bit + @ width. Shift count goes to r2, and exponent (r4) is adjusted. + @ The adjusted exponent goes to r8 (we want to keep r4 untouched). + clz r2, r7 + clz r3, r6 + sbfx r0, r2, #5, #1 + umlal r3, r2, r3, r0 + sub r8, r4, r2 + + @ Shift r6:r7 to the left by r2 bits. + @ If r2 >= 32, then r7 = 0 and r0 = -1, and we set: r6:r7 <- 0:r6 + umlal r6, r7, r6, r0 + @ Left-shift by r2 mod 32 + and r2, #31 + movs r1, #1 + lsls r1, r2 + umull r6, r12, r6, r1 + umlal r12, r7, r7, r1 + + @ Normalized mantissa is now in r6:r12 + @ Since the mantissa was at most 57-bit pre-normalization, the low + @ 7 bits of r6 must be zero. + + @ The exponent of x was in r8. The left-shift operation has + @ subtracted some value from it, 8 in case the result has the + @ same exponent as x. However, the high bit of the mantissa will + @ add 1 to the exponent, so we only add back 7 (the exponent is + @ added in because rounding might have produced a carry, which + @ should then spill into the exponent). + add r8, r8, #7 + + @ If the new mantissa is non-zero, then its bit 63 is non-zero + @ (thanks to the normalizing shift). Otherwise, that bit is + @ zero, and we should then set the exponent to zero as well. + and r8, r8, r12, asr #31 + + @ We have a 64-bit value which we must shrink down to 53 bits, i.e. + @ removing the low 11 bits. Rounding must be applied. 
The low 12 + @ bits of r6 (in high-to-low order) are: + @ b4 b3 b2 b1 b0 0000000 + @ (as mentioned earlier, the lowest 7 bits must be zero) + @ After a strict right shift, b4 is the lowest bit. Rounding will + @ add +1 to the value if and only if: + @ - b4 = 0 and b3:b2:b1:b0 >= 1001 + @ - b4 = 1 and b3:b2:b1:b0 >= 1000 + @ Equivalently, we must add +1 after the shift if and only if: + @ b3:b2:b1:b0:b4 + 01111 >= 100000 + lsls r5, #31 @ sign of output is sign of x + orr r1, r5, r8, lsl #20 @ exponent and sign + lsls r3, r6, #21 @ top(r3) = b3:b2:b1:b0:00... + lsrs r0, r6, #11 + bfi r3, r0, #27, #1 @ top(r3) = b3:b2:b1:b0:b4:00... + adds r3, r3, #0x78000000 @ add 01111 to top bits, carry is adjust + adcs r0, r0, r12, lsl #21 + adcs r1, r1, r12, lsr #11 + + @ If the mantissa in r6:r7 was zero, then r0:r1 contains zero at + @ this point, and the exponent r8 was cleared before, so there is + @ not need for further correcting actions. + + @ Post-processing for second output + @ --------------------------------- + + @ Unprocessed second output is in r10:s15 + vmov r7, s15 + + @ result: exponent=r4, sign=r5 (top), mantissa=r10:r7 (scaled up 3 bits) + @ Value in r10:r7 is necessarily less than 2^57. + + @ Normalize the result with some left-shifting to full 64-bit + @ width. Shift count goes to r2, and exponent (r4) is adjusted. + clz r2, r7 + clz r3, r10 + sbfx r8, r2, #5, #1 + umlal r3, r2, r3, r8 + subs r4, r4, r2 + + @ Shift r10:r7 to the left by r2 bits (into r6:r12) + @ If r2 >= 32, then r7 = 0 and r8 = -1, and we set: r10:r7 <- 0:r10 + umlal r10, r7, r10, r8 + @ Left-shift by r2 mod 32 + and r2, #31 + movw r8, #1 + lsl r8, r2 + umull r6, r12, r10, r8 + umlal r12, r7, r7, r8 + + @ Normalized mantissa is now in r6:r12 + @ Since the mantissa was at most 57-bit pre-normalization, the low + @ 7 bits of r6 must be zero. + + @ The exponent of x was in r4. The left-shift operation has + @ subtracted some value from it, 8 in case the result has the + @ same exponent as x. 
However, the high bit of the mantissa will
+ @ add 1 to the exponent, so we only add back 7 (the exponent is
+ @ added in because rounding might have produced a carry, which
+ @ should then spill into the exponent).
+ adds r4, #7
+
+ @ If the new mantissa is non-zero, then its bit 63 is non-zero
+ @ (thanks to the normalizing shift). Otherwise, that bit is
+ @ zero, and we should then set the exponent to zero as well.
+ ands r4, r4, r12, asr #31
+
+ @ We have a 64-bit value which we must shrink down to 53 bits, i.e.
+ @ removing the low 11 bits. Rounding must be applied. The low 12
+ @ bits of r6 (in high-to-low order) are:
+ @ b4 b3 b2 b1 b0 0000000
+ @ (as mentioned earlier, the lowest 7 bits must be zero)
+ @ After a strict right shift, b4 is the lowest bit. Rounding will
+ @ add +1 to the value if and only if:
+ @ - b4 = 0 and b3:b2:b1:b0 >= 1001
+ @ - b4 = 1 and b3:b2:b1:b0 >= 1000
+ @ Equivalently, we must add +1 after the shift if and only if:
+ @ b3:b2:b1:b0:b4 + 01111 >= 100000
+ orr r7, r5, r4, lsl #20 @ exponent and sign
+ lsls r3, r6, #21 @ top(r3) = b3:b2:b1:b0:00...
+ lsr r8, r6, #11
+ bfi r3, r8, #27, #1 @ top(r3) = b3:b2:b1:b0:b4:00...
+ adds r3, r3, #0x78000000 @ add 01111 to top bits, carry is adjust
+ adcs r2, r8, r12, lsl #21
+ adcs r3, r7, r12, lsr #11
+
+ @ If there was an operand swap, then we should reverse the sign
+ @ of the second operand here. As described previously, this also
+ @ correctly handles situations involving zeros.
+ @ Swap flag (-1 or 0) is still in r11.
+ eor r3, r3, r11, lsl #31 + + bx lr + .size fndsa_fpr_add_sub,.-fndsa_fpr_add_sub + +@ ======================================================================= +@ fpr fndsa_fpr_mul(fpr x, fpr y) +@ ======================================================================= + + .align 2 + .global fndsa_fpr_mul + .thumb + .thumb_func + .type fndsa_fpr_mul, %function +fndsa_fpr_mul: + @push { r4, r5, r6, r7 } + vmov s0, s1, r4, r5 + vmov s2, s3, r6, r7 + + @ Extract mantissas: x.m = r0:r4, y.m = r2:r5 + @ We assume both operands are non-zero. + ubfx r4, r1, #0, #20 + ubfx r5, r3, #0, #20 + orr r4, r4, #0x00100000 + orr r5, r5, #0x00100000 + + @ Extract signs and exponent. We want to store the aggregate sign + @ (XOR of the two signs) in r1 (top bit, other bits cleared), + @ and in r3 the aggregate exponent. + ubfx r6, r1, #20, #11 + ubfx r7, r3, #20, #11 + eors r1, r3 + bfc r1, #0, #31 + adds r3, r6, r7 + sub r3, r3, #1024 + @ If either of the exponents is zero, then we clear the exponent + @ and the first mantissa, which will lead through all subsequent + @ computations to a zero result (except for the sign bit). + muls r6, r7 + rsbs r6, #0 + and r3, r3, r6, asr #31 + and r0, r0, r6, asr #31 + and r4, r4, r6, asr #31 + @ Move the exponent to its correct position in r1. + add r1, r1, r3, lsl #20 + @ r3 is now free. + + @ Compute mantissa product into r6:r7:r3:r0. + umull r6, r7, r0, r2 + umull r3, r0, r0, r5 + umaal r7, r3, r4, r2 + umaal r3, r0, r4, r5 + + @ r2, r4 and r5 are free. + + @ Product is in [2^104, 2^106 - 2^54 + 1]. We right-shift it + @ by 52 or 53 bits, into r5:r7, so that the output is in + @ [2^52, 2^53-1]. We must keep track of dropped bits so that we + @ may apply rounding properly. + @ Set r5 to 1 if we need to shift by 53, or to 0 otherwise. + @ If r5 is 1 then we must adjust the exponent. + lsrs r5, r0, #9 + add r1, r1, r5, lsl #20 + @ Set r4 to 2^11 (if r5 = 1) or 2^12 (if r5 = 0). 
We will use + @ it to perform a left shift by 11 or 12 bits, which is the same + @ as a right shift by 53 or 52 bits if we use the correct output + @ registers. + movw r4, #0x1000 + lsrs r4, r5 + @ r5 is now free. + @ Do the shift. Dropped bits are r6 (entire register) and r2 (top + @ bits, in order, rest of the register bits are zero). + umull r2, r5, r7, r4 + umull r7, r12, r0, r4 + umlal r5, r7, r3, r4 + + @ Rounding may need to add 1. The top bits of r2 are the top dropped + @ bits. We keep bit 31 as is, then compact all other dropped bits + @ into bit 30 ("sticky bit") and finally push a copy of the least + @ significant kept bit (lowest bit of r5) into bit 29 of r2. + orr r6, r6, r2, lsl #1 + clz r6, r6 @ 32 if all bits are 0 + mvns r6, r6, lsr #5 + bfi r2, r6, #30, #1 + bfi r2, r5, #29, #1 + @ By adding 011 to the top bits of r2, we generate the rounding + @ adjustment into the carry, which we can then apply to the + @ mantissa. + adds r2, r2, #0x60000000 + adcs r0, r5, #0 + adcs r1, r7 + + @pop { r4, r5, r6, r7 } + vmov r4, r5, s0, s1 + vmov r6, r7, s2, s3 + bx lr + .size fndsa_fpr_mul,.-fndsa_fpr_mul + +@ ======================================================================= +@ fpr fndsa_fpr_div(fpr x, fpr y) +@ ======================================================================= + + .align 2 + .global fndsa_fpr_div + .thumb + .thumb_func + .type fndsa_fpr_div, %function +fndsa_fpr_div: + push { r4, r5, r6, r7, r8, r10, r11, r14 } + + @ Save high words of inputs (signs, exponents). + vmov s0, r1 + vmov s1, r3 + + @ Extract mantissas (assuming values are non-zero). + @ r0:r1 <- x.m + @ r2:r3 <- y.m + ubfx r1, r1, #0, #20 + ubfx r3, r3, #0, #20 + orr r1, r1, #0x00100000 + orr r3, r3, #0x00100000 + + @ Bit-by-bit division of the mantissas: we run it for 55 iterations + @ then append an extra 56-th sticky bit (non-zero if the remainder + @ is not zero at this point). Quotient goes to r10:r12. 
+ eor r10, r10 + + @ For divisor mantissa y.m, we prepare the following: + @ r2:r3 y.m*2 + @ r4 hi(y.m*4) + @ r5 hi(y.m*8) + @ r6 hi(y.m*16) + @ r7:r8 -(y.m*2) + adds r2, r2 + adcs r3, r3 + adds r7, r2, r2 + adcs r4, r3, r3 + adds r7, r7 + adcs r5, r4, r4 + adds r7, r7 + adcs r6, r5, r5 + subs r7, r10, r2 + sbcs r8, r10, r3 + + mov r12, #15 +.macro DIVIDEND_MUL16 + lsls r1, #4 + umlal r0, r1, r0, r12 +.endm + mov r14, #2 + + @ Parameter sh is 1, 2, 3 or 4. + @ DIVSTEP_SH takes current dividend in r0:r1 and assumes that it + @ is left-shifted by sh bits compared to its theoretical value. + @ Divisor is subtracted (if possible), yielding the next quotient + @ bit, which is pushed into r10. After the conditional subtraction, + @ the dividend is formally left-shifted by 1 bit, but this macro + @ omits the shift. +.macro DIVSTEP_SH sh + @ Check whether the divisor can be subtracted; we must use the + @ properly shifted divisor to match the dividend shift. + subs r11, r0, r2, lsl #(\sh) + .if (\sh) == 1 + sbcs r11, r1, r3 + .elseif (\sh) == 2 + sbcs r11, r1, r4 + .elseif (\sh) == 3 + sbcs r11, r1, r5 + .else + sbcs r11, r1, r6 + .endif + @ Inject next quotient bit in r10. Also extract that bit into r11, + @ left-shifted by sh-1 bits (r7:r8 is negation of a shifted divisor). + adcs r10, r10 + .if (\sh) == 2 + and r11, r14, r10, lsl #1 + .else + and r11, r10, #1 + .if (\sh) != 1 + lsl r11, r11, #((\sh) - 1) + .endif + .endif + @ Subtract the divisor conditionally on the quotient bit. + umlal r0, r1, r7, r11 + umlal r1, r11, r8, r11 +.endm + + @ Four successive division steps. +.macro DIVSTEP4 + DIVIDEND_MUL16 + DIVSTEP_SH 4 + DIVSTEP_SH 3 + DIVSTEP_SH 2 + DIVSTEP_SH 1 +.endm + + @ Eight successive division steps. +.macro DIVSTEP8 + DIVSTEP4 + DIVSTEP4 +.endm + + @ First 24 iterations to get the upper 24 quotient bits. + DIVSTEP8 + DIVSTEP8 + DIVSTEP8 + + @ Save upper quotient bits. + vmov s2, r10 + + @ 31 iterations for the next bits.
+ DIVSTEP8 + DIVSTEP8 + DIVSTEP8 + DIVSTEP4 + DIVIDEND_MUL16 + DIVSTEP_SH 4 + DIVSTEP_SH 3 + DIVSTEP_SH 2 + + @ Current remainder is in r0:r1 (left-shifted by 1 bit). If it is + @ non-zero then we must set the last bit of the quotient (sticky bit). + subs r0, #1 + sbcs r1, #0 + adcs r10, r10 + + @ Restore upper quotient bits into r12. + vmov r12, s2 + + @ We have a quotient q in r10:r12, with value up to 2^56-1. It cannot + @ be lower than 2^54, since both operands were in [2^52, 2^53-1]. + @ This is a situation similar to that of multiplication. We + @ normalize r10:r12 to 2^54..2^55-1 (into r6:r7) with a conditional + @ shift (low bit is sticky). r5 contains -1 if the shift was done, + @ 0 otherwise. + sbfx r5, r12, #23, #1 + subs r4, r5, #1 + rors r4, #1 + eors r7, r7 + umlal r12, r7, r12, r4 + umlal r10, r12, r10, r4 + orr r6, r12, r10, lsr #31 @ dropped bit is sticky + + @ We recover source top words into r1 and r3. r5 contains the extra + @ shift flag. r6:r7 is the 55-bit output mantissa. Other registers + @ are free. + vmov r1, s0 + vmov r3, s1 + + @ Extract source exponents ex and ey (encoded) into r0 and r2. + @ Also set r4 to a negative value if x = 0, or to 0 otherwise + @ (by our assumptions, divisor y is non-zero). + ubfx r0, r1, #20, #11 + ubfx r2, r3, #20, #11 + subs r4, r0, #1 + + @ Compute aggregate exponent: ex - ey + 1022 + w + @ (where w = 1 if the conditional shift was done, 0 otherwise) + @ But we subtract 1 because the injection of the mantissa high + @ bit will increment the exponent by 1. + subs r2, r0, r2 + add r2, r2, #1021 + subs r2, r2, r5 + + @ If dividend is zero, then clamp mantissa and aggregate exponent + @ to zero. + bic r2, r2, r4, asr #31 + bic r6, r6, r4, asr #31 + bic r7, r7, r4, asr #31 + + @ Sign is the XOR of the sign of the operands. This is true in + @ all cases, including very small results (exponent underflow) + @ and zeros. + eors r1, r3 + bfc r1, #0, #31 + + @ Plug in the exponent. 
+ bfi r1, r2, #20, #11 + + @ r2 and r3 are free. + @ Shift back to the normal 53-bit mantissa, with rounding. + @ Mantissa goes into r0:r1. r1 already contains the exponent and + @ sign bit; we must do an addition, which will also cover the case + @ of a carry (from rounding) spilling into the exponent. + @ Rounding adds 1 to the shifted mantissa when the three low bits + @ of the mantissa (before the shift) are 011, 110 or 111, i.e. + @ exactly when: (bit0 and bit1) or (bit1 and bit2) = 1. + and r3, r6, r6, lsr #1 + orr r3, r3, r3, lsr #1 + and r0, r3, #1 + add r0, r0, r6, lsr #2 + adds r0, r0, r7, lsl #30 + adcs r1, r1, r7, lsr #2 + + pop { r4, r5, r6, r7, r8, r10, r11, pc } + .size fndsa_fpr_div,.-fndsa_fpr_div + +@ ======================================================================= +@ fpr fndsa_fpr_sqrt(fpr x) +@ ======================================================================= + + .align 2 + .global fndsa_fpr_sqrt + .thumb + .thumb_func + .type fndsa_fpr_sqrt, %function +fndsa_fpr_sqrt: + push { r4, r5, r6, r7, r8, r10 } + + @ Extract exponent and mantissa. By assumption, the operand is + @ non-negative, hence we ignore the sign bit (sign bit could be 1 + @ if the operand is minus zero). We also decode the exponent + @ corresponding to a mantissa between 1 and 2. + @ For now, we suppose that the source is not zero. + @ r0:r1 <- mantissa + @ r12 <- encoded exponent + @ r2 <- decoded exponent + ubfx r12, r1, #20, #11 + sub r2, r12, #1023 + bfc r1, #20, #12 + orr r1, r1, #0x00100000 + + @ If the exponent is odd, then multiply mantissa by 2 and subtract 1 + @ from the exponent. + sbfx r3, r2, #0, #1 + and r4, r0, r3 + and r5, r1, r3 + adds r0, r4 + adcs r1, r5 + adds r2, r3 + + @ Exponent is now even, we can halve it. + asrs r2, #1 + + @ Left-shift the mantissa so that it is in [2^61, 2^63-1]. This + @ allows performing the first 30 iterations with some shortcuts + @ (one-word operations). 
+ lsls r1, r1, #9 + orr r1, r1, r0, lsr #23 + lsls r0, r0, #9 + + @ r0:r1 is an integer between 1 (inclusive) and 4 (exclusive) in + @ a fixed-point notation (53 fractional bits). We compute the + @ square root bit by bit (54 iterations). We'll then append an + @ extra sticky bit. + eors r3, r3 + eors r5, r5 + +.macro SQRTSTEP_HI bit + orr r6, r5, #(1 << (\bit)) + subs r7, r1, r6 + rrx r3, r3 + and r6, r6, r3, asr #31 + subs r1, r1, r6 + lsrs r6, r3, #31 + orr r5, r5, r6, lsl #((\bit) + 1) + adds r0, r0 + adcs r1, r1 +.endm + +.macro SQRTSTEP_HI_x5 bb + SQRTSTEP_HI ((\bb) + 4) + SQRTSTEP_HI ((\bb) + 3) + SQRTSTEP_HI ((\bb) + 2) + SQRTSTEP_HI ((\bb) + 1) + SQRTSTEP_HI ((\bb) + 0) +.endm + + SQRTSTEP_HI_x5 25 + SQRTSTEP_HI_x5 20 + SQRTSTEP_HI_x5 15 + SQRTSTEP_HI_x5 10 + SQRTSTEP_HI_x5 5 + SQRTSTEP_HI_x5 0 + + @ We got top 30 bits of the result, in reverse order. + rbit r3, r3 + + @ For the next 24 iterations, we must use two-word operations. + @ First iteration is special because the potential bit goes into + @ r5, not r6. + eors r4, r4 + eors r6, r6 + + orr r7, r6, #(1 << 31) + subs r8, r0, r7 + sbcs r10, r1, r5 + rrx r4, r4 + and r7, r7, r4, asr #31 + and r8, r5, r4, asr #31 + subs r0, r0, r7 + sbcs r1, r1, r8 + lsrs r7, r4, #31 + orr r5, r5, r4, lsr #31 + adds r0, r0 + adcs r1, r1 + +.macro SQRTSTEP_LO bit + orr r7, r6, #(1 << (\bit)) + subs r8, r0, r7 + sbcs r10, r1, r5 + rrx r4, r4 + and r7, r7, r4, asr #31 + and r8, r5, r4, asr #31 + subs r0, r0, r7 + sbcs r1, r1, r8 + lsrs r7, r4, #31 + orr r6, r6, r7, lsl #((\bit) + 1) + adds r0, r0 + adcs r1, r1 +.endm + +.macro SQRTSTEP_LO_x4 bb + SQRTSTEP_LO ((\bb) + 3) + SQRTSTEP_LO ((\bb) + 2) + SQRTSTEP_LO ((\bb) + 1) + SQRTSTEP_LO ((\bb) + 0) +.endm + + SQRTSTEP_LO 30 + SQRTSTEP_LO 29 + SQRTSTEP_LO 28 + SQRTSTEP_LO_x4 24 + SQRTSTEP_LO_x4 20 + SQRTSTEP_LO_x4 16 + SQRTSTEP_LO_x4 12 + SQRTSTEP_LO_x4 8 + + @ Put low 24 bits in the right order. 
+ rbit r4, r4 + + @ We now have a 54-bit result (low 24 bits in r4, top 30 bits in r3). + @ We need to round the value; the sticky bit is implicit (it is 1 if + @ the remainder in r0:r1 is non-zero at this point). + orrs r0, r1 + rsbs r1, r0, #0 + orrs r0, r1 @ sticky bit is in r0[31] + and r0, r4, r0, lsr #31 + and r1, r4, r4, lsr #1 + orrs r0, r1 + ands r0, #1 @ r0 contains the rounding adjustment + lsrs r1, r3, #9 + add r0, r0, r4, lsr #1 + adds r0, r0, r3, lsl #23 + adcs r1, #0 + + @ We have a rounded mantissa (including its top bit). We plug the + @ exponent, which is currently in r2 in decoded format. Since the + @ mantissa top bit is present, we encode r2 by adding 1022. + add r2, #1022 + add r1, r1, r2, lsl #20 + + @ We have the result, except if the source operand was zero, in + @ which case we must clamp the value to 0. Original exponent + @ (encoded) is still in r12. + rsb r3, r12, #0 + and r0, r0, r3, asr #31 + and r1, r1, r3, asr #31 + + pop { r4, r5, r6, r7, r8, r10 } + bx lr + .size fndsa_fpr_sqrt,.-fndsa_fpr_sqrt diff --git a/crypto_sign/fndsa_provisional-512/m4f/sign_inner.h b/crypto_sign/fndsa_provisional-512/m4f/sign_inner.h new file mode 120000 index 00000000..b36c72da --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/sign_inner.h @@ -0,0 +1 @@ +../../../mupq/crypto_sign/fndsa_provisional-512/ref/sign_inner.h \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-512/m4f/sign_sampler.c b/crypto_sign/fndsa_provisional-512/m4f/sign_sampler.c new file mode 120000 index 00000000..7ed648ae --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/sign_sampler.c @@ -0,0 +1 @@ +../../../mupq/crypto_sign/fndsa_provisional-512/ref/sign_sampler.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-512/m4f/sign_sampler_cm4.s b/crypto_sign/fndsa_provisional-512/m4f/sign_sampler_cm4.s new file mode 100644 index 00000000..fe19b6a4 --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/sign_sampler_cm4.s @@ -0,0 
+1,143 @@ + .syntax unified + .cpu cortex-m4 + .file "sign_sampler_cm4.s" + .text + +@ ======================================================================= +@ int32_t fndsa_gaussian0_helper(uint64_t lo, uint32_t hi) +@ ======================================================================= + + .align 2 + .global fndsa_gaussian0_helper + .thumb + .thumb_func + .type fndsa_gaussian0_helper, %function +fndsa_gaussian0_helper: + push.w { r4, r5, r6, r7, r8 } + + adr.w r12, fndsa_gaussian0_helper__gauss0_low + + @ 0 and 1 + ldm r12!, { r4, r5, r6, r7 } + subs r8, r0, r4 + sbcs r8, r1, r5 + sbcs r8, r2, #163 @ high[0] + lsr.w r3, r8, #31 + subs r8, r0, r6 + sbcs r8, r1, r7 + sbcs r8, r2, #84 @ high[1] + add.w r3, r3, r8, lsr #31 + + @ 2 and 3 + ldm r12!, { r4, r5, r6, r7 } + subs r8, r0, r4 + sbcs r8, r1, r5 + sbcs r8, r2, #34 @ high[2] + add.w r3, r3, r8, lsr #31 + subs r8, r0, r6 + sbcs r8, r1, r7 + sbcs r8, r2, #10 @ high[3] + add.w r3, r3, r8, lsr #31 + + @ 4 and 5 + ldm r12!, { r4, r5, r6, r7 } + subs r8, r0, r4 + sbcs r8, r1, r5 + sbcs r8, r2, #2 @ high[4] + add.w r3, r3, r8, lsr #31 + subs r8, r0, r6 + sbcs r8, r1, r7 + sbcs r8, r2, #0 @ high[5] + add.w r3, r3, r8, lsr #31 + + @ 6 and 7 + ldm r12!, { r4, r5, r6, r7 } + subs r8, r0, r4 + sbcs r8, r1, r5 + sbcs r8, r2, #0 @ high[6] + add.w r3, r3, r8, lsr #31 + subs r8, r0, r6 + sbcs r8, r1, r7 + sbcs r8, r2, #0 @ high[7] + add.w r3, r3, r8, lsr #31 + + @ 8 and 9 + ldm r12!, { r4, r5, r6, r7 } + subs r8, r0, r4 + sbcs r8, r1, r5 + sbcs r8, r2, #0 @ high[8] + add.w r3, r3, r8, lsr #31 + subs r8, r0, r6 + sbcs r8, r1, r7 + sbcs r8, r2, #0 @ high[9] + add.w r3, r3, r8, lsr #31 + + @ 10, 11 and 12 + ldm r12!, { r4, r5, r6, r7 } + subs r8, r0, r4 + sbcs r8, r1, r5 + sbcs r8, r2, #0 @ high[10] + add.w r3, r3, r8, lsr #31 + subs r8, r0, r6 + sbcs r8, r1, #148 @ mid[11] + sbcs r8, r2, #0 @ high[11] + add.w r3, r3, r8, lsr #31 + subs r8, r0, r7 + sbcs r8, r1, #3 @ mid[12] + sbcs r8, r2, #0 @ high[12] + add.w r3, r3, r8, 
lsr #31 + + @ 13, 14, 15, 16 + ldm r12!, { r4, r5, r6, r7 } + subs r8, r0, r4 + sbcs r8, r1, #0 @ mid[13] + sbcs r8, r2, #0 @ high[13] + add.w r3, r3, r8, lsr #31 + subs r8, r0, r5 + sbcs r8, r1, #0 @ mid[14] + sbcs r8, r2, #0 @ high[14] + add.w r3, r3, r8, lsr #31 + subs r8, r0, r6 + sbcs r8, r1, #0 @ mid[15] + sbcs r8, r2, #0 @ high[15] + add.w r3, r3, r8, lsr #31 + subs r8, r0, r7 + sbcs r8, r1, #0 @ mid[16] + sbcs r8, r2, #0 @ high[16] + add.w r3, r3, r8, lsr #31 + + @ 17 + ldr.w r4, [r12] + subs r8, r0, r4 + sbcs r8, r1, #0 @ mid[17] + sbcs r8, r2, #0 @ high[17] + add.w r3, r3, r8, lsr #31 + + mov.w r0, r3 + pop { r4, r5, r6, r7, r8 } + bx lr + .align 3 +fndsa_gaussian0_helper__gauss0_low: + @ This is the RCDT table from the specification. Only the low 64 bits + @ of each value are stored here; the high 8 bits are provided in + @ comments but otherwise hardcoded in the instructions above. + .word 2889422850, 4159975123 @ high: 163 + .word 1065212802, 3542816799 @ high: 84 + .word 1210696191, 2110640275 @ high: 34 + .word 3348712164, 3514123127 @ high: 10 + .word 4081000303, 2508483758 @ high: 2 + .word 3983850847, 2001389396 @ high: 0 + .word 729246436, 270851412 @ high: 0 + .word 1705862106, 27394012 @ high: 0 + .word 2323342376, 2064600 @ high: 0 + .word 2986609769, 115709 @ high: 0 + .word 617624059, 4815 @ high: 0 + @ Starting at value 11, we only store the low 32 bits. 
+ .word 2676689183 @ mid: 148 high: 0 + .word 1717414296 @ mid: 3 high: 0 + .word 247426747 @ mid: 0 high: 0 + .word 3104126 @ mid: 0 high: 0 + .word 28824 @ mid: 0 high: 0 + .word 198 @ mid: 0 high: 0 + .word 1 @ mid: 0 high: 0 + .size fndsa_gaussian0_helper,.-fndsa_gaussian0_helper diff --git a/crypto_sign/fndsa_provisional-512/m4f/sysrng.c b/crypto_sign/fndsa_provisional-512/m4f/sysrng.c new file mode 120000 index 00000000..fa7fb5a7 --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/sysrng.c @@ -0,0 +1 @@ +../../../mupq/crypto_sign/fndsa_provisional-512/ref/sysrng.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-512/m4f/util.c b/crypto_sign/fndsa_provisional-512/m4f/util.c new file mode 120000 index 00000000..6736b902 --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/util.c @@ -0,0 +1 @@ +../../../mupq/crypto_sign/fndsa_provisional-512/ref/util.c \ No newline at end of file diff --git a/crypto_sign/fndsa_provisional-512/m4f/vrfy.c b/crypto_sign/fndsa_provisional-512/m4f/vrfy.c new file mode 120000 index 00000000..da558d38 --- /dev/null +++ b/crypto_sign/fndsa_provisional-512/m4f/vrfy.c @@ -0,0 +1 @@ +../../../mupq/crypto_sign/fndsa_provisional-512/ref/vrfy.c \ No newline at end of file diff --git a/mupq b/mupq index 61850b39..27157f09 160000 --- a/mupq +++ b/mupq @@ -1 +1 @@ -Subproject commit 61850b39de4add2616b20a440a7376b6d4e396de +Subproject commit 27157f09c3e450100101ef9f1cd7167131402a3a diff --git a/skiplist.py b/skiplist.py index 4ae65098..5757114f 100644 --- a/skiplist.py +++ b/skiplist.py @@ -59,19 +59,11 @@ {'scheme': 'cross-sha3-r-sdpg-3-small', 'implementation': 'ref', 'estmemory': 776192}, {'scheme': 'cross-sha3-r-sdpg-5-fast', 'implementation': 'ref', 'estmemory': 440320}, {'scheme': 'cross-sha3-r-sdpg-5-small', 'implementation': 'ref', 'estmemory': 1063936}, - {'scheme': 'falcon-1024', 'implementation': 'clean', 'estmemory': 91136}, - {'scheme': 'falcon-1024', 'implementation': 'm4-ct', 'estmemory': 89088}, - 
{'scheme': 'falcon-1024', 'implementation': 'opt-ct', 'estmemory': 89088}, - {'scheme': 'falcon-1024', 'implementation': 'opt-leaktime', 'estmemory': 90112}, - {'scheme': 'falcon-1024-tree', 'implementation': 'opt-ct', 'estmemory': 185344}, - {'scheme': 'falcon-1024-tree', 'implementation': 'opt-leaktime', 'estmemory': 186368}, - {'scheme': 'falcon-512', 'implementation': 'clean', 'estmemory': 48128}, - {'scheme': 'falcon-512', 'implementation': 'm4-ct', 'estmemory': 46080}, - {'scheme': 'falcon-512', 'implementation': 'opt-ct', 'estmemory': 46080}, - {'scheme': 'falcon-512', 'implementation': 'opt-leaktime', 'estmemory': 47104}, - {'scheme': 'falcon-512-tree', 'implementation': 'm4-ct', 'estmemory': 90112}, - {'scheme': 'falcon-512-tree', 'implementation': 'opt-ct', 'estmemory': 90112}, - {'scheme': 'falcon-512-tree', 'implementation': 'opt-leaktime', 'estmemory': 91136}, + # skip outdated Falcon implementations from PQClean (see https://github.com/mupq/pqm4/pull/377) + {'scheme': 'falcon-1024', 'implementation': 'clean', 'estmemory': 999999999999}, + {'scheme': 'falcon-512', 'implementation': 'clean', 'estmemory': 999999999999}, + {'scheme': 'falcon-padded-1024', 'implementation': 'clean', 'estmemory': 999999999999}, + {'scheme': 'falcon-padded-512', 'implementation': 'clean', 'estmemory': 999999999999}, {'scheme': 'haetae2', 'implementation': 'm4f', 'estmemory': 60416}, {'scheme': 'haetae2', 'implementation': 'ref', 'estmemory': 59392}, {'scheme': 'haetae3', 'implementation': 'm4f', 'estmemory': 90112}, @@ -213,8 +205,6 @@ {'scheme': 'sphincs-shake-192s-simple', 'implementation': 'clean', 'estmemory': 22528}, {'scheme': 'sphincs-shake-256f-simple', 'implementation': 'clean', 'estmemory': 59392}, {'scheme': 'sphincs-shake-256s-simple', 'implementation': 'clean', 'estmemory': 38912}, - {'scheme': 'falcon-padded-1024', 'implementation': 'clean', 'estmemory': 91136}, - {'scheme': 'falcon-padded-512', 'implementation': 'clean', 'estmemory': 48128}, {'scheme': 
'ml-dsa-87', 'implementation': 'm4fstack', 'estmemory': 21504}, {'scheme': 'ml-dsa-87', 'implementation': 'm4f', 'estmemory': 129024}, {'scheme': 'ml-dsa-65', 'implementation': 'm4fstack', 'estmemory': 17408}, @@ -224,4 +214,8 @@ {'scheme': 'ml-dsa-87', 'implementation': 'clean', 'estmemory': 136192}, {'scheme': 'ml-dsa-65', 'implementation': 'clean', 'estmemory': 90112}, {'scheme': 'ml-dsa-44', 'implementation': 'clean', 'estmemory': 59392}, + {'scheme': 'fndsa_provisional-1024', 'implementation': 'ref', 'estmemory': 89088}, + {'scheme': 'fndsa_provisional-512', 'implementation': 'ref', 'estmemory': 46080}, + {'scheme': 'fndsa_provisional-1024', 'implementation': 'm4f', 'estmemory': 89088}, + {'scheme': 'fndsa_provisional-512', 'implementation': 'm4f', 'estmemory': 46080}, ]