-
Notifications
You must be signed in to change notification settings - Fork 0
/
triple.cu
274 lines (235 loc) · 13.7 KB
/
triple.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
#include <iostream>
#include <cmath>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <thrust/partition.h>
using namespace std;
// 2^64-1 - if you want to fix the overflow issues you'll have to somehow cap values that overflow this.
#define absmax 18446744073709551615
// This is generally a mouthful to write.
#define pr thrust::pair<uint32_t, uint32_t>
/* ------------------------------------------------------------------------------------------------------------------
*
* TRIPLE.CU version 1.0 written 2024-01-20 =========================================================================
*
* This is a program to find triples of k-gonal numbers that have the property D(n).
* Authored by Christian Zhou-Zheng, with help from Sounak Bagchi.
* Created for number theory research at Euler Circle under Simon Rubenstein-Salzedo.
*
* USER GUIDE -------------------------------------------------------------------------------------------------------
*
* Everything that should be altered during use is in the "PARAMETERS" section below.
* Change the parameters k and n accordingly.
* Change the parameters abound, bbound, and cbound to change the search space.
* - Practical bounds on each (you run out of memory) are 10^5 for abound and bbound, and 10^10 for cbound.
* You can also change the number of blocks and threads per block for your GPU.
* When running this from the command line, on Linux (which you have to be on to run this as far as I'm aware), you
* can put "time" in front of it to benchmark runtime.
*
* IMPLEMENTATION NOTES ---------------------------------------------------------------------------------------------
*
* The program is split into two parts: checkPair and findc. checkPair takes an a and a b and checks if they satisfy
* D(n). If they do, it outputs the pair (a,b) to a thrust::device_vector. This section is known to work.
* findc takes the output of checkPair and checks if the pair (a,b) satisfies D(n) with a c within cbound. If it does,
* it outputs c to a thrust::device_vector. This section is known to have issues.
* There's a bit of intermediate processing in main() because due to how CUDA multithreading works, you can't have a
* dynamically resized output like a vector (even if thrust::device_vector masquerades as one, you can only modify)
* it from host code). So I have a full array of size abound*bbound (which is what causes the memory overflow that
* locks abound and bbound at 10^5) which has one position for each potential pair, each of which only gets written
* to if necessary. This results in a lot of waste that I can't figure out how to reduce. I do use
* thrust::stable_partition to remove all the pairs with zeros, creating a smaller thrust::device_vector that I can
* pass to findc, but I don't know how to reduce the size in checkPair.
*
* KNOWN BUGS/NOTES -------------------------------------------------------------------------------------------------
*
* I haven't actually tested the program for n != 1. It should work regardless.
* Very large values of c/cbound will cause overflow issues with the eventual output. This makes no sense to me
* but it is what it is, and I think the <= in the final "print to stdout" section in main() fixes it.
* Again, 10^5 is the practical limit for abound and bbound, due to the reasons detailed above. This can be fixed.
*
* TODOS ------------------------------------------------------------------------------------------------------------
*
* Ensure the program works for n != 1.
* Fix the overflow issues with values of c.
* Allow for a larger search space on abound and bbound by reducing the size of the array written to. Look into
* stream compaction. From Lemon#3040 on Discord:
* "Youll have to implement your own resizable vector
* The two choices is to either allocate what you are certain to be enough, then push the pairs there (if you dont
* care about ordering then you can simply increment an atomic, it wont hurt performance(at least if you
* coalesce them by doing warp vote count, then have leader do the atomic)), and if it wasnt enough, you will
* have to reallocate and run it again
* The problem is that you cant really do allocations from gpu
* You can preallocate a large buffer and then write a custom allocator, but it has its own problems
* So yea, resizable vectors are a lie
* Just get the upper bounds on output size, then use an atomic increment"
* In particular, see https://stackoverflow.com/questions/34059753/cuda-stream-compaction-algorithm.
*
* ------------------------------------------------------------------------------------------------------------------
* Below here, change your parameters.
// PARAMETERS ---------------------------------------------------------------------------------------------------- */
#define k 7 // DEFAULT k=3 5 7 9
#define n 9 // DEFAULT n=1 or 9 for heptagonals
#define abound 1000 // DEFAULT 1000
#define bbound 1000 // DEFAULT 1000
#define cbound 10000000 // DEFAULT 10000000
#define boxsize 200 // DEFAULT 200 LIMIT 1024
#define threadsize 1000 // DEFAULT 1000 LIMIT 1024
// SQUARE-DETERMINING -----------------------------------------------------------------------------------------------
// Thanks to Norbert Juffa for this section, https://forums.developer.nvidia.com/t/integer-square-root/198642.
// This is imported code to quickly determine the floored integer square root of an integer.
/*
Copyright (c) 2021, Norbert Juffa
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Fast multiplication of two unsigned 32-bit integers with 64-bit result.
__device__ unsigned long long int umul_wide (unsigned int a, unsigned int b) {
unsigned long long int r;
asm ("mul.wide.u32 %0,%1,%2;\n\t" : "=l"(r) : "r"(a), "r"(b));
return r;
}
// Compute the integer square root of a 64-bit unsigned integer by multiplying it with its inverse square root
// (which can be computed much faster), then applying one Newton-Raphson iteration and flooring.
__device__ uint64_t isqrtll (uint64_t a) {
uint64_t rem, arg;
uint32_t b, r, s, scal;
arg = a;
// Normalize argument
scal = __clzll (a) & ~1;
a = a << scal;
b = a >> 32;
// Approximate rsqrt accurately. Make sure it's an underestimate!
float fb, fr;
fb = (float)b;
asm ("rsqrt.approx.ftz.f32 %0,%1; \n\t" : "=f"(fr) : "f"(fb));
r = (uint32_t) fmaf (1.407374884e14f, fr, -438.0f);
// Compute sqrt(a) as a * rsqrt(a).
s = __umulhi (r, b);
// NR iteration combined with back multiply.
s = s * 2;
rem = a - umul_wide (s, s);
r = __umulhi ((uint32_t)(rem >> 32) + 1, r);
s = s + r;
// Denormalize result.
s = s >> (scal >> 1);
// Make sure we get the floor correct; can be off by one to either side.
rem = arg - umul_wide (s, s);
if ((int64_t)rem < 0) s--;
else if (rem >= ((uint64_t)s * 2 + 1)) s++;
return (arg == 0) ? 0 : s;
}
/* ------------------------------------------------------------------------------------------------------------------
* My code starts here. Don't touch getKGonal or checkPair, but change findc and mess with types as needed (long long
* vs. uint64_t in particular). Todo: fix the overflow issues.
* --------------------------------------------------------------------------------------------------------------- */
// General formula for the k-gonal numbers. Do not touch the logic (unless I did something REALLY dumb), but change the type as needed.
__device__ long long int getKGonal(long long a) { return (k % 2 == 1) ? a*((k-2)*a+(4-k))/2 : a*((k/2-1)*a-((k/2-1)-1))/2; }
// Check if a is a perfect square. Potential issues here.
__device__ bool isPerfectSquare(uint64_t a) {
uint64_t s = isqrtll(a);
// Check whether the square of its integer square root is itself.
return s*s == a;
}
// Each block takes an a-value, then the threads within that block split up b.
__global__ void checkPair(pr *out) {
int aidx = blockIdx.x+1;
int bidx = threadIdx.x;
for(int a = aidx; a <= abound; a += boxsize) {
if(a==0) continue;
// This is not very efficient! You lose a couple of threads per block (depending on parameters and search bounds).
for(int b = a+bidx; b <= bbound; b += threadsize) {
long long int aprime = getKGonal(a);
long long int bprime = getKGonal(b);
// Write the pair to the output array if it satisfies D(n).
if(isPerfectSquare(aprime * bprime + n)) out[(a-1)*bbound + b] = thrust::pair(a,b);
}
}
}
// We have a and b already, so check c-values.
// Each block takes a pair (a, b) in in, and each thread takes a subset of the search space for c.
__global__ void findc(thrust::device_vector<pr> in, long long int *out) {
for(int q = blockIdx.x; q <= in.size(); q += boxsize) {
// Grab the pair - for some reason you can't oneline this.
pr p = in[q];
int a = p.first;
int b = p.second;
// The "if(x==0) continue;" statements are here to avoid issues with values equalling 0.
// I don't know whether I still need them but I worry the program will fall apart if I remove them.
if(a==0 || b==0) continue;
long long int aprime = getKGonal(a);
long long int bprime = getKGonal(b);
for(long long int c = b+threadIdx.x; c <= cbound; c += threadsize) {
if(c==0) continue;
// The overflow issues probably kick in around here. This is where you should test for the overflow issues.
long long int cprime = getKGonal(c);
if(isPerfectSquare(aprime * cprime + n) && isPerfectSquare(cprime * bprime + n)) out[q] = c;
}
}
}
// Predicate for thrust::stable_partition to remove all pairs with zeros below.
struct is_nonzero {
__host__ __device__
bool operator()(const pr x) { return x.first != 0 && x.second != 0; }
};
// MAIN -------------------------------------------------------------------------------------------------------------
int main()
{
// I wish there was an easier way to do this... Standard memory allocation.
// I don't actually know if I need to instantiate an array on the host side.
// Check that out if you try to fix the memory overflow issues.
pr *out, *d_out;
out = (pr *)malloc(abound*bbound*sizeof(pr));
cudaMalloc((void **)&d_out, abound*bbound*sizeof(pr));
cudaMemcpy(d_out, out, abound*bbound*sizeof(pr), cudaMemcpyHostToDevice);
// Run the first part of the program.
checkPair<<<boxsize, threadsize>>>(d_out);
// Absolutely need to wait for that to finish running before continuing.
cudaDeviceSynchronize();
// THE CODE IS DEFINITELY GOOD UP TO HERE. Past here, uh, no promises.
// Grab the pointer from the output from the last part and use it to create a thrust::device_vector with the values.
thrust::device_ptr<pr> d_ptr(d_out);
thrust::device_vector<pr> d_vec(d_ptr, d_ptr + abound*bbound);
// Sorts the vector putting all nonzero pairs first and nonzero pairs last, and returns an iterator to the first zero.
thrust::device_vector<pr>::iterator new_end = thrust::stable_partition(d_vec.begin(), d_vec.end(), is_nonzero());
// Create a new thrust::device_vector with exactly the nonzero pairs.
thrust::device_vector<pr> d_vec2(d_vec.begin(), new_end);
// THE CODE IS VERY NEARLY DEFINITELY GOOD UP TO HERE.
// Second verse, same as the first. Standard memory allocation, but do I need to allocate memory on the host?
long long int *out2, *d_out2;
out2 = (long long int *)malloc(d_vec2.size()*sizeof(long long int));
cudaMalloc((void **)&d_out2, d_vec2.size()*sizeof(long long int));
cudaMemcpy(d_out2, out2, d_vec2.size()*sizeof(long long int), cudaMemcpyHostToDevice);
// Run the second part of the program.
findc<<<boxsize, threadsize>>>(d_vec2, d_out2);
// Absolutely need to wait for that to finish running before continuing.
cudaDeviceSynchronize();
// Copy the output back to the host since we don't need to fanagle with intermediate processing again.
cudaMemcpy(out2, d_out2, d_vec2.size()*sizeof(long long int), cudaMemcpyDeviceToHost);
// Print final result to stdout.
for(int i = 0; i < d_vec2.size(); i++) {
// I'm PRETTY sure this line combats the overflow (the less-than in particular) but I'm not 100% sure.
if(out2[i] <= 0) continue;
pr p = d_vec2[i];
cout << p.first << " " << p.second << " " << out2[i] << std::endl;
}
return 0;
}