Scalability
If a thread is busy waiting for data from a graphics card, it yields execution and the scheduler switches in another thread. This hides the I/O latency behind other work (such as computation) and keeps scaling even with more threads than the CPU's number of logical cores.
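As a minimal, self-contained sketch of this effect (it does not use the library; the sleep below only stands in for a blocking PCIe transfer wait), oversubscribing an OpenMP loop beyond the logical core count keeps the cores busy while some threads are blocked:

#include <omp.h>
#include <chrono>
#include <thread>
#include <iostream>

int main()
{
    const int logicalCores = omp_get_max_threads();

    // Try 1x, 2x and 4x the logical core count.
    for (int thr = logicalCores; thr <= 4 * logicalCores; thr *= 2)
    {
        auto t0 = std::chrono::steady_clock::now();

        #pragma omp parallel for num_threads(thr)
        for (int i = 0; i < 256; i++)
        {
            // Simulated I/O wait: this thread blocks, so the OS can run another one.
            std::this_thread::sleep_for(std::chrono::milliseconds(5));

            // Simulated computation that overlaps other threads' waits.
            volatile double x = 0;
            for (int j = 0; j < 1000000; j++) x += j * 0.5;
        }

        auto t1 = std::chrono::steady_clock::now();
        std::cout << thr << " threads: "
                  << std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0).count()
                  << " ms" << std::endl;
    }
    return 0;
}

Because the waits of some threads overlap the computation of others, total time keeps dropping well past the physical core count, which is the same behavior seen in the benchmark below.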
Test on an 8-logical-core CPU with the cache-bandwidth algorithm (source below).
Output:
init: 1697040785 nanoseconds (bandwidth = 2172.25 MB/s)
2 threads read: 7028960426 nanoseconds (bandwidth = 53704.58 MB/s) (throughput = 0.15 nanoseconds per iteration)
4 threads read: 3920124368 nanoseconds (bandwidth = 96294.74 MB/s) (throughput = 0.08 nanoseconds per iteration)
6 threads read: 3588529510 nanoseconds (bandwidth = 105192.77 MB/s) (throughput = 0.08 nanoseconds per iteration)
8 threads read: 3124735758 nanoseconds (bandwidth = 120806.17 MB/s) (throughput = 0.07 nanoseconds per iteration)
10 threads read: 2940791214 nanoseconds (bandwidth = 128362.52 MB/s) (throughput = 0.06 nanoseconds per iteration)
12 threads read: 2910410330 nanoseconds (bandwidth = 129702.45 MB/s) (throughput = 0.06 nanoseconds per iteration)
14 threads read: 2866475612 nanoseconds (bandwidth = 131690.41 MB/s) (throughput = 0.06 nanoseconds per iteration)
16 threads read: 2811449525 nanoseconds (bandwidth = 134267.88 MB/s) (throughput = 0.06 nanoseconds per iteration)
18 threads read: 2808361785 nanoseconds (bandwidth = 134415.50 MB/s) (throughput = 0.06 nanoseconds per iteration)
20 threads read: 2798188690 nanoseconds (bandwidth = 134904.18 MB/s) (throughput = 0.06 nanoseconds per iteration)
22 threads read: 2764898330 nanoseconds (bandwidth = 136528.48 MB/s) (throughput = 0.06 nanoseconds per iteration)
24 threads read: 2762656276 nanoseconds (bandwidth = 136639.28 MB/s) (throughput = 0.06 nanoseconds per iteration)
26 threads read: 2747808082 nanoseconds (bandwidth = 137377.63 MB/s) (throughput = 0.06 nanoseconds per iteration)
28 threads read: 2765609427 nanoseconds (bandwidth = 136493.37 MB/s) (throughput = 0.06 nanoseconds per iteration)
30 threads read: 2740091219 nanoseconds (bandwidth = 137764.52 MB/s) (throughput = 0.06 nanoseconds per iteration)
32 threads read: 2727705705 nanoseconds (bandwidth = 138390.06 MB/s) (throughput = 0.06 nanoseconds per iteration)
34 threads read: 2720381549 nanoseconds (bandwidth = 138762.65 MB/s) (throughput = 0.06 nanoseconds per iteration)
36 threads read: 2711926240 nanoseconds (bandwidth = 139195.29 MB/s) (throughput = 0.06 nanoseconds per iteration)
38 threads read: 2723528360 nanoseconds (bandwidth = 138602.32 MB/s) (throughput = 0.06 nanoseconds per iteration)
40 threads read: 2751352638 nanoseconds (bandwidth = 137200.65 MB/s) (throughput = 0.06 nanoseconds per iteration)
42 threads read: 2757359368 nanoseconds (bandwidth = 136901.76 MB/s) (throughput = 0.06 nanoseconds per iteration)
44 threads read: 2702749684 nanoseconds (bandwidth = 139667.90 MB/s) (throughput = 0.06 nanoseconds per iteration)
46 threads read: 2713890874 nanoseconds (bandwidth = 139094.52 MB/s) (throughput = 0.06 nanoseconds per iteration)
48 threads read: 2763985401 nanoseconds (bandwidth = 136573.57 MB/s) (throughput = 0.06 nanoseconds per iteration)
50 threads read: 2708934419 nanoseconds (bandwidth = 139349.02 MB/s) (throughput = 0.06 nanoseconds per iteration)
52 threads read: 2732242673 nanoseconds (bandwidth = 138160.26 MB/s) (throughput = 0.06 nanoseconds per iteration)
54 threads read: 2742502564 nanoseconds (bandwidth = 137643.39 MB/s) (throughput = 0.06 nanoseconds per iteration)
56 threads read: 2724952764 nanoseconds (bandwidth = 138529.87 MB/s) (throughput = 0.06 nanoseconds per iteration)
58 threads read: 2709850386 nanoseconds (bandwidth = 139301.92 MB/s) (throughput = 0.06 nanoseconds per iteration)
60 threads read: 2725409629 nanoseconds (bandwidth = 138506.65 MB/s) (throughput = 0.06 nanoseconds per iteration)
62 threads read: 2751607134 nanoseconds (bandwidth = 137187.96 MB/s) (throughput = 0.06 nanoseconds per iteration)
64 threads read: 2725538887 nanoseconds (bandwidth = 138500.08 MB/s) (throughput = 0.06 nanoseconds per iteration)
Source:
#include "GraphicsCardSupplyDepot.h"
#include "VirtualMultiArray.h"
#include "PcieBandwidthBenchmarker.h"
#include "CpuBenchmarker.h"
// testing
#include <iostream>
#include "omp.h"
struct Object
{
size_t id;
char str[32];
Object():id(-1){}
Object(size_t idPrm):id(idPrm){}
};
int main()
{
const long long n = 1024ll*1000*90;
VirtualMultiArray<Object> test(n,GraphicsCardSupplyDepot().requestGpus(),1024*5,1,PcieBandwidthBenchmarker().bestBandwidth(20));
const int nL1CacheElements = 512;
const int chunkSize = 1024;
{
CpuBenchmarker bench(n*sizeof(Object),"init");
#pragma omp parallel for
for(long long i=0;i<n;i+=chunkSize)
{
test.mappedReadWriteAccess(i,chunkSize,[&,i](Object * ptr){
for(long long j=0;j<chunkSize;j++)
{
ptr[j+i] = Object(j+i);
}
},false,false,true);
}
}
for(int t=1;t<=32;t++)
{
const int thr = t*2;
CpuBenchmarker bench(n*sizeof(Object::id)*nL1CacheElements,std::to_string(thr)+std::string(" threads read"),n*nL1CacheElements);
#pragma omp parallel for num_threads(thr)
for(long long i=0;i<n;i+=chunkSize)
{
alignas(32)
size_t cache[nL1CacheElements];
for(int i=0;i<nL1CacheElements;i++)
{
cache[i]=-i-1;
}
test.mappedReadWriteAccess(i,chunkSize,[&,i](Object * ptr){
for(long long j=0;j<chunkSize;j++)
{
Object& o = ptr[j+i];
// scan through L1 cached content
alignas(32)
int ctr[8]={0,0,0,0,0,0,0,0};
for(int k=0;k<nL1CacheElements;k+=8)
{
ctr[0] += (o.id == cache[k]);
ctr[1] += (o.id == cache[k+1]);
ctr[2] += (o.id == cache[k+2]);
ctr[3] += (o.id == cache[k+3]);
ctr[4] += (o.id == cache[k+4]);
ctr[5] += (o.id == cache[k+5]);
ctr[6] += (o.id == cache[k+6]);
ctr[7] += (o.id == cache[k+7]);
}
if(ctr[0]+ctr[1]+ctr[2]+ctr[3]+ctr[4]+ctr[5]+ctr[6]+ctr[7]>0)
{
std::cout<<" Error: id="<<o.id<<std::endl;
}
}
},false,true,false);
}
}
return 0;
}
Increasing the nL1CacheElements value to 1024 or 2048 gives a better cache-bandwidth measurement, but the I/O overlapping gets worse. Tuning the page size and chunk size also changes the behavior, as sketched below.
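A hedged sketch of those tuning knobs, reusing the constructor call from the source above (the concrete values and the parameter descriptions in the comments are illustrative assumptions, not recommendations from the library):

#include "GraphicsCardSupplyDepot.h"
#include "VirtualMultiArray.h"
#include "PcieBandwidthBenchmarker.h"

// Same element type as in the benchmark source above.
struct Object
{
    size_t id;
    char str[32];
    Object():id(-1){}
    Object(size_t idPrm):id(idPrm){}
};

int main()
{
    const long long n = 1024ll*1000*90;

    // Illustrative values only (assumed meanings in the comments):
    const int pageSize = 1024*10;   // elements per page; the benchmark above used 1024*5
    const int activePages = 4;      // the benchmark above passed 1 here
    // In the access loops, chunkSize (e.g. 4096) and nL1CacheElements (e.g. 2048)
    // would be raised in the same spirit, trading I/O overlap for measured cache bandwidth.

    VirtualMultiArray<Object> test(n,
            GraphicsCardSupplyDepot().requestGpus(),
            pageSize,
            activePages,
            PcieBandwidthBenchmarker().bestBandwidth(20));

    return 0;
}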