Scalability

If a thread is busy waiting for data from a graphics card, it yields execution and another thread is switched in. This lets threads hide the I/O latency behind other work (such as computation), and it keeps scaling with more threads than the CPU has logical cores.
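
This latency hiding does not depend on the library itself; it comes from ordinary OS scheduling once more threads are launched than there are logical cores. A minimal stand-alone sketch of the idea (no library involved, with a sleep standing in for a graphics-card transfer):

#include <algorithm>
#include <atomic>
#include <chrono>
#include <iostream>
#include <thread>
#include <vector>

int main()
{
	const unsigned cores = std::max(1u, std::thread::hardware_concurrency());
	const unsigned threads = cores * 4; // oversubscription: more threads than logical cores

	std::atomic<long long> unitsDone{0};
	auto worker = [&]{
		for(int rep=0;rep<10;rep++)
		{
			// simulated I/O wait: this thread blocks, the OS switches another thread onto the core
			std::this_thread::sleep_for(std::chrono::milliseconds(5));

			// computation that runs while some other thread sits in its simulated I/O wait
			volatile long long acc = 0;
			for(int k=0;k<1000000;k++) acc = acc + k;
			unitsDone++;
		}
	};

	const auto t0 = std::chrono::steady_clock::now();
	std::vector<std::thread> pool;
	for(unsigned t=0;t<threads;t++) pool.emplace_back(worker);
	for(auto& th : pool) th.join();
	const auto t1 = std::chrono::steady_clock::now();

	std::cout<<unitsDone.load()<<" work units, "<<threads<<" threads on "<<cores<<" logical cores, "
		<<std::chrono::duration_cast<std::chrono::milliseconds>(t1-t0).count()<<" ms"<<std::endl;
	return 0;
}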

Testing an 8-logical-core CPU with the cache-bandwidth algorithm below:

Output:

init: 1697040785 nanoseconds     (bandwidth = 2172.25 MB/s)     
2 threads read: 7028960426 nanoseconds     (bandwidth = 53704.58 MB/s)      (throughput = 0.15 nanoseconds per iteration) 
4 threads read: 3920124368 nanoseconds     (bandwidth = 96294.74 MB/s)      (throughput = 0.08 nanoseconds per iteration) 
6 threads read: 3588529510 nanoseconds     (bandwidth = 105192.77 MB/s)      (throughput = 0.08 nanoseconds per iteration) 
8 threads read: 3124735758 nanoseconds     (bandwidth = 120806.17 MB/s)      (throughput = 0.07 nanoseconds per iteration) 
10 threads read: 2940791214 nanoseconds     (bandwidth = 128362.52 MB/s)      (throughput = 0.06 nanoseconds per iteration) 
12 threads read: 2910410330 nanoseconds     (bandwidth = 129702.45 MB/s)      (throughput = 0.06 nanoseconds per iteration) 
14 threads read: 2866475612 nanoseconds     (bandwidth = 131690.41 MB/s)      (throughput = 0.06 nanoseconds per iteration) 
16 threads read: 2811449525 nanoseconds     (bandwidth = 134267.88 MB/s)      (throughput = 0.06 nanoseconds per iteration) 
18 threads read: 2808361785 nanoseconds     (bandwidth = 134415.50 MB/s)      (throughput = 0.06 nanoseconds per iteration) 
20 threads read: 2798188690 nanoseconds     (bandwidth = 134904.18 MB/s)      (throughput = 0.06 nanoseconds per iteration) 
22 threads read: 2764898330 nanoseconds     (bandwidth = 136528.48 MB/s)      (throughput = 0.06 nanoseconds per iteration) 
24 threads read: 2762656276 nanoseconds     (bandwidth = 136639.28 MB/s)      (throughput = 0.06 nanoseconds per iteration) 
26 threads read: 2747808082 nanoseconds     (bandwidth = 137377.63 MB/s)      (throughput = 0.06 nanoseconds per iteration) 
28 threads read: 2765609427 nanoseconds     (bandwidth = 136493.37 MB/s)      (throughput = 0.06 nanoseconds per iteration) 
30 threads read: 2740091219 nanoseconds     (bandwidth = 137764.52 MB/s)      (throughput = 0.06 nanoseconds per iteration) 
32 threads read: 2727705705 nanoseconds     (bandwidth = 138390.06 MB/s)      (throughput = 0.06 nanoseconds per iteration) 
34 threads read: 2720381549 nanoseconds     (bandwidth = 138762.65 MB/s)      (throughput = 0.06 nanoseconds per iteration) 
36 threads read: 2711926240 nanoseconds     (bandwidth = 139195.29 MB/s)      (throughput = 0.06 nanoseconds per iteration) 
38 threads read: 2723528360 nanoseconds     (bandwidth = 138602.32 MB/s)      (throughput = 0.06 nanoseconds per iteration) 
40 threads read: 2751352638 nanoseconds     (bandwidth = 137200.65 MB/s)      (throughput = 0.06 nanoseconds per iteration) 
42 threads read: 2757359368 nanoseconds     (bandwidth = 136901.76 MB/s)      (throughput = 0.06 nanoseconds per iteration) 
44 threads read: 2702749684 nanoseconds     (bandwidth = 139667.90 MB/s)      (throughput = 0.06 nanoseconds per iteration) 
46 threads read: 2713890874 nanoseconds     (bandwidth = 139094.52 MB/s)      (throughput = 0.06 nanoseconds per iteration) 
48 threads read: 2763985401 nanoseconds     (bandwidth = 136573.57 MB/s)      (throughput = 0.06 nanoseconds per iteration) 
50 threads read: 2708934419 nanoseconds     (bandwidth = 139349.02 MB/s)      (throughput = 0.06 nanoseconds per iteration) 
52 threads read: 2732242673 nanoseconds     (bandwidth = 138160.26 MB/s)      (throughput = 0.06 nanoseconds per iteration) 
54 threads read: 2742502564 nanoseconds     (bandwidth = 137643.39 MB/s)      (throughput = 0.06 nanoseconds per iteration) 
56 threads read: 2724952764 nanoseconds     (bandwidth = 138529.87 MB/s)      (throughput = 0.06 nanoseconds per iteration) 
58 threads read: 2709850386 nanoseconds     (bandwidth = 139301.92 MB/s)      (throughput = 0.06 nanoseconds per iteration) 
60 threads read: 2725409629 nanoseconds     (bandwidth = 138506.65 MB/s)      (throughput = 0.06 nanoseconds per iteration) 
62 threads read: 2751607134 nanoseconds     (bandwidth = 137187.96 MB/s)      (throughput = 0.06 nanoseconds per iteration) 
64 threads read: 2725538887 nanoseconds     (bandwidth = 138500.08 MB/s)      (throughput = 0.06 nanoseconds per iteration) 
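
The bandwidth column is the byte count handed to CpuBenchmarker divided by the measured wall time. For example, the 8-thread figure can be reproduced by hand; the snippet below assumes sizeof(Object::id) is 8 bytes, matching the source further down:

#include <cstdio>

int main()
{
	const long long n = 1024ll*1000*90;                       // elements, as in the source below
	const int nL1CacheElements = 512;                         // id comparisons per element
	const double bytes = double(n) * 8.0 * nL1CacheElements;  // 8 = sizeof(Object::id), assumed
	const double seconds = 3.124735758;                       // 8-thread wall time from the output above
	std::printf("bandwidth = %.2f MB/s\n", bytes / seconds / 1e6); // prints roughly 120806 MB/s
	return 0;
}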

Source:

#include "GraphicsCardSupplyDepot.h"
#include "VirtualMultiArray.h"
#include "PcieBandwidthBenchmarker.h"
#include "CpuBenchmarker.h"

// testing
#include <iostream>
#include "omp.h"


// test element: a size_t id plus a 32-byte character payload
struct Object
{
	size_t id;
	char str[32];
	Object():id(-1){}
	Object(size_t idPrm):id(idPrm){}
};
int main()
{
	const long long n = 1024ll*1000*90;


	// n elements of virtualized array data, spread across all graphics cards found by the depot
	VirtualMultiArray<Object> test(n,GraphicsCardSupplyDepot().requestGpus(),1024*5,1,PcieBandwidthBenchmarker().bestBandwidth(20));

	const int nL1CacheElements = 512;
	const int chunkSize = 1024;
	{
		// initialization: write each element's own index into its id field
		CpuBenchmarker bench(n*sizeof(Object),"init");
		#pragma omp parallel for
		for(long long i=0;i<n;i+=chunkSize)
		{

			test.mappedReadWriteAccess(i,chunkSize,[&,i](Object * ptr){

				for(long long j=0;j<chunkSize;j++)
					{
						ptr[j+i] = Object(j+i);
					}
			},false,false,true);
		}

	}

	// read benchmark with 2..64 threads: each element's id is compared against
	// nL1CacheElements sentinel values kept in L1 cache; any match indicates a wrongly stored element
	for(int t=1;t<=32;t++)
	{
		const int thr = t*2;
		CpuBenchmarker bench(n*sizeof(Object::id)*nL1CacheElements,std::to_string(thr)+std::string(" threads read"),n*nL1CacheElements);

		#pragma omp parallel for num_threads(thr)
		for(long long i=0;i<n;i+=chunkSize)
		{
			// per-thread, L1-resident buffer of sentinel values that never equal a valid id
			alignas(32)
			size_t cache[nL1CacheElements];
			for(int e=0;e<nL1CacheElements;e++)
			{
				cache[e]=-e-1;
			}
			test.mappedReadWriteAccess(i,chunkSize,[&,i](Object * ptr){

				for(long long j=0;j<chunkSize;j++)
				{
					Object& o = ptr[j+i];

					// scan through L1 cached content
					alignas(32)
					int ctr[8]={0,0,0,0,0,0,0,0};

					for(int k=0;k<nL1CacheElements;k+=8)
					{
						ctr[0] += (o.id == cache[k]);
						ctr[1] += (o.id == cache[k+1]);
						ctr[2] += (o.id == cache[k+2]);
						ctr[3] += (o.id == cache[k+3]);
						ctr[4] += (o.id == cache[k+4]);
						ctr[5] += (o.id == cache[k+5]);
						ctr[6] += (o.id == cache[k+6]);
						ctr[7] += (o.id == cache[k+7]);
					}
					if(ctr[0]+ctr[1]+ctr[2]+ctr[3]+ctr[4]+ctr[5]+ctr[6]+ctr[7]>0)
					{
						std::cout<<" Error: id="<<o.id<<std::endl;
					}
				}
			},false,true,false);
		}
	}
	return 0;
}

Increasing nL1CacheElements to 1024 or 2048 makes the cache-bandwidth measurement more accurate, since more of each element's time is spent on L1 reads, but the I/O overlapping gets worse. Tuning the page size and the chunk size also changes the behavior.
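
As a starting point for that tuning, the three knobs can be pulled into named constants. The sketch below changes only the setup; it assumes that the third VirtualMultiArray constructor argument is the page size in elements, and it omits the unchanged init and read loops:

#include "GraphicsCardSupplyDepot.h"
#include "VirtualMultiArray.h"
#include "PcieBandwidthBenchmarker.h"

struct Object
{
	size_t id;
	char str[32];
	Object():id(-1){}
	Object(size_t idPrm):id(idPrm){}
};

int main()
{
	const long long n = 1024ll*1000*90;
	const int pageSize = 1024*5;        // elements per virtual page (assumed meaning of the 3rd constructor argument)
	const int chunkSize = 1024;         // elements mapped per mappedReadWriteAccess call
	const int nL1CacheElements = 1024;  // raising 512 to 1024/2048 weights the benchmark toward L1 reads

	VirtualMultiArray<Object> test(n,GraphicsCardSupplyDepot().requestGpus(),pageSize,1,PcieBandwidthBenchmarker().bestBandwidth(20));

	// ... same init and read loops as in the source above, using chunkSize and nL1CacheElements
	return 0;
}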