Overlapped read performance

Generally, overlapping scalar/bulk reads/writes lets the (LRU) page cache be reused, which increases effective bandwidth; overlapping too much, however, causes page-lock contention. The following program tests the performance of all read methods:

#include "GraphicsCardSupplyDepot.h"
#include "VirtualMultiArray.h"
#include "PcieBandwidthBenchmarker.h"

// testing
#include <iostream>
#include <omp.h>


class Obj
{
public:
	Obj() { b = -1; }
	Obj(int i) { b = i; }
	int b;
	char buf[100];
};

int main(int argC, char** argV)
{

	std::cout << "preparing virtual array..." << std::endl;
	size_t n = 1000000;
	size_t p = 1000;
	GraphicsCardSupplyDepot gpu;
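	// virtual array of n elements with page size p = 1000 elements;
	// see the library docs for the meaning of the remaining constructor parameters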
	VirtualMultiArray<Obj> arr(n, gpu.requestGpus(), p, 50, { 4,4,4,4,4,4 }, VirtualMultiArray<Obj>::MemMult::UseDefault, true, true);

	std::cout << "initializing data..." << std::endl;
#pragma omp parallel for
	for (int i = 0; i < n; i++)
	{
		arr.set(i, Obj(i));
	}
	std::cout << "non-overlapped n = " << sizeof(Obj) * n << " bytes" << std::endl;
	std::cout << "overlapped n x 1000 = " << sizeof(Obj) * n * 1000 << " bytes" << std::endl;
	{
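		// element-by-element get(): each thread scans 1000 consecutive elements starting at its own index,
		// so neighboring threads read overlapping windows and almost always hit the LRU page cache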
		std::cout << "<overlapped> get() x n x 1000:    " << std::flush;
		std::chrono::milliseconds t1 = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch());
#pragma omp parallel for
		for (int i = 1000; i < n - 1000; i++)
		{
			for (int j = 0; j < 1000; j++)
			{
				if (arr.get(i + j).b != i + j)
				{
					std::cout << "err" << std::endl;
				}
			}
		}
		std::chrono::milliseconds t2 = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch());
		std::cout << t2.count() - t1.count() << "ms" << std::endl;
		std::cout<<"Cache hit ratio="<<arr.getTotalCacheHitRatio()<<std::endl;
		arr.resetTotalCacheHitRatio();
	}

	{
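		// bulk read: readOnlyGetN() returns 1000 consecutive elements per call instead of one element per get()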
		std::cout << "<overlapped> readOnlyGetN() x n:    " << std::flush;
		std::chrono::milliseconds t1 = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch());
#pragma omp parallel for
		for (int i = 1000; i < n - 1000; i++)
		{
			auto data = arr.readOnlyGetN(i, 1000);
			for (int j = 0; j < 1000; j++)
			{
				if (data[j].b != i + j)
				{
					std::cout << "err" << std::endl;
				}
			}
		}
		std::chrono::milliseconds t2 = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch());
		std::cout << t2.count() - t1.count() << "ms" << std::endl;
		std::cout<<"Cache hit ratio="<<arr.getTotalCacheHitRatio()<<std::endl;
		arr.resetTotalCacheHitRatio();
	}

	{
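		// mapped access: the lambda receives a pointer indexed with absolute element indices (ptr[i] .. ptr[i + 999]);
		// the trailing boolean flags are mappedReadWriteAccess options (here presumably a read-only mapping, since the loop only reads)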
		std::cout << "<overlapped> mappedReadWriteAccess() x n:    " << std::flush;
		std::chrono::milliseconds t1 = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch());
#pragma omp parallel for
		for (int i = 1000; i < n - 1000; i++)
		{
			arr.mappedReadWriteAccess(i, 1000, [&](Obj* ptr)
				{
					for (int j = i; j < i + 1000; j++)
					{
						if (ptr[j].b != j)
						{
							std::cout << "err" << std::endl;
						}
					}
				}, false, true, false);
		}
		std::chrono::milliseconds t2 = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch());
		std::cout << t2.count() - t1.count() << "ms" << std::endl;
		std::cout<<"Cache hit ratio="<<arr.getTotalCacheHitRatio()<<std::endl;
		arr.resetTotalCacheHitRatio();
	}


	{
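		// same mapped access, but a per-thread scratch buffer (tmp) is handed in as userPtr,
		// so the mapping re-uses it instead of allocating an internal buffer on every call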
		std::cout << "<overlapped> mappedReadWriteAccess(userPtr) x n:    " << std::flush;
		std::chrono::milliseconds t1 = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch());
		std::vector<Obj> tmp(1000 * omp_get_max_threads());

#pragma omp parallel for
		for (int i = 1000; i < n - 1000; i++)
		{
			int threadId = omp_get_thread_num();
			arr.mappedReadWriteAccess(i, 1000, [&](Obj* ptr)
				{
					for (int j = i; j < i + 1000; j++)
					{
						if (ptr[j].b != j)
						{
							std::cout << "err" << std::endl;
						}
					}
				}, false, true, false, tmp.data() + (threadId * 1000));
		}
		std::chrono::milliseconds t2 = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch());
		std::cout << t2.count() - t1.count() << "ms" << std::endl;
		std::cout<<"Cache hit ratio="<<arr.getTotalCacheHitRatio()<<std::endl;
		arr.resetTotalCacheHitRatio();
	}


	{
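		// variable-length version: each call maps between 1000 and 1999 elements (1000 + i % 1000)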
		std::cout << "<overlapped, variable length> mappedReadWriteAccess() x n:    " << std::flush;
		std::chrono::milliseconds t1 = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch());
#pragma omp parallel for
		for (int i = 1000; i < n - 2000; i++)
		{
			arr.mappedReadWriteAccess(i, 1000 + i % 1000, [&](Obj* ptr)
				{
					for (int j = i; j < i + 1000 + i % 1000; j++)
					{
						if (ptr[j].b != j)
						{
							std::cout << "err" << std::endl;
						}
					}
				}, false, true, false);
		}
		std::chrono::milliseconds t2 = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch());
		std::cout << t2.count() - t1.count() << "ms" << std::endl;
		std::cout<<"Cache hit ratio="<<arr.getTotalCacheHitRatio()<<std::endl;
		arr.resetTotalCacheHitRatio();
	}

	{
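		// variable-length + userPtr: the per-thread scratch buffer holds 2000 elements to cover the largest mapping (1999 elements)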
		std::cout << "<overlapped, variable length> mappedReadWriteAccess(userPtr) x n:    " << std::flush;
		std::chrono::milliseconds t1 = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch());
		std::vector<Obj> tmp(2000 * omp_get_max_threads());
#pragma omp parallel for
		for (int i = 1000; i < n - 2000; i++)
		{
			int threadId = omp_get_thread_num();
			arr.mappedReadWriteAccess(i, 1000 + i % 1000, [&](Obj* ptr)
				{
					for (int j = i; j < i + 1000 + i % 1000; j++)
					{
						if (ptr[j].b != j)
						{
							std::cout << "err" << std::endl;
						}
					}
				}, false, true, false, tmp.data() + (threadId * 2000));
		}
		std::chrono::milliseconds t2 = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch());
		std::cout << t2.count() - t1.count() << "ms" << std::endl;
		std::cout<<"Cache hit ratio="<<arr.getTotalCacheHitRatio()<<std::endl;
		arr.resetTotalCacheHitRatio();
	}

	{
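		// non-overlapped: the stride equals the window size, so the windows are disjoint and almost every access
		// misses the LRU cache and is served from video memory over PCIe; the full pass is repeated 100 times and averaged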
		std::cout << "<non-overlapped> mappedReadWriteAccess(userPtr):    " << std::flush;
		std::chrono::milliseconds t1 = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch());
		std::vector<Obj> tmp(1000 * omp_get_max_threads());
		for (int k = 0; k < 100; k++)
		{
#pragma omp parallel for
			for (int i = 0; i < n; i += 1000)
			{
				int threadId = omp_get_thread_num();
				arr.mappedReadWriteAccess(i, 1000, [&](Obj* ptr)
					{
						for (int j = i; j < i + 1000; j++)
						{
							if (ptr[j].b != j)
							{
								std::cout << "err" << std::endl;
							}
						}
					}, false, true, false, tmp.data() + (threadId * 1000));
			}
		}
		std::chrono::milliseconds t2 = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch());
		std::cout << (t2.count() - t1.count()) / 100.0 << "ms" << std::endl;
		std::cout<<"Cache hit ratio="<<arr.getTotalCacheHitRatio()<<std::endl;
	}

	return 0;
}

Output for an FX-8150 (2.1 GHz) CPU:

preparing virtual array...
initializing data...
non-overlapped n = 104000000 bytes
overlapped n x 1000 = 104000000000 bytes
<overlapped> get() x n x 1000:    59841ms
Cache hit ratio=0.999998
<overlapped> readOnlyGetN() x n:    13297ms
Cache hit ratio=0.999499
<overlapped> mappedReadWriteAccess() x n:    5128ms
Cache hit ratio=0.999499
<overlapped> mappedReadWriteAccess(userPtr) x n:    5183ms
Cache hit ratio=0.999499
<overlapped, variable length> mappedReadWriteAccess() x n:    13793ms
Cache hit ratio=0.999599
<overlapped, variable length> mappedReadWriteAccess(userPtr) x n:    9899ms
Cache hit ratio=0.999599
<non-overlapped> mappedReadWriteAccess(userPtr):    30.46ms
Cache hit ratio=0.000450134

Mapped access achieved the best performance because it works on an aligned raw pointer. On systems with memory fragmentation, the userPtr variant should perform better because it re-uses user-supplied buffers instead of allocating new ones.

A result of 3249ms for the <overlapped, variable length> mappedReadWriteAccess(userPtr) x n benchmark (with the CPU at 3.6GHz) corresponds to (n=)1M scans of 1000 elements each, i.e. about 32GB/s.
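
The figure follows from the element size reported above:

1,000,000 scans x 1,000 elements x 104 bytes/element = 104,000,000,000 bytes
104 GB / 3.249 s ≈ 32 GB/s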

The non-overlapped part does not test the LRU cache but rather the PCIe bandwidth.
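
For scale, the averaged 30.46 ms per full 104,000,000-byte pass corresponds to roughly 3.4 GB/s, nearly all of it served from video memory over PCIe since the cache hit ratio is close to zero.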