Overlapped read performance
Generally, overlapping scalar/bulk reads and writes lets the (LRU) page cache serve most of the accesses, which increases effective bandwidth; overlapping too much, however, causes page-lock contention. Here, "overlapped" means that consecutive iterations read windows that share most of their elements, while non-overlapped windows are disjoint. A minimal sketch of the two access patterns is given just below, followed by a program that tests the performance of all read methods.
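The sketch is not part of the library: a plain std::vector stands in for the virtual array, and the hypothetical readWindow() stands in for any bulk-read method (readOnlyGetN, mappedReadWriteAccess, ...).

#include <cstddef>
#include <vector>

// touches every element of a window; a stand-in for a real bulk read
static long long readWindow(const std::vector<long long>& arr, std::size_t start, std::size_t window)
{
    long long sum = 0;
    for (std::size_t j = start; j < start + window; j++)
        sum += arr[j];
    return sum;
}

int main()
{
    const std::size_t n = 1000000, window = 1000;
    std::vector<long long> arr(n, 1);
    long long sink = 0;

    // overlapped scan: window i and window i+1 share window-1 elements, so a
    // page/LRU cache can serve almost every access after the first window
    for (std::size_t i = 0; i + window <= n; i++)
        sink += readWindow(arr, i, window);

    // non-overlapped scan: windows are disjoint, so each window has to come
    // from the backing store (GPU memory over PCIe in the benchmark below)
    for (std::size_t i = 0; i + window <= n; i += window)
        sink += readWindow(arr, i, window);

    return (int)(sink & 1); // keep the scans from being optimized away
}

The full benchmark program: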
#include "GraphicsCardSupplyDepot.h"
#include "VirtualMultiArray.h"
#include "PcieBandwidthBenchmarker.h"

// testing
#include <iostream>
#include <vector>
#include <chrono>
#include <omp.h>

class Obj
{
public:
    Obj() { b = -1; }
    Obj(int i) { b = i; }
    int b;
    char buf[100];
};

int main(int argC, char** argV)
{
    std::cout << "preparing virtual array..." << std::endl;
    size_t n = 1000000;
    size_t p = 1000; // page size (elements per page)

    GraphicsCardSupplyDepot gpu;
    VirtualMultiArray<Obj> arr(n, gpu.requestGpus(), p, 50, { 4, 4, 4, 4, 4, 4 },
                               VirtualMultiArray<Obj>::MemMult::UseDefault, true, true);

    std::cout << "initializing data..." << std::endl;
    // element i holds the value i, so every read below can be validated
    #pragma omp parallel for
    for (int i = 0; i < n; i++)
    {
        arr.set(i, Obj(i));
    }

    std::cout << "non-overlapped n = " << sizeof(Obj) * n << " bytes" << std::endl;
    std::cout << "overlapped n x 1000 = " << sizeof(Obj) * n * 1000 << " bytes" << std::endl;

    // overlapped scalar reads: for each start index, read a 1000-element window
    // one element at a time with get()
    {
        std::cout << "<overlapped> get() x n x 1000: " << std::flush;
        std::chrono::milliseconds t1 = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch());

        #pragma omp parallel for
        for (int i = 1000; i < n - 1000; i++)
        {
            for (int j = 0; j < 1000; j++)
            {
                if (arr.get(i + j).b != i + j)
                {
                    std::cout << "err" << std::endl;
                }
            }
        }

        std::chrono::milliseconds t2 = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch());
        std::cout << t2.count() - t1.count() << "ms" << std::endl;
        std::cout << "Cache hit ratio=" << arr.getTotalCacheHitRatio() << std::endl;
        arr.resetTotalCacheHitRatio();
    }

    // overlapped bulk reads: one readOnlyGetN() call per 1000-element window
    {
        std::cout << "<overlapped> readOnlyGetN() x n: " << std::flush;
        std::chrono::milliseconds t1 = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch());

        #pragma omp parallel for
        for (int i = 1000; i < n - 1000; i++)
        {
            auto data = arr.readOnlyGetN(i, 1000);
            for (int j = 0; j < 1000; j++)
            {
                if (data[j].b != i + j)
                {
                    std::cout << "err" << std::endl;
                }
            }
        }

        std::chrono::milliseconds t2 = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch());
        std::cout << t2.count() - t1.count() << "ms" << std::endl;
        std::cout << "Cache hit ratio=" << arr.getTotalCacheHitRatio() << std::endl;
        arr.resetTotalCacheHitRatio();
    }

    // overlapped mapped access: the lambda receives a raw pointer that is
    // indexed with global element indices (i .. i+999)
    {
        std::cout << "<overlapped> mappedReadWriteAccess() x n: " << std::flush;
        std::chrono::milliseconds t1 = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch());

        #pragma omp parallel for
        for (int i = 1000; i < n - 1000; i++)
        {
            arr.mappedReadWriteAccess(i, 1000, [&](Obj* ptr)
            {
                for (int j = i; j < i + 1000; j++)
                {
                    if (ptr[j].b != j)
                    {
                        std::cout << "err" << std::endl;
                    }
                }
            }, false, true, false);
        }

        std::chrono::milliseconds t2 = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch());
        std::cout << t2.count() - t1.count() << "ms" << std::endl;
        std::cout << "Cache hit ratio=" << arr.getTotalCacheHitRatio() << std::endl;
        arr.resetTotalCacheHitRatio();
    }

    // same as above, but each thread passes its own slice of a pre-allocated
    // buffer (userPtr) so the mapping re-uses user memory
    {
        std::cout << "<overlapped> mappedReadWriteAccess(userPtr) x n: " << std::flush;
        std::chrono::milliseconds t1 = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch());

        std::vector<Obj> tmp(1000 * omp_get_max_threads());
        #pragma omp parallel for
        for (int i = 1000; i < n - 1000; i++)
        {
            int threadId = omp_get_thread_num();
            arr.mappedReadWriteAccess(i, 1000, [&](Obj* ptr)
            {
                for (int j = i; j < i + 1000; j++)
                {
                    if (ptr[j].b != j)
                    {
                        std::cout << "err" << std::endl;
                    }
                }
            }, false, true, false, tmp.data() + (threadId * 1000));
        }

        std::chrono::milliseconds t2 = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch());
        std::cout << t2.count() - t1.count() << "ms" << std::endl;
        std::cout << "Cache hit ratio=" << arr.getTotalCacheHitRatio() << std::endl;
        arr.resetTotalCacheHitRatio();
    }

    // variable-length windows (1000..1999 elements) with mapped access
    {
        std::cout << "<overlapped, variable length> mappedReadWriteAccess() x n: " << std::flush;
        std::chrono::milliseconds t1 = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch());

        #pragma omp parallel for
        for (int i = 1000; i < n - 2000; i++)
        {
            arr.mappedReadWriteAccess(i, 1000 + i % 1000, [&](Obj* ptr)
            {
                for (int j = i; j < i + 1000 + i % 1000; j++)
                {
                    if (ptr[j].b != j)
                    {
                        std::cout << "err" << std::endl;
                    }
                }
            }, false, true, false);
        }

        std::chrono::milliseconds t2 = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch());
        std::cout << t2.count() - t1.count() << "ms" << std::endl;
        std::cout << "Cache hit ratio=" << arr.getTotalCacheHitRatio() << std::endl;
        arr.resetTotalCacheHitRatio();
    }

    // variable-length windows with per-thread userPtr slices (2000 elements
    // reserved per thread, enough for the largest window)
    {
        std::cout << "<overlapped, variable length> mappedReadWriteAccess(userPtr) x n: " << std::flush;
        std::chrono::milliseconds t1 = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch());

        std::vector<Obj> tmp(2000 * omp_get_max_threads());
        #pragma omp parallel for
        for (int i = 1000; i < n - 2000; i++)
        {
            int threadId = omp_get_thread_num();
            arr.mappedReadWriteAccess(i, 1000 + i % 1000, [&](Obj* ptr)
            {
                for (int j = i; j < i + 1000 + i % 1000; j++)
                {
                    if (ptr[j].b != j)
                    {
                        std::cout << "err" << std::endl;
                    }
                }
            }, false, true, false, tmp.data() + (threadId * 2000));
        }

        std::chrono::milliseconds t2 = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch());
        std::cout << t2.count() - t1.count() << "ms" << std::endl;
        std::cout << "Cache hit ratio=" << arr.getTotalCacheHitRatio() << std::endl;
        arr.resetTotalCacheHitRatio();
    }

    // non-overlapped: 100 full passes over the array in disjoint 1000-element
    // windows; the reported time is the average per pass
    {
        std::cout << "<non-overlapped> mappedReadWriteAccess(userPtr): " << std::flush;
        std::chrono::milliseconds t1 = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch());

        std::vector<Obj> tmp(1000 * omp_get_max_threads());
        for (int k = 0; k < 100; k++)
        {
            #pragma omp parallel for
            for (int i = 0; i < n; i += 1000)
            {
                int threadId = omp_get_thread_num();
                arr.mappedReadWriteAccess(i, 1000, [&](Obj* ptr)
                {
                    for (int j = i; j < i + 1000; j++)
                    {
                        if (ptr[j].b != j)
                        {
                            std::cout << "err" << std::endl;
                        }
                    }
                }, false, true, false, tmp.data() + (threadId * 1000));
            }
        }

        std::chrono::milliseconds t2 = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch());
        std::cout << (t2.count() - t1.count()) / 100.0 << "ms" << std::endl;
        std::cout << "Cache hit ratio=" << arr.getTotalCacheHitRatio() << std::endl;
    }

    return 0;
}
Output for an FX-8150 (2.1 GHz) CPU:
preparing virtual array...
initializing data...
non-overlapped n = 104000000 bytes
overlapped n x 1000 = 104000000000 bytes
<overlapped> get() x n x 1000: 59841ms
Cache hit ratio=0.999998
<overlapped> readOnlyGetN() x n: 13297ms
Cache hit ratio=0.999499
<overlapped> mappedReadWriteAccess() x n: 5128ms
Cache hit ratio=0.999499
<overlapped> mappedReadWriteAccess(userPtr) x n: 5183ms
Cache hit ratio=0.999499
<overlapped, variable length> mappedReadWriteAccess() x n: 13793ms
Cache hit ratio=0.999599
<overlapped, variable length> mappedReadWriteAccess(userPtr) x n: 9899ms
Cache hit ratio=0.999599
<non-overlapped> mappedReadWriteAccess(userPtr): 30.46ms
Cache hit ratio=0.000450134
Mapped access achieved the best performance because it works on an aligned raw pointer. On systems with memory fragmentation, the userPtr variant should perform better, since it re-uses the user-supplied pointers. An <overlapped, variable length> mappedReadWriteAccess(userPtr) x n benchmark result of 3249 ms (with the CPU at 3.6 GHz) means n = 1M scans of roughly 1000 elements each, which equals about 32 GB/s.
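As a rough sanity check of that figure, approximating every variable-length scan as 1000 elements (as above) and taking sizeof(Obj) = 104 bytes from the program output:

    bytes scanned ≈ 1,000,000 x 1000 x 104 B = 104,000,000,000 B ≈ 104 GB
    throughput    ≈ 104 GB / 3.249 s ≈ 32 GB/s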
The non-overlapped part does not test the LRU cache but rather PCIe bandwidth.
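The same kind of estimate for the non-overlapped case (one full pass over the array per repetition, cache hit ratio near zero, so nearly all data is fetched from the cards):

    bytes per pass ≈ 1,000,000 x 104 B = 104,000,000 B ≈ 0.104 GB
    throughput     ≈ 0.104 GB / 0.03046 s ≈ 3.4 GB/s

which reflects PCIe transfer speed rather than cache performance.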