tsx.cc
/**
* Compile with:
* $ g++ -O2 -std=c++11 -DL1DSZ=$(getconf LEVEL1_DCACHE_LINESIZE) -DCORES=$(grep -c processor /proc/cpuinfo) tsx.cc -lpthread
*
* Add -DABORT_COUNT to get TSX abort statistics in the program output.
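*
* Note: -mrtm should not be needed here, since the RTM instructions below are
* emitted as raw .byte sequences rather than via compiler intrinsics.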
*
* Copyright (C) 2013 Alexander Krizhanovsky (ak@tempesta-tech.com).
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License,
* or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE.
* See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with
* this program; if not, write to the Free Software Foundation, Inc., 59
* Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <assert.h>
#include <string.h>
#include <stdlib.h>
#include <pthread.h>
#include <sys/time.h>
#include <atomic>
#include <iostream>
#include <thread>
#include <immintrin.h>
// TSX code is stolen from glibc-2.18
#define _XA_EXPLICIT 0
#define _XA_RETRY 1
#define _XA_CONFLICT 2
#define _XA_CAPACITY 3
#define _XBEGIN_STARTED (~0u)
#define _XABORT_EXPLICIT (1 << _XA_EXPLICIT)
#define _XABORT_RETRY (1 << _XA_RETRY)
#define _XABORT_CONFLICT (1 << _XA_CONFLICT)
#define _XABORT_CAPACITY (1 << _XA_CAPACITY)
#define _XABORT_DEBUG (1 << 4)
#define _XABORT_NESTED (1 << 5)
#define _XABORT_CODE(x) (((x) >> 24) & 0xff)
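// Explicit abort code passed to _xabort() when a transaction observes the
// fallback spin lock held.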
#define _ABORT_LOCK_BUSY 0xff
#define __force_inline __attribute__((__always_inline__)) inline
static __force_inline int _xbegin(void)
{
int ret = _XBEGIN_STARTED;
asm volatile (".byte 0xc7,0xf8 ; .long 0"
: "+a" (ret) :: "memory");
return ret;
}
static __force_inline void _xend(void)
{
asm volatile (".byte 0x0f,0x01,0xd5"
::: "memory");
}
static __force_inline void _xabort(const unsigned int status)
{
asm volatile (".byte 0xc6,0xf8,%P0"
:: "i" (status) : "memory");
}
static __force_inline int _xtest(void)
{
unsigned char out;
asm volatile (".byte 0x0f,0x01,0xd6 ; setnz %0"
: "=r" (out) :: "memory");
return out;
}
static const auto TRX_BUF_SZ_MAX = 8192UL;
enum class Sync : unsigned char {
TSX,
SpinLock,
};
pthread_spinlock_t spin_l;
struct CacheLine {
long c[L1DSZ / sizeof(long)];
CacheLine() : c{0} {}
long
operator+(const CacheLine &cl)
{
return c[0] + cl.c[0];
}
void
operator+=(int x)
{
c[0] += x;
}
} __attribute__((aligned(L1DSZ)));
// Memory changed in transactional context.
static CacheLine debit[TRX_BUF_SZ_MAX] __attribute__((aligned(L1DSZ)));
static CacheLine credit[TRX_BUF_SZ_MAX] __attribute__((aligned(L1DSZ)));
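// Each element occupies its own L1D cache line, so trx_sz (elements touched
// per thread) directly determines how many cache lines end up in a
// transaction's read/write set.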
// Statistics.
std::atomic<long> aborts(0), retries(0);
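// The per-thread counters below are merged into the atomics above when a
// thread finishes, see Thr::operator()().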
__thread long _aborts __attribute__((aligned(L1DSZ)));
__thread long _retries __attribute__((aligned(L1DSZ)));
#ifdef ABORT_COUNT
__thread unsigned _abrt[4] __attribute__((aligned(L1DSZ)));
#define ABRT_COUNT(type, status) \
do { \
if (status & (1 << type)) \
_abrt[type]++; \
} while (0)
#else
#define ABRT_COUNT(...)
#endif
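// Per-thread pseudo-random retry budgets: a thread gives up and falls back to
// the spin lock after abrt_fallback[af] consecutive aborts of one transaction
// and then advances af to the next slot (see execute_short_trx()).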
static unsigned char abrt_fallback[] __attribute__((aligned(L1DSZ))) = {
0x1c, 0x22, 0x15, 0x2c, 0x21, 0x29, 0x32, 0x31,
0x00, 0x01, 0x15, 0x04, 0x10, 0x0c, 0x1b, 0x16,
0x14, 0x0b, 0x13, 0x12, 0x02, 0x05, 0x0d, 0x17,
0x23, 0x1d, 0x24, 0x2b, 0x28, 0x32, 0x36, 0x39,
0x0a, 0x06, 0x15, 0x03, 0x01, 0x07, 0x0f, 0x18,
0x1e, 0x25, 0x38, 0x31, 0x30, 0x2e, 0x35, 0x33,
0x09, 0x07, 0x0e, 0x11, 0x08, 0x02, 0x1a, 0x19,
0x26, 0x27, 0x1f, 0x20, 0x2f, 0x2a, 0x37, 0x34,
};
static __thread int af = 0;
std::ostream &
operator<<(std::ostream &os, const CacheLine &cl)
{
os << cl.c[0];
return os;
}
// This function must be run in a transactional context.
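// @overlap shifts each thread's region back by overlap * thr_id elements, so
// neighbouring threads share (and conflict on) that many cache lines.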
static inline void
trx_func(unsigned long thr_id, unsigned long trx_sz, int trx_count,
int overlap)
{
for (int c = 0; c < trx_count; c++)
for (unsigned i = 0; i < trx_sz; ++i) {
unsigned long shift = thr_id * trx_sz + i
- overlap * thr_id;
debit[shift] += 1;
credit[shift] += -1;
}
}
static void
warm_and_clear_memory()
{
aborts = 0;
retries = 0;
memset(debit, 0, sizeof(debit));
memset(credit, 0, sizeof(credit));
}
static void
check_consistency(unsigned trx_buf_sz)
{
for (unsigned i = 0; i < trx_buf_sz; ++i)
if (debit[i] + credit[i])
std::cout << "!!! INCONSISTENCY at " << i
<< ": debit=" << debit[i]
<< " credit=" << credit[i] << std::endl;
}
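// Reference/fallback path: the whole update runs under the global spin lock.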
static void
execute_spinlock_trx(unsigned long thr_id, unsigned long trx_sz, int trx_count,
int overlap)
{
pthread_spin_lock(&spin_l);
trx_func(thr_id, trx_sz, trx_count, overlap);
pthread_spin_unlock(&spin_l);
}
// Transaction.
// Retries the transaction a bounded number of times on abort; when the retry
// budget is exhausted, falls back to the spin lock.
static void
execute_short_trx(unsigned long trx_id, unsigned long trx_sz, int trx_count,
int overlap)
{
int abrt = 0;
while (1) {
unsigned status = _xbegin();
if (__builtin_expect(status == _XBEGIN_STARTED, 1)) {
// We're in transactional context.
// Hacky check whether the spin lock is held: glibc's
// x86_64 spin lock stores 1 when unlocked, see
// glibc/nptl/sysdeps/x86_64/pthread_spin_unlock.S
if (__builtin_expect((int)spin_l != 1, 0))
_xabort(_ABORT_LOCK_BUSY);
trx_func(trx_id, trx_sz, trx_count, overlap);
_xend();
return;
}
ABRT_COUNT(_XA_RETRY, status);
ABRT_COUNT(_XA_EXPLICIT, status);
ABRT_COUNT(_XA_CONFLICT, status);
ABRT_COUNT(_XA_CAPACITY, status);
if (__builtin_expect(!(status & _XABORT_RETRY), 0)) {
++_aborts;
// "Randomized" backoffs as suggested by Andreas Kleen.
// See http://software.intel.com/en-us/forums/topic/488911
if (++abrt == abrt_fallback[af]) {
af = (af + 1) % (sizeof(abrt_fallback)
/ sizeof(*abrt_fallback));
break;
}
// Give up and fall back to the spin lock if the abort was
// neither due to a conflict with another transaction nor
// related to the acquired spin lock.
if (!((status & _XABORT_CONFLICT)
|| ((status & _XABORT_EXPLICIT)
&& _XABORT_CODE(status) != _ABORT_LOCK_BUSY)))
break;
if ((status & _XABORT_EXPLICIT)
&& _XABORT_CODE(status) != _ABORT_LOCK_BUSY)
{
// Wait until the spin lock is released before
// restarting the transaction.
while ((int)spin_l != 1)
_mm_pause();
continue;
}
}
++_retries;
_mm_pause();
}
// Fall back to the spin lock.
execute_spinlock_trx(trx_id, trx_sz, trx_count, overlap);
}
struct Thr {
unsigned long trx_sz;
unsigned long iter;
int thr_num, thr_id;
int trx_count, overlap;
Sync sync;
Thr(int trx_sz, int trx_count, int overlap, int iter, int thr_num,
int thr_id, Sync sync)
: trx_sz(trx_sz), iter(iter), thr_num(thr_num), thr_id(thr_id),
trx_count(trx_count), overlap(overlap), sync(sync)
{
assert(thr_id < CORES);
assert(thr_id < thr_num);
assert(thr_num * trx_sz <= TRX_BUF_SZ_MAX);
assert(overlap <= trx_sz);
}
Thr& operator()()
{
_aborts = _retries = 0;
set_affinity();
for (unsigned long i = 0; i < iter; ++i) {
switch (sync) {
case Sync::TSX:
execute_short_trx(thr_id, trx_sz, trx_count,
overlap);
break;
case Sync::SpinLock:
execute_spinlock_trx(thr_id, trx_sz, trx_count,
overlap);
break;
default:
abort();
}
}
// merge statistics
aborts += _aborts;
retries += _retries;
#ifdef ABORT_COUNT
pthread_spin_lock(&spin_l);
std::cout << "\t\texplicit abrt: " << _abrt[_XA_EXPLICIT]
<< "\n\t\tretry abrt: " << _abrt[_XA_RETRY]
<< "\n\t\tconflict abrt: " << _abrt[_XA_CONFLICT]
<< "\n\t\tcapacity abrt: " << _abrt[_XA_CAPACITY]
<< std::endl;
pthread_spin_unlock(&spin_l);
#endif
return *this;
}
private:
// Sets affinity for an i7-4650U (dual core with hyper-threading).
// This processor exposes 4 virtual processors to Linux:
// cpus 0 and 2 are threads of the 1st core, and cpus 1 and 3 are
// threads of the 2nd core.
// So with thr_num == 2 the threads land on cpus 0 and 1, i.e. on
// different physical cores.
void
set_affinity()
{
cpu_set_t cpuset;
CPU_ZERO(&cpuset);
CPU_SET(thr_id, &cpuset);
int r = pthread_setaffinity_np(pthread_self(), sizeof(cpuset),
&cpuset);
assert(!r);
}
};
static inline unsigned long
tv_to_ms(const struct timeval &tv)
{
return ((unsigned long)tv.tv_sec * 1000000 + tv.tv_usec) / 1000;
}
static void
run_test(int thr_num, int trx_sz, int trx_count, int overlap, int iter,
Sync sync)
{
struct timeval tv0, tv1;
std::thread thr[thr_num];
warm_and_clear_memory();
int r = gettimeofday(&tv0, NULL);
assert(!r);
for (int i = 0; i < thr_num; ++i)
thr[i] = std::thread(Thr(trx_sz, trx_count, overlap, iter,
thr_num, i, sync));
for (auto &t : thr)
t.join();
r = gettimeofday(&tv1, NULL);
assert(!r);
check_consistency(thr_num * trx_sz);
std::cout << "thr=" << thr_num << "\ttrx_sz=" << trx_sz
<< "\ttrx_count=" << trx_count << "\toverlap=" << overlap
<< "\titer=" << iter
<< "\ttime=" << (tv_to_ms(tv1) - tv_to_ms(tv0)) << "ms"
<< "\taborts=" << aborts.load()
<< "(" << (aborts.load() * 100 / (iter * thr_num)) << "%)"
<< "\tretries=" << retries.load()
<< std::endl;
}
int
main(int argc, char *argv[])
{
pthread_spin_init(&spin_l, 0);
unsigned long iter = 10UL * 1000 * 1000;
/**
* Abort statistics for a single-threaded load depending on the
* transaction working set.
*/
//for (int trx_sz = 32; trx_sz <= 1024; trx_sz += 4)
// run_test(1, trx_sz, 1, 0, iter, Sync::TSX);
/*
* Compare TSX and spin lock performance depending on transaction
* time with small work set for 2 concurrent threads.
*/
//for (int trx_count = 1; trx_count <= 201; trx_count += 10)
// run_test(2, 2, trx_count, 0, iter, Sync::TSX);
//for (int trx_count = 1; trx_count <= 201; trx_count += 10)
// run_test(2, 2, trx_count, 0, iter, Sync::SpinLock);
/*
* Compare TSX and spin lock performance depending on transaction
* work set for 2 concurrent threads.
*/
for (int trx_sz = 1; trx_sz <= 256; trx_sz <<= 1)
run_test(2, trx_sz, 1, 0, iter, Sync::TSX);
//for (int trx_sz = 1; trx_sz <= 256; trx_sz <<= 1)
// run_test(2, trx_sz, 1, 0, iter, Sync::SpinLock);
/*
* Compare TSX and spin lock performance depending on
* data overlapping.
*/
//for (int overlap = 0; overlap <= 32; overlap++)
// run_test(2, 32, 1, overlap, iter, Sync::TSX);
//for (int overlap = 0; overlap <= 32; overlap++)
// run_test(2, 32, 1, overlap, iter, Sync::SpinLock);
pthread_spin_destroy(&spin_l);
return 0;
}