tsx.cc
/**
* Compile with:
* $ g++ -O2 -std=c++11 -DL1DSZ=$(getconf LEVEL1_DCACHE_LINESIZE) -DCORES=$(grep -c processor /proc/cpuinfo) tsx.cc -lpthread
*
* Add -DABORT_COUNT to get TSX abort statistics in the program output.
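*
* Note: -mrtm should not be needed here, since the RTM instructions below are
* emitted as raw .byte sequences rather than via compiler intrinsics.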
*
* Copyright (C) 2013 Alexander Krizhanovsky (ak@tempesta-tech.com).
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License,
* or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE.
* See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with
* this program; if not, write to the Free Software Foundation, Inc., 59
* Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <assert.h>
#include <string.h>
#include <stdlib.h>
#include <pthread.h>
#include <sys/time.h>
#include <atomic>
#include <iostream>
#include <thread>
#include <immintrin.h>
// TSX code is stolen from glibc-2.18
#define _XA_EXPLICIT 0
#define _XA_RETRY 1
#define _XA_CONFLICT 2
#define _XA_CAPACITY 3
#define _XBEGIN_STARTED (~0u)
#define _XABORT_EXPLICIT (1 << _XA_EXPLICIT)
#define _XABORT_RETRY (1 << _XA_RETRY)
#define _XABORT_CONFLICT (1 << _XA_CONFLICT)
#define _XABORT_CAPACITY (1 << _XA_CAPACITY)
#define _XABORT_DEBUG (1 << 4)
#define _XABORT_NESTED (1 << 5)
#define _XABORT_CODE(x) (((x) >> 24) & 0xff)
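// Explicit abort code passed to _xabort() when a transaction observes the
// fallback spin lock held.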
#define _ABORT_LOCK_BUSY 0xff
#define __force_inline __attribute__((__always_inline__)) inline
static __force_inline int _xbegin(void)
{
int ret = _XBEGIN_STARTED;
asm volatile (".byte 0xc7,0xf8 ; .long 0"
: "+a" (ret) :: "memory");
return ret;
}
static __force_inline void _xend(void)
{
asm volatile (".byte 0x0f,0x01,0xd5"
::: "memory");
}
static __force_inline void _xabort(const unsigned int status)
{
asm volatile (".byte 0xc6,0xf8,%P0"
:: "i" (status) : "memory");
}
static __force_inline int _xtest(void)
{
unsigned char out;
asm volatile (".byte 0x0f,0x01,0xd6 ; setnz %0"
: "=r" (out) :: "memory");
return out;
}
static const auto TRX_BUF_SZ_MAX = 8192UL;
enum class Sync : unsigned char {
TSX,
SpinLock,
};
pthread_spinlock_t spin_l;
struct CacheLine {
long c[L1DSZ / sizeof(long)];
CacheLine() : c{0} {}
long
operator+(const CacheLine &cl)
{
return c[0] + cl.c[0];
}
void
operator+=(int x)
{
c[0] += x;
}
} __attribute__((aligned(L1DSZ)));
// Memory changed in transactional context.
static CacheLine debit[TRX_BUF_SZ_MAX] __attribute__((aligned(L1DSZ)));
static CacheLine credit[TRX_BUF_SZ_MAX] __attribute__((aligned(L1DSZ)));
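// Each element occupies its own L1D cache line, so trx_sz (elements touched
// per thread) directly determines how many cache lines end up in a
// transaction's read/write set.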
// Statistics.
std::atomic<long> aborts(0), retries(0);
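// The per-thread counters below are merged into the atomics above when a
// thread finishes, see Thr::operator()().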
__thread long _aborts __attribute__((aligned(L1DSZ)));
__thread long _retries __attribute__((aligned(L1DSZ)));
#ifdef ABORT_COUNT
__thread unsigned _abrt[4] __attribute__((aligned(L1DSZ)));
#define ABRT_COUNT(type, status) \
do { \
if (status & (1 << type)) \
_abrt[type]++; \
} while (0)
#else
#define ABRT_COUNT(...)
#endif
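// Per-thread pseudo-random retry budgets: a thread gives up and falls back to
// the spin lock after abrt_fallback[af] consecutive aborts of one transaction
// and then advances af to the next slot (see execute_short_trx()).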
static unsigned char abrt_fallback[] __attribute__((aligned(L1DSZ))) = {
0x1c, 0x22, 0x15, 0x2c, 0x21, 0x29, 0x32, 0x31,
0x00, 0x01, 0x15, 0x04, 0x10, 0x0c, 0x1b, 0x16,
0x14, 0x0b, 0x13, 0x12, 0x02, 0x05, 0x0d, 0x17,
0x23, 0x1d, 0x24, 0x2b, 0x28, 0x32, 0x36, 0x39,
0x0a, 0x06, 0x15, 0x03, 0x01, 0x07, 0x0f, 0x18,
0x1e, 0x25, 0x38, 0x31, 0x30, 0x2e, 0x35, 0x33,
0x09, 0x07, 0x0e, 0x11, 0x08, 0x02, 0x1a, 0x19,
0x26, 0x27, 0x1f, 0x20, 0x2f, 0x2a, 0x37, 0x34,
};
static __thread int af = 0;
std::ostream &
operator<<(std::ostream &os, const CacheLine &cl)
{
os << cl.c[0];
return os;
}
// This function must be run in a transactional context.
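// @overlap shifts each thread's region back by overlap * thr_id elements, so
// neighbouring threads share (and conflict on) that many cache lines.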
static inline void
trx_func(unsigned long thr_id, unsigned long trx_sz, int trx_count,
int overlap)
{
for (int c = 0; c < trx_count; c++)
for (unsigned i = 0; i < trx_sz; ++i) {
unsigned long shift = thr_id * trx_sz + i
- overlap * thr_id;
debit[shift] += 1;
credit[shift] += -1;
}
}
static void
warm_and_clear_memory()
{
aborts = 0;
retries = 0;
memset(debit, 0, sizeof(debit));
memset(credit, 0, sizeof(credit));
}
static void
check_consistency(unsigned trx_buf_sz)
{
for (unsigned i = 0; i < trx_buf_sz; ++i)
if (debit[i] + credit[i])
std::cout << "!!! INCONSISTENCY at " << i
<< ": debit=" << debit[i]
<< " credit=" << credit[i] << std::endl;
}
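// Reference/fallback path: the whole update runs under the global spin lock.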
static void
execute_spinlock_trx(unsigned long thr_id, unsigned long trx_sz, int trx_count,
int overlap)
{
pthread_spin_lock(&spin_l);
trx_func(thr_id, trx_sz, trx_count, overlap);
pthread_spin_unlock(&spin_l);
}
// Transaction.
// Retries the transaction a bounded number of times on abort; when the retry
// budget is exhausted, falls back to the spin lock.
static void
execute_short_trx(unsigned long trx_id, unsigned long trx_sz, int trx_count,
int overlap)
{
int abrt = 0;
while (1) {
unsigned status = _xbegin();
if (__builtin_expect(status == _XBEGIN_STARTED, 1)) {
// We're in transactional context.
// Hacky check whether the spin lock is held: glibc's
// x86_64 spin lock stores 1 when unlocked, see
// glibc/nptl/sysdeps/x86_64/pthread_spin_unlock.S
if (__builtin_expect((int)spin_l != 1, 0))
_xabort(_ABORT_LOCK_BUSY);
trx_func(trx_id, trx_sz, trx_count, overlap);
_xend();
return;
}
ABRT_COUNT(_XA_RETRY, status);
ABRT_COUNT(_XA_EXPLICIT, status);
ABRT_COUNT(_XA_CONFLICT, status);
ABRT_COUNT(_XA_CAPACITY, status);
if (__builtin_expect(!(status & _XABORT_RETRY), 0)) {
++_aborts;
// "Randomized" backoffs as suggested by Andreas Kleen.
// See http://software.intel.com/en-us/forums/topic/488911
if (++abrt == abrt_fallback[af]) {
af = (af + 1) % (sizeof(abrt_fallback)
/ sizeof(*abrt_fallback));
break;
}
// Give up and fall back to the spin lock if the abort was
// neither due to a conflict with another transaction nor
// related to the acquired spin lock.
if (!((status & _XABORT_CONFLICT)
|| ((status & _XABORT_EXPLICIT)
&& _XABORT_CODE(status) != _ABORT_LOCK_BUSY)))
break;
if ((status & _XABORT_EXPLICIT)
&& _XABORT_CODE(status) != _ABORT_LOCK_BUSY)
{
// Wait until the spin lock is released before
// restarting the transaction.
while ((int)spin_l != 1)
_mm_pause();
continue;
}
}
++_retries;
_mm_pause();
}
// Fall back to the spin lock.
execute_spinlock_trx(trx_id, trx_sz, trx_count, overlap);
}
struct Thr {
unsigned long trx_sz;
unsigned long iter;
int thr_num, thr_id;
int trx_count, overlap;
Sync sync;
Thr(int trx_sz, int trx_count, int overlap, int iter, int thr_num,
int thr_id, Sync sync)
: trx_sz(trx_sz), iter(iter), thr_num(thr_num), thr_id(thr_id),
trx_count(trx_count), overlap(overlap), sync(sync)
{
assert(thr_id < CORES);
assert(thr_id < thr_num);
assert(thr_num * trx_sz <= TRX_BUF_SZ_MAX);
assert(overlap <= trx_sz);
}
Thr& operator()()
{
_aborts = _retries = 0;
set_affinity();
for (unsigned long i = 0; i < iter; ++i) {
switch (sync) {
case Sync::TSX:
execute_short_trx(thr_id, trx_sz, trx_count,
overlap);
break;
case Sync::SpinLock:
execute_spinlock_trx(thr_id, trx_sz, trx_count,
overlap);
break;
default:
abort();
}
}
// merge statistics
aborts += _aborts;
retries += _retries;
#ifdef ABORT_COUNT
pthread_spin_lock(&spin_l);
std::cout << "\t\texplicit abrt: " << _abrt[_XA_EXPLICIT]
<< "\n\t\tretry abrt: " << _abrt[_XA_RETRY]
<< "\n\t\tconflict abrt: " << _abrt[_XA_CONFLICT]
<< "\n\t\tcapacity abrt: " << _abrt[_XA_CAPACITY]
<< std::endl;
pthread_spin_unlock(&spin_l);
#endif
return *this;
}
private:
// Sets affinity for an i7-4650U (dual core with hyper-threading).
// This processor exposes 4 virtual processors to Linux:
// cpus 0 and 2 are threads of the 1st core, and cpus 1 and 3 are
// threads of the 2nd core.
// So with thr_num == 2 the threads land on cpus 0 and 1, i.e. on
// different physical cores.
void
set_affinity()
{
cpu_set_t cpuset;
CPU_ZERO(&cpuset);
CPU_SET(thr_id, &cpuset);
int r = pthread_setaffinity_np(pthread_self(), sizeof(cpuset),
&cpuset);
assert(!r);
}
};
static inline unsigned long
tv_to_ms(const struct timeval &tv)
{
return ((unsigned long)tv.tv_sec * 1000000 + tv.tv_usec) / 1000;
}
static void
run_test(int thr_num, int trx_sz, int trx_count, int overlap, int iter,
Sync sync)
{
struct timeval tv0, tv1;
std::thread thr[thr_num];
warm_and_clear_memory();
int r = gettimeofday(&tv0, NULL);
assert(!r);
for (int i = 0; i < thr_num; ++i)
thr[i] = std::thread(Thr(trx_sz, trx_count, overlap, iter,
thr_num, i, sync));
for (auto &t : thr)
t.join();
r = gettimeofday(&tv1, NULL);
assert(!r);
check_consistency(thr_num * trx_sz);
std::cout << "thr=" << thr_num << "\ttrx_sz=" << trx_sz
<< "\ttrx_count=" << trx_count << "\toverlap=" << overlap
<< "\titer=" << iter
<< "\ttime=" << (tv_to_ms(tv1) - tv_to_ms(tv0)) << "ms"
<< "\taborts=" << aborts.load()
<< "(" << (aborts.load() * 100 / (iter * thr_num)) << "%)"
<< "\tretries=" << retries.load()
<< std::endl;
}
int
main(int argc, char *argv[])
{
pthread_spin_init(&spin_l, 0);
unsigned long iter = 10UL * 1000 * 1000;
/**
* Abort statistics for a single-threaded load depending on the
* transaction working set.
*/
//for (int trx_sz = 32; trx_sz <= 1024; trx_sz += 4)
// run_test(1, trx_sz, 1, 0, iter, Sync::TSX);
/*
* Compare TSX and spin lock performance depending on transaction
* time with small work set for 2 concurrent threads.
*/
//for (int trx_count = 1; trx_count <= 201; trx_count += 10)
// run_test(2, 2, trx_count, 0, iter, Sync::TSX);
//for (int trx_count = 1; trx_count <= 201; trx_count += 10)
// run_test(2, 2, trx_count, 0, iter, Sync::SpinLock);
/*
* Compare TSX and spin lock performance depending on transaction
* work set for 2 concurrent threads.
*/
for (int trx_sz = 1; trx_sz <= 256; trx_sz <<= 1)
run_test(2, trx_sz, 1, 0, iter, Sync::TSX);
//for (int trx_sz = 1; trx_sz <= 256; trx_sz <<= 1)
// run_test(2, trx_sz, 1, 0, iter, Sync::SpinLock);
/*
* Compare TSX and spin lock performance depending on
* data overlapping.
*/
//for (int overlap = 0; overlap <= 32; overlap++)
// run_test(2, 32, 1, overlap, iter, Sync::TSX);
//for (int overlap = 0; overlap <= 32; overlap++)
// run_test(2, 32, 1, overlap, iter, Sync::SpinLock);
pthread_spin_destroy(&spin_l);
return 0;
}