OneFilePTMWF.hpp
/*
* Copyright 2017-2019
* Andreia Correia <andreia.veiga@unine.ch>
* Pedro Ramalhete <pramalhe@gmail.com>
* Pascal Felber <pascal.felber@unine.ch>
* Nachshon Cohen <nachshonc@gmail.com>
*
* This work is published under the MIT license. See LICENSE.txt
*/
#ifndef _PERSISTENT_ONE_FILE_WAIT_FREE_TRANSACTIONAL_MEMORY_H_
#define _PERSISTENT_ONE_FILE_WAIT_FREE_TRANSACTIONAL_MEMORY_H_
#include <atomic>
#include <cassert>
#include <iostream>
#include <vector>
#include <functional>
#include <cstring>
#include <sys/mman.h> // Needed if we use mmap()
#include <sys/types.h> // Needed by open() and close()
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h> // Needed by close()
// Please keep this file in sync (as much as possible) with stms/OneFileWF.hpp
// Macros needed for persistence
#ifdef PWB_IS_CLFLUSH
/*
* More info at http://elixir.free-electrons.com/linux/latest/source/arch/x86/include/asm/special_insns.h#L213
* Intel programming manual at https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
* Use these for Broadwell CPUs (cervino server)
*/
#define PWB(addr) __asm__ volatile("clflush (%0)" :: "r" (addr) : "memory") // Broadwell only works with this.
#define PFENCE() {} // No ordering fences needed for CLFLUSH (section 7.4.6 of Intel manual)
#define PSYNC() {} // For durability it's not obvious, but CLFLUSH seems to be enough, and PMDK uses the same approach
#elif PWB_IS_CLWB
/* Use this for CPUs that support clwb, such as the SkyLake SP series (c5 compute intensive instances in AWS are an example of it) */
#define PWB(addr) __asm__ volatile(".byte 0x66; xsaveopt %0" : "+m" (*(volatile char *)(addr))) // clwb() only for Ice Lake onwards
#define PFENCE() __asm__ volatile("sfence" : : : "memory")
#define PSYNC() __asm__ volatile("sfence" : : : "memory")
#elif PWB_IS_NOP
/* pwbs are not needed for shared memory persistency (i.e. persistency across process failure) */
#define PWB(addr) {}
#define PFENCE() __asm__ volatile("sfence" : : : "memory")
#define PSYNC() __asm__ volatile("sfence" : : : "memory")
#elif PWB_IS_CLFLUSHOPT
/* Use this for CPUs that support clflushopt, which is most recent x86 */
#define PWB(addr) __asm__ volatile(".byte 0x66; clflush %0" : "+m" (*(volatile char *)(addr))) // clflushopt (Kaby Lake)
#define PFENCE() __asm__ volatile("sfence" : : : "memory")
#define PSYNC() __asm__ volatile("sfence" : : : "memory")
#else
#error "You must define what PWB is. Choose PWB_IS_CLFLUSHOPT if you don't know what your CPU is capable of"
#endif
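// These PWB/PFENCE/PSYNC variants are selected at compile time via a -D flag on the
// compiler command line, e.g. -DPWB_IS_CLFLUSHOPT (illustrative), so the code below can
// stay agnostic of which flush instruction the target CPU supports.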
/*
* Differences between POneFileWF and the non-persistent OneFileWF:
* - A secondary redo log (PWriteSet) is placed in persistent memory before attempting a 'commit'.
* - Setting the 'request' in helpApply() is always done with a CAS, to enforce ordering of the PWBs relative to the DCAS;
* - The persistent logs are allocated in PM, same as all user allocations from tmNew(), 'curTx', and 'request'
*/
namespace pofwf {
//
// User configurable variables.
// Feel free to change these if you need larger transactions, more allocations per transaction, or more threads.
//
// Maximum number of registered threads that can execute transactions
static const int REGISTRY_MAX_THREADS = 128;
// Maximum number of stores in the WriteSet per transaction
static const uint64_t TX_MAX_STORES = 40*1024;
// Number of buckets in the hashmap of the WriteSet.
static const uint64_t HASH_BUCKETS = 2048;
// Persistent-specific configuration
// Name of persistent file mapping
static const char * PFILE_NAME = "/dev/shm/ponefilewf_shared";
// Start address of mapped persistent memory
static uint8_t* PREGION_ADDR = (uint8_t*)0x7ff000000000;
// Size of persistent memory. Part of it will be used by the redo logs
static const uint64_t PREGION_SIZE = 1024*1024*1024ULL; // 1 GB by default
// End address of mapped persistent memory
static uint8_t* PREGION_END = (PREGION_ADDR+PREGION_SIZE);
// Maximum number of root pointers available for the user
static const uint64_t MAX_ROOT_POINTERS = 100;
// DCAS / CAS2 macro
#define DCAS(ptr, o1, o2, n1, n2) \
({ \
char __ret; \
__typeof__(o2) __junk; \
__typeof__(*(ptr)) __old1 = (o1); \
__typeof__(o2) __old2 = (o2); \
__typeof__(*(ptr)) __new1 = (n1); \
__typeof__(o2) __new2 = (n2); \
asm volatile("lock cmpxchg16b %2;setz %1" \
: "=d"(__junk), "=a"(__ret), "+m" (*ptr) \
: "b"(__new1), "c"(__new2), \
"a"(__old1), "d"(__old2)); \
__ret; })
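// Usage sketch (see WriteSet::apply() further below): 'ptr' points at two adjacent 64-bit
// words, typically the 'val' and 'seq' fields of a tmtype<>. For example:
//   tmtype<uint64_t>* tmte = (tmtype<uint64_t>*)addr;
//   DCAS((uint64_t*)addr, expectedVal, expectedSeq, newVal, newSeq);
// replaces both words atomically iff both still hold the expected values (lock cmpxchg16b).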
// Functions to convert between a transaction identifier (uint64_t) and a pair of {sequence,index}
static inline uint64_t seqidx2trans(uint64_t seq, uint64_t idx) {
return (seq << 10) | idx;
}
static inline uint64_t trans2seq(uint64_t trans) {
return trans >> 10;
}
static inline uint64_t trans2idx(uint64_t trans) {
return trans & 0x3FF; // 10 bits
}
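// Example: seqidx2trans(5, 3) == (5 << 10) | 3 == 5123, and then trans2seq(5123) == 5
// and trans2idx(5123) == 3. The 10-bit index field means at most 1024 thread slots can
// be encoded, which comfortably covers REGISTRY_MAX_THREADS.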
// Flush each cache line in a range
static inline void flushFromTo(void* from, void* to) noexcept {
const uint64_t cache_line_size = 64;
uint8_t* ptr = (uint8_t*)(((uint64_t)from) & (~(cache_line_size-1)));
for (; ptr < (uint8_t*)to; ptr += cache_line_size) PWB(ptr);
}
//
// Thread Registry stuff
//
extern void thread_registry_deregister_thread(const int tid);
// A helper class to do the check-in and check-out of the thread registry
struct ThreadCheckInCheckOut {
static const int NOT_ASSIGNED = -1;
int tid { NOT_ASSIGNED };
~ThreadCheckInCheckOut() {
if (tid == NOT_ASSIGNED) return;
thread_registry_deregister_thread(tid);
}
};
extern thread_local ThreadCheckInCheckOut tl_tcico;
// Forward declaration of global/singleton instance
class ThreadRegistry;
extern ThreadRegistry gThreadRegistry;
/*
* <h1> Registry for threads </h1>
*
* This is a singleton-type class that assigns a unique id to each thread.
* The first time a thread calls ThreadRegistry::getTID() it will allocate a free slot in 'usedTID[]'.
* This tid will be saved in a thread-local variable of the type ThreadCheckInCheckOut which
* upon destruction of the thread will call the destructor of ThreadCheckInCheckOut and free the
* corresponding slot to be used by a later thread.
*/
class ThreadRegistry {
private:
alignas(128) std::atomic<bool> usedTID[REGISTRY_MAX_THREADS]; // Which TIDs are in use by threads
alignas(128) std::atomic<int> maxTid {-1}; // Highest TID (+1) in use by threads
public:
ThreadRegistry() {
for (int it = 0; it < REGISTRY_MAX_THREADS; it++) {
usedTID[it].store(false, std::memory_order_relaxed);
}
}
// Progress condition: wait-free bounded (by the number of threads)
int register_thread_new(void) {
for (int tid = 0; tid < REGISTRY_MAX_THREADS; tid++) {
if (usedTID[tid].load(std::memory_order_acquire)) continue;
bool unused = false;
if (!usedTID[tid].compare_exchange_strong(unused, true)) continue;
// Increase the current maximum to cover our thread id
int curMax = maxTid.load();
while (curMax <= tid) {
maxTid.compare_exchange_strong(curMax, tid+1);
curMax = maxTid.load();
}
tl_tcico.tid = tid;
return tid;
}
std::cout << "ERROR: Too many threads, registry can only hold " << REGISTRY_MAX_THREADS << " threads\n";
assert(false);
}
// Progress condition: wait-free population oblivious
inline void deregister_thread(const int tid) {
usedTID[tid].store(false, std::memory_order_release);
}
// Progress condition: wait-free population oblivious
static inline uint64_t getMaxThreads(void) {
return gThreadRegistry.maxTid.load(std::memory_order_acquire);
}
// Progress condition: wait-free bounded (by the number of threads)
static inline int getTID(void) {
int tid = tl_tcico.tid;
if (tid != ThreadCheckInCheckOut::NOT_ASSIGNED) return tid;
return gThreadRegistry.register_thread_new();
}
};
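// Usage sketch: user code never touches the registry directly. The PTM calls
//   const int tid = ThreadRegistry::getTID();
// at the start of each transaction (see updateTx()/readTx() below), and the slot is
// released automatically when the thread exits, via ~ThreadCheckInCheckOut().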
// Each object tracked by Hazard Eras needs to have tmbase as one of its base classes.
struct tmbase {
uint64_t newEra_ {0}; // Filled by tmNew() or tmMalloc()
uint64_t delEra_ {0}; // Filled by tmDelete() or tmFree()
};
// A wrapper to std::function so that we can track it with Hazard Eras
struct TransFunc : public tmbase {
std::function<uint64_t()> func;
template<typename F> TransFunc(F&& f) : func{f} { }
};
// This is a specialized implementation of Hazard Eras meant to be used in the OneFile STM.
// Hazard Eras is a lock-free memory reclamation technique described here:
// https://github.com/pramalhe/ConcurrencyFreaks/blob/master/papers/hazarderas-2017.pdf
// https://dl.acm.org/citation.cfm?id=3087588
//
// We're using OF::curTx.seq as the global era.
//
// This implementation is different from the lock-free OneFile STM because we need
// to track the lifetime of the std::function objects where the lambdas are put.
class HazardErasOF {
private:
static const uint64_t NOERA = 0;
static const int CLPAD = 128/sizeof(std::atomic<uint64_t>);
static const int THRESHOLD_R = 0; // This is named 'R' in the HP paper
const unsigned int maxThreads;
alignas(128) std::atomic<uint64_t>* he;
// It's not nice that we have a lot of empty vectors, but we need padding to avoid false sharing
alignas(128) std::vector<TransFunc*> retiredListTx[REGISTRY_MAX_THREADS*CLPAD];
public:
HazardErasOF(unsigned int maxThreads=REGISTRY_MAX_THREADS) : maxThreads{maxThreads} {
he = new std::atomic<uint64_t>[REGISTRY_MAX_THREADS*CLPAD];
for (unsigned it = 0; it < REGISTRY_MAX_THREADS; it++) {
he[it*CLPAD].store(NOERA, std::memory_order_relaxed);
retiredListTx[it*CLPAD].reserve(REGISTRY_MAX_THREADS);
}
}
~HazardErasOF() {
// Clear the objects in the retired lists
for (unsigned it = 0; it < maxThreads; it++) {
for (unsigned iret = 0; iret < retiredListTx[it*CLPAD].size(); iret++) {
TransFunc* tx = retiredListTx[it*CLPAD][iret];
delete tx;
}
}
delete[] he;
}
// Progress condition: wait-free population oblivious
inline void clear(const int tid) {
he[tid*CLPAD].store(NOERA, std::memory_order_release);
}
// Progress condition: wait-free population oblivious
inline void set(uint64_t trans, const int tid) {
he[tid*CLPAD].store(trans2seq(trans));
}
// Progress condition: wait-free population oblivious
inline void addToRetiredListTx(TransFunc* tx, const int tid) {
retiredListTx[tid*CLPAD].push_back(tx);
}
/**
* Progress condition: bounded wait-free
*
* Attempts to delete the no-longer-in-use objects in the retired list.
* We need to pass the curEra coming from the seq of curTx so that
* the objects from the current transaction don't get deleted.
*
* TODO: consider using erase() with std::remove_if()
*/
void clean(uint64_t curEra, const int tid) {
for (unsigned iret = 0; iret < retiredListTx[tid*CLPAD].size();) {
TransFunc* tx = retiredListTx[tid*CLPAD][iret];
if (canDelete(curEra, tx)) {
retiredListTx[tid*CLPAD].erase(retiredListTx[tid*CLPAD].begin() + iret);
delete tx;
continue;
}
iret++;
}
}
// Progress condition: wait-free bounded (by the number of threads)
inline bool canDelete(uint64_t curEra, tmbase* del) {
// We can't delete objects from the current transaction
if (del->delEra_ == curEra) return false;
for (unsigned it = 0; it < ThreadRegistry::getMaxThreads(); it++) {
const auto era = he[it*CLPAD].load(std::memory_order_acquire);
if (era == NOERA || era < del->newEra_ || era > del->delEra_) continue;
return false;
}
return true;
}
};
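// Usage pattern (as used by innerUpdateTx()/readTransaction() below):
//   he.set(curTx, tid);     // publish the era we are about to read from
//   ...speculative execution of the transaction...
//   he.clear(tid);          // stop protecting; TransFunc objects of that era may become reclaimable
//   he.clean(curEra, tid);  // the committing thread scans its retired list and deletes what is safe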
// T is typically a pointer to a node, but it can be integers or other stuff, as long as it fits in 64 bits
template<typename T> struct tmtype {
// Stores the actual value as an atomic
std::atomic<uint64_t> val;
// Let's hope this comes immediately after 'val' in the memory layout, otherwise the DCAS() will fail
std::atomic<uint64_t> seq;
tmtype() { }
tmtype(T initVal) { pstore(initVal); }
// Casting operator
operator T() { return pload(); }
// Prefix increment operator: ++x
void operator++ () { pstore(pload()+1); }
// Prefix decrement operator: --x
void operator-- () { pstore(pload()-1); }
void operator++ (int) { pstore(pload()+1); }
void operator-- (int) { pstore(pload()-1); }
// Equals operator: first downcast to T and then compare
bool operator == (const T& otherval) const { return pload() == otherval; }
// Difference operator: first downcast to T and then compare
bool operator != (const T& otherval) const { return pload() != otherval; }
// Relational operators
bool operator < (const T& rhs) { return pload() < rhs; }
bool operator > (const T& rhs) { return pload() > rhs; }
bool operator <= (const T& rhs) { return pload() <= rhs; }
bool operator >= (const T& rhs) { return pload() >= rhs; }
// Operator arrow ->
T operator->() { return pload(); }
// Copy constructor
tmtype<T>(const tmtype<T>& other) { pstore(other.pload()); }
// Assignment operator from a tmtype
tmtype<T>& operator=(const tmtype<T>& other) {
pstore(other.pload());
return *this;
}
// Assignment operator from a value
tmtype<T>& operator=(T value) {
pstore(value);
return *this;
}
// Operator &
T* operator&() {
return (T*)this;
}
// Meant to be called when we know we're the only ones touching
// these contents, for example, in the constructor of an object, before
// making the object visible to other threads.
inline void isolated_store(T newVal) {
val.store((uint64_t)newVal, std::memory_order_relaxed);
}
// Used only internally to initialize the operations[] array
inline void operationsInit() {
val.store((uint64_t)nullptr, std::memory_order_relaxed);
seq.store(0, std::memory_order_relaxed);
}
// Used only internally to initialize the results[] array
inline void resultsInit() {
val.store(0, std::memory_order_relaxed);
seq.store(1, std::memory_order_relaxed);
}
// Used only internally by updateTx() to determine if the request is opened or not
inline uint64_t getSeq() const {
return seq.load(std::memory_order_acquire);
}
// Used only internally by updateTx()
inline void rawStore(T& newVal, uint64_t lseq) {
val.store((uint64_t)newVal, std::memory_order_relaxed);
seq.store(lseq, std::memory_order_release);
}
// Methods that are defined later because they have compilation dependencies on gOFWF
inline T pload() const;
inline bool rawLoad(T& keepVal, uint64_t& keepSeq);
inline void pstore(T newVal);
};
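// Minimal usage sketch (hypothetical node type, not part of this file): a persistent
// object wraps each of its 64-bit words in a tmtype<>, e.g.
//   struct Node : public tmbase {
//       tmtype<uint64_t> key {0};
//       tmtype<Node*>    next {nullptr};
//   };
// Inside a transaction, 'node->key = 42' goes through pstore() and is added to the
// write-set, while 'uint64_t k = node->key' goes through pload().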
/*
* EsLoco is an Extremely Simple memory aLOCatOr
*
* It is based on intrusive singly-linked lists (a free-list), one for each power of two size.
* All blocks are powers of two, the smallest size enough to contain the desired user data plus the block header.
* There is an array named 'freelists' where each entry is a pointer to the head of a stack for that respective block size.
* Blocks are allocated in powers of 2 of words (64bit words).
* Each block has a header with two words: the size of the block and the pointer to the next block in the free-list.
* The minimum block size is 4 words, with 2 for the header and 2 for the user.
* When there is no suitable block in the freelist, it will create a new block from the remaining pool.
*
* EsLoco was designed for usage in PTMs but it doesn't have to be used only for that.
* Average number of stores for an allocation is 1.
* Average number of stores for a de-allocation is 2.
*
* Memory layout:
* ------------------------------------------------------------------------
* | poolTop | freelists[0] ... freelists[49] | ... allocated objects ... |
* ------------------------------------------------------------------------
*/
template <template <typename> class P>
class EsLoco {
private:
struct block {
P<block*> next; // Pointer to next block in free-list (when block is in free-list)
P<uint64_t> size; // Exponent of power of two of the size of this block in bytes.
};
const bool debugOn = false;
// Volatile data
uint8_t* poolAddr {nullptr};
uint64_t poolSize {0};
// Pointer to array of persistent heads of free-list
block* freelists {nullptr};
// Volatile pointer to persistent pointer to last unused address (the top of the pool)
P<uint8_t*>* poolTop {nullptr};
// Number of free-lists (one per block-size exponent) in the 'freelists' array.
// Each entry corresponds to an exponent of the block size: 2^4, 2^5, 2^6... 2^40
static const int kMaxBlockSize = 50; // 1024 TB of memory should be enough
// For powers of 2, returns the highest bit, otherwise, returns the next highest bit
uint64_t highestBit(uint64_t val) {
uint64_t b = 0;
while ((val >> (b+1)) != 0) b++;
if (val > (1ULL << b)) return b+1;
return b;
}
uint8_t* aligned(uint8_t* addr) {
return (uint8_t*)((size_t)addr & (~0x3FULL)) + 128;
}
public:
void init(void* addressOfMemoryPool, size_t sizeOfMemoryPool, bool clearPool=true) {
// Align the base address of the memory pool
poolAddr = aligned((uint8_t*)addressOfMemoryPool);
poolSize = sizeOfMemoryPool + (uint8_t*)addressOfMemoryPool - poolAddr;
// The first thing in the pool is a pointer to the top of the pool
poolTop = (P<uint8_t*>*)poolAddr;
// The second thing in the pool is the array of freelists
freelists = (block*)(poolAddr + sizeof(*poolTop));
if (clearPool) {
std::memset(poolAddr, 0, poolSize);
for (int i = 0; i < kMaxBlockSize; i++) freelists[i].next.pstore(nullptr);
// The size of the freelists array in bytes is sizeof(block)*kMaxBlockSize
// Align to cache line boundary (DCAS needs 16 byte alignment)
poolTop->pstore(aligned(poolAddr + sizeof(*poolTop) + sizeof(block)*kMaxBlockSize));
}
if (debugOn) printf("Starting EsLoco with poolAddr=%p and poolSize=%ld, up to %p\n", poolAddr, poolSize, poolAddr+poolSize);
}
// Resets the metadata of the allocator back to its defaults
void reset() {
std::memset(poolAddr, 0, sizeof(block)*kMaxBlockSize);
poolTop->pstore(nullptr);
}
// Returns the number of bytes that may (or may not) have allocated objects, from the base address to the top address
uint64_t getUsedSize() {
return poolTop->pload() - poolAddr;
}
// Takes the desired size of the object in bytes.
// Returns pointer to memory in pool, or nullptr.
// Does on average 1 store to persistent memory when re-utilizing blocks.
void* malloc(size_t size) {
P<uint8_t*>* top = (P<uint8_t*>*)(((uint8_t*)poolTop));
block* flists = (block*)(((uint8_t*)freelists));
// Adjust size to nearest (highest) power of 2
uint64_t bsize = highestBit(size + sizeof(block));
if (debugOn) printf("malloc(%ld) requested, block size exponent = %ld\n", size, bsize);
block* myblock = nullptr;
// Check if there is a block of that size in the corresponding freelist
if (flists[bsize].next.pload() != nullptr) {
if (debugOn) printf("Found available block in freelist\n");
// Unlink block
myblock = flists[bsize].next;
flists[bsize].next = myblock->next; // pstore()
} else {
if (debugOn) printf("Creating new block from top, currently at %p\n", top->pload());
// Couldn't find a suitable block, get one from the top of the pool if there is one available
if (top->pload() + (1ULL<<bsize) > poolSize + poolAddr) {
printf("EsLoco: Out of memory for %ld bytes allocation\n", size);
return nullptr;
}
myblock = (block*)top->pload();
top->pstore(top->pload() + (1ULL<<bsize)); // pstore()
myblock->size = bsize; // pstore()
}
if (debugOn) printf("returning ptr = %p\n", (void*)((uint8_t*)myblock + sizeof(block)));
// Return the block, minus the header
return (void*)((uint8_t*)myblock + sizeof(block));
}
// Takes a pointer to an object and puts the block on the free-list.
// Does on average 2 stores to persistent memory.
void free(void* ptr) {
if (ptr == nullptr) return;
block* flists = (block*)(((uint8_t*)freelists));
block* myblock = (block*)((uint8_t*)ptr - sizeof(block));
if (debugOn) printf("free(%p) block size exponent = %ld\n", ptr, myblock->size.pload());
// Insert the block in the corresponding freelist
myblock->next = flists[myblock->size].next; // pstore()
flists[myblock->size].next = myblock; // pstore()
}
};
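// Usage sketch (illustrative only; the PTM below does the equivalent internally):
//   EsLoco<tmtype> alloc;
//   alloc.init(regionAddr + sizeof(PMetadata), regionSize - sizeof(PMetadata));
//   void* p = alloc.malloc(240);  // rounded up to a power-of-two block that also fits the header
//   alloc.free(p);                // pushes the block back onto the free-list for its size class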
// An entry in the persistent write-set (compacted for performance reasons)
struct PWriteSetEntry {
void* addr; // Address of value+sequence to change
uint64_t val; // Desired value to change to
};
// The persistent write-set
struct PWriteSet {
uint64_t numStores {0}; // Number of stores in the writeSet for the current transaction
std::atomic<uint64_t> request {0}; // Can be moved to CLOSED by other threads, using a CAS
PWriteSetEntry plog[TX_MAX_STORES]; // Redo log of stores
// Applies all entries in the log. Called only by recover() which is non-concurrent.
void applyFromRecover() {
// We're assuming that 'val' is the size of a uint64_t
for (uint64_t i = 0; i < numStores; i++) {
*((uint64_t*)plog[i].addr) = plog[i].val;
PWB(plog[i].addr);
}
}
};
// The persistent metadata is a 'header' that contains all the logs and the persistent curTx variable.
// It is located at the start of the persistent region, and the remaining region contains the data available for the allocator to use.
struct PMetadata {
static const uint64_t MAGIC_ID = 0x1337babe;
std::atomic<uint64_t> curTx {seqidx2trans(1,0)};
std::atomic<uint64_t> pad1[15];
tmtype<void*> rootPtrs[MAX_ROOT_POINTERS];
PWriteSet plog[REGISTRY_MAX_THREADS];
uint64_t id {0};
uint64_t pad2 {0};
};
// A single entry in the write-set
struct WriteSetEntry {
void* addr {nullptr}; // Address of value+sequence to change
uint64_t val; // Desired value to change to
WriteSetEntry* next {nullptr}; // Pointer to next node in the (intrusive) hash map
};
extern thread_local bool tl_is_read_only;
// The write-set is a log of the words modified during the transaction.
// This log is an array with an intrusive hashmap of size HASH_BUCKETS.
struct WriteSet {
static const uint64_t MAX_ARRAY_LOOKUP = 30; // Beyond this, it seems to be faster to use the hashmap
WriteSetEntry log[TX_MAX_STORES]; // Redo log of stores
uint64_t numStores {0}; // Number of stores in the writeSet for the current transaction
WriteSetEntry* buckets[HASH_BUCKETS]; // Intrusive HashMap for fast lookup in large(r) transactions
WriteSet() {
numStores = 0;
for (int i = 0; i < HASH_BUCKETS; i++) buckets[i] = &log[TX_MAX_STORES-1];
}
// Copies the current write set to persistent memory
inline void persistAndFlushLog(PWriteSet* const pwset) {
for (uint64_t i = 0; i < numStores; i++) {
pwset->plog[i].addr = log[i].addr;
pwset->plog[i].val = log[i].val;
}
pwset->numStores = numStores;
// Flush the log and the numStores variable
flushFromTo(&pwset->numStores, &pwset->plog[numStores+1]);
}
// Uses the log to flush the modifications to NVM.
// We assume tmtype does not cross cache line boundaries.
inline void flushModifications() {
for (uint64_t i = 0; i < numStores; i++) PWB(log[i].addr);
}
// Each address on a different bucket
inline uint64_t hash(const void* addr) const {
return (((uint64_t)addr) >> 3) % HASH_BUCKETS;
}
// Adds a modification to the redo log
inline void addOrReplace(void* addr, uint64_t val) {
if (tl_is_read_only) tl_is_read_only = false;
const uint64_t hashAddr = hash(addr);
if ((((size_t)addr & (~0xFULL)) != (size_t)addr)) {
printf("Alignment ERROR in addOrReplace() at address %p\n", addr);
assert(false);
}
if (numStores < MAX_ARRAY_LOOKUP) {
// Lookup in array
for (unsigned int idx = 0; idx < numStores; idx++) {
if (log[idx].addr == addr) {
log[idx].val = val;
return;
}
}
} else {
// Lookup in hashmap
WriteSetEntry* be = buckets[hashAddr];
if (be < &log[numStores] && hash(be->addr) == hashAddr) {
while (be != nullptr) {
if (be->addr == addr) {
be->val = val;
return;
}
be = be->next;
}
}
}
// Add to array
WriteSetEntry* e = &log[numStores++];
assert(numStores < TX_MAX_STORES);
e->addr = addr;
e->val = val;
// Add to hashmap
WriteSetEntry* be = buckets[hashAddr];
// Clear if entry is from previous tx
e->next = (be < e && hash(be->addr) == hashAddr) ? be : nullptr;
buckets[hashAddr] = e;
}
// Does a lookup on the WriteSet for an addr.
// If the numStores is lower than MAX_ARRAY_LOOKUP, the lookup is done on the log, otherwise, the lookup is done on the hashmap.
// If it's not in the write-set, return lval.
inline uint64_t lookupAddr(const void* addr, uint64_t lval) {
if (numStores < MAX_ARRAY_LOOKUP) {
// Lookup in array
for (unsigned int idx = 0; idx < numStores; idx++) {
if (log[idx].addr == addr) return log[idx].val;
}
} else {
// Lookup in hashmap
const uint64_t hashAddr = hash(addr);
WriteSetEntry* be = buckets[hashAddr];
if (be < &log[numStores] && hash(be->addr) == hashAddr) {
while (be != nullptr) {
if (be->addr == addr) return be->val;
be = be->next;
}
}
}
return lval;
}
// Returns true if there is a preceding log entry for the same cache line
inline bool lookupCacheLine(void* addr, int myidx) {
size_t cl = (size_t)addr & (~0x3F);
if (numStores < MAX_ARRAY_LOOKUP) {
// Lookup in array
for (unsigned int idx = 0; idx < myidx; idx++) {
if (((size_t)log[idx].addr & (~0x3F)) == cl) return true;
}
} else {
// Lookup in hashmap. If it has the same cache line, it must be in this bucket
WriteSetEntry* be = buckets[hash(addr)];
if (be < &log[numStores]) {
while (be != nullptr) {
if (((size_t)be->addr & (~0x3F)) == cl) return true;
be = be->next;
}
}
}
return false;
}
// Assignment operator, used when making a copy of a WriteSet to help another thread
WriteSet& operator = (const WriteSet &other) {
numStores = other.numStores;
for (uint64_t i = 0; i < numStores; i++) log[i] = other.log[i];
return *this;
}
// Applies all entries in the log as DCASes.
// Seq must match for DCAS to succeed. This method is on the "hot-path".
inline void apply(uint64_t seq, const int tid) {
for (uint64_t i = 0; i < numStores; i++) {
// Use a heuristic to give each thread 8 consecutive DCASes to apply
WriteSetEntry& e = log[(tid*8 + i) % numStores];
tmtype<uint64_t>* tmte = (tmtype<uint64_t>*)e.addr;
uint64_t lval = tmte->val.load(std::memory_order_acquire);
uint64_t lseq = tmte->seq.load(std::memory_order_acquire);
if (lseq < seq) DCAS((uint64_t*)e.addr, lval, lseq, e.val, seq);
}
}
};
// Forward declaration
struct OpData;
// This is used by addOrReplace() to know which OpData instance to use for the current transaction
extern thread_local OpData* tl_opdata;
// Its purpose is to hold thread-local data
struct OpData {
uint64_t curTx {0}; // Used during a transaction to keep the value of curTx read in beginTx() (owner thread only)
uint64_t nestedTrans {0}; // Thread-local: Number of nested transactions
PWriteSet* pWriteSet {nullptr}; // Pointer to the redo log in persistent memory
uint64_t padding[16-3]; // Padding to avoid false-sharing in nestedTrans and curTx
};
// Used to identify aborted transactions
struct AbortedTx {};
static constexpr AbortedTx AbortedTxException {};
class OneFileWF;
extern OneFileWF gOFWF;
/**
* <h1> OneFile STM (Wait-Free) </h1>
*
* OneFile is a Software Transactional Memory with wait-free progress, meant to
* implement wait-free data structures.
* OneFile is a word-based STM and it uses double-compare-and-swap (DCAS).
*
* Right now it has several limitations, some will be fixed in the future, some may be hard limitations of this approach:
* - We can't have stack-allocated tmtype<> variables. For example, we can't create "tmtype<uint64_t> tmp = a;" inside a transaction;
* it will give weird errors because of stack allocation.
* - We need DCAS but it can be emulated with LL/SC or even with single-word CAS
* if we do redirection to a (lock-free) pool with SeqPtrs;
*/
class OneFileWF {
private:
static const bool debug = false;
OpData *opData;
int fd {-1};
HazardErasOF he {REGISTRY_MAX_THREADS};
// Maximum number of times a reader will fail a transaction before turning into an updateTx()
static const int MAX_READ_TRIES = 4;
// Member variables for wait-free consensus
tmtype<TransFunc*>* operations; // We've tried adding padding here but it didn't make a difference
tmtype<uint64_t>* results;
uint64_t padding[16];
public:
EsLoco<tmtype> esloco {};
PMetadata* pmd {nullptr};
std::atomic<uint64_t>* curTx {nullptr}; // Pointer to persistent memory location of curTx (it's in PMetadata)
WriteSet* writeSets; // Two write-sets for each thread
OneFileWF() {
opData = new OpData[REGISTRY_MAX_THREADS];
writeSets = new WriteSet[REGISTRY_MAX_THREADS];
operations = new tmtype<TransFunc*>[REGISTRY_MAX_THREADS];
for (unsigned i = 0; i < REGISTRY_MAX_THREADS; i++) operations[i].operationsInit();
results = new tmtype<uint64_t>[REGISTRY_MAX_THREADS];
for (unsigned i = 0; i < REGISTRY_MAX_THREADS; i++) results[i].resultsInit();
mapPersistentRegion(PFILE_NAME, PREGION_ADDR, PREGION_SIZE);
}
~OneFileWF() {
delete[] opData;
delete[] writeSets;
delete[] operations;
delete[] results;
}
static std::string className() { return "OneFilePTM-WF"; }
void mapPersistentRegion(const char* filename, uint8_t* regionAddr, const uint64_t regionSize) {
// Check that the header with the logs leaves at least half the memory available to the user
if (sizeof(PMetadata) > regionSize/2) {
printf("ERROR: the size of the logs in persistent memory is so large that it takes more than half the whole persistent memory\n");
printf("Please reduce some of the settings in OneFilePTM.hpp and try again\n");
assert(false);
}
bool reuseRegion = false;
// Check if the file already exists or not
struct stat buf;
if (stat(filename, &buf) == 0) {
// File exists
fd = open(filename, O_RDWR|O_CREAT, 0755);
assert(fd >= 0);
reuseRegion = true;
} else {
// File doesn't exist
fd = open(filename, O_RDWR|O_CREAT, 0755);
assert(fd >= 0);
if (lseek(fd, regionSize-1, SEEK_SET) == -1) {
perror("lseek() error");
}
if (write(fd, "", 1) == -1) {
perror("write() error");
}
}
// mmap() memory range
void* got_addr = (uint8_t *)mmap(regionAddr, regionSize, (PROT_READ | PROT_WRITE), MAP_SHARED, fd, 0);
if (got_addr == MAP_FAILED || got_addr != regionAddr) {
printf("got_addr = %p instead of %p\n", got_addr, regionAddr);
perror("ERROR: mmap() is not working !!! ");
assert(false);
}
// Check if the header is consistent and only then can we attempt to re-use, otherwise we clear everything that's there
pmd = reinterpret_cast<PMetadata*>(regionAddr);
if (reuseRegion) reuseRegion = (pmd->id == PMetadata::MAGIC_ID);
// Map pieces of persistent Metadata to pointers in volatile memory
for (uint64_t i = 0; i < REGISTRY_MAX_THREADS; i++) opData[i].pWriteSet = &(pmd->plog[i]);
curTx = &(pmd->curTx);
// If the file has just been created or if the header is not consistent, clear everything.
// Otherwise, re-use and recover to a consistent state.
if (reuseRegion) {
esloco.init(regionAddr+sizeof(PMetadata), regionSize-sizeof(PMetadata), false);
//recover(); // Not needed on x86
} else {
// Start by resetting all tmtypes::seq in the metadata region
std::memset(regionAddr, 0, sizeof(PMetadata));
new (regionAddr) PMetadata();
esloco.init(regionAddr+sizeof(PMetadata), regionSize-sizeof(PMetadata), true);
PFENCE();
pmd->id = PMetadata::MAGIC_ID;
PWB(&pmd->id);
PFENCE();
}
}
// My transaction was successful; it's my duty to clean up any retired objects.
// This is called by the owner thread when the transaction succeeds, to pass
// the retired objects to Hazard Eras. We can't delete the objects
// immediately because there might be other threads trying to apply our log
// which may (or may not) contain addresses inside the objects in this list.
void retireRetiresFromLog(OpData& myopd, const int tid) {
uint64_t lseq = trans2seq(curTx->load(std::memory_order_acquire));
// Start a cleaning phase, scanning to see which ones can be removed
he.clean(lseq, tid);
}
// Progress condition: wait-free population-oblivious
// Attempts to publish our write-set (commit the transaction) and then applies the write-set.
// Returns true if my transaction was committed.
inline bool commitTx(OpData& myopd, const int tid) {
// If it's a read-only transaction, then commit immediately
if (writeSets[tid].numStores == 0) return true;
// Give up if the curTx has changed since our transaction started
if (myopd.curTx != curTx->load(std::memory_order_acquire)) return false;
// Move our request to OPEN, using the sequence of the previous transaction +1
const uint64_t seq = trans2seq(myopd.curTx);
const uint64_t newTx = seqidx2trans(seq+1,tid);
myopd.pWriteSet->request.store(newTx, std::memory_order_release);
// Copy the write-set to persistent memory and flush it
writeSets[tid].persistAndFlushLog(myopd.pWriteSet);
// Attempt to CAS curTx to our OpDesc instance (tid) incrementing the seq in it
uint64_t lcurTx = myopd.curTx;
if (debug) printf("tid=%i attempting CAS on curTx from (%ld,%ld) to (%ld,%ld)\n", tid, trans2seq(lcurTx), trans2idx(lcurTx), seq+1, (uint64_t)tid);
if (!curTx->compare_exchange_strong(lcurTx, newTx)) return false;
PWB(curTx);
// Execute each store in the write-set using DCAS() and close the request
helpApply(newTx, tid);
retireRetiresFromLog(myopd, tid);
// We would need a PSYNC() here to provide durable linearizability, but the CAS of the state in helpApply() acts as a PSYNC() (on x86).
if (debug) printf("Committed transaction (%ld,%ld) with %ld stores\n", seq+1, (uint64_t)tid, writeSets[tid].numStores);
return true;
}
// Progress condition: wait-free (bounded by the number of threads)
// Applies a mutative transaction or gets another thread with an ongoing
// transaction to apply it.
// If three 'seq' increments have passed since the transaction in which we published
// our function, then the worst-case scenario is: the first transaction does not
// see our function; the second transaction transforms our function
// but doesn't apply the corresponding write-set; the third transaction
// guarantees that the log of the second transaction is applied.
inline void innerUpdateTx(OpData& myopd, TransFunc* funcptr, const int tid) {
++myopd.nestedTrans;
if (debug) printf("updateTx(tid=%d)\n", tid);
// We need an era from before the 'funcptr' is announced, so as to protect it
uint64_t firstEra = trans2seq(curTx->load(std::memory_order_acquire));
operations[tid].rawStore(funcptr, results[tid].getSeq());
tl_opdata = &myopd;
// Check 3x for the completion of our function because we don't have a fence
// on operations[tid].rawStore(), otherwise it would be just 2x.
for (int iter = 0; iter < 4; iter++) {
// An update transaction is read-only until it does the first store()
tl_is_read_only = true;
// Clear the logs of the previous transaction
writeSets[tid].numStores = 0;
myopd.curTx = curTx->load(std::memory_order_acquire);
// Optimization: if my request is answered, then my tx is committed
if (results[tid].getSeq() > operations[tid].getSeq()) break;
helpApply(myopd.curTx, tid);
// Reset the write-set after (possibly) helping another transaction complete
writeSets[tid].numStores = 0;
// Use HE to protect the TransFunc we're going to access
he.set(myopd.curTx, tid);
if (myopd.curTx != curTx->load()) continue;
try {
if (!transformAll(myopd.curTx, tid)) continue;
} catch (AbortedTx&) {
continue;
}
if (commitTx(myopd, tid)) break;
}
tl_opdata = nullptr;
--myopd.nestedTrans;
he.clear(tid);
retireMyFunc(tid, funcptr, firstEra);
}
// Update transaction with non-void return value
template<typename R, class F> static R updateTx(F&& func) {
const int tid = ThreadRegistry::getTID();
OpData& myopd = gOFWF.opData[tid];
if (myopd.nestedTrans > 0) return func();
// Copy the lambda to a std::function<> and announce a request with the pointer to it
gOFWF.innerUpdateTx(myopd, new TransFunc([func] () { return (uint64_t)func(); }), tid);
return (R)gOFWF.results[tid].pload();
}
// Update transaction with void return value
template<class F> static void updateTx(F&& func) {
const int tid = ThreadRegistry::getTID();
OpData& myopd = gOFWF.opData[tid];
if (myopd.nestedTrans > 0) {
func();
return;
}
// Copy the lambda to a std::function<> and announce a request with the pointer to it
gOFWF.innerUpdateTx(myopd, new TransFunc([func] () { func(); return 0; }), tid);
}
// Progress condition: wait-free (bounded by the number of threads + MAX_READ_TRIES)
template<typename R, class F> R readTransaction(F&& func) {
const int tid = ThreadRegistry::getTID();
OpData& myopd = opData[tid];
if (myopd.nestedTrans > 0) return func();
++myopd.nestedTrans;
tl_opdata = &myopd;
tl_is_read_only = true;
if (debug) printf("readTx(tid=%d)\n", tid);
R retval {};
writeSets[tid].numStores = 0;
for (int iter = 0; iter < MAX_READ_TRIES; iter++) {
myopd.curTx = curTx->load(std::memory_order_acquire);
helpApply(myopd.curTx, tid);
// Reset the write-set after (possibly) helping another transaction complete
writeSets[tid].numStores = 0;
// Use HE to protect the objects we're going to access during the simulation
he.set(myopd.curTx, tid);
if (myopd.curTx != curTx->load()) continue;
try {
retval = func();
} catch (AbortedTx&) {
continue;
}
--myopd.nestedTrans;
tl_opdata = nullptr;
he.clear(tid);
return retval;
}
if (debug) printf("readTx() executed MAX_READ_TRIES, posing as updateTx()\n");
--myopd.nestedTrans;
// Tried too many times unsuccessfully, pose as an updateTx()
return updateTx<R>(func);
}
template<typename R, typename F> static R readTx(F&& func) { return gOFWF.readTransaction<R>(func); }
template<typename F> static void readTx(F&& func) { gOFWF.readTransaction(func); }
template <typename T, typename... Args> static T* tmNew(Args&&... args) {
//template <typename T> static T* tmNew() {
T* ptr = (T*)gOFWF.esloco.malloc(sizeof(T));
//new (ptr) T; // new placement
new (ptr) T(std::forward<Args>(args)...);
return ptr;
}
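// Usage sketch of the user-facing API (hypothetical Counter type, illustrative only):
//   struct Counter { tmtype<uint64_t> val {0}; };
//   Counter* c = pofwf::OneFileWF::updateTx<Counter*>([] () {
//       return pofwf::OneFileWF::tmNew<Counter>();            // allocate in PM, inside a tx
//   });
//   pofwf::OneFileWF::updateTx([c] () { c->val = c->val + 1; });                       // mutative tx
//   uint64_t v = pofwf::OneFileWF::readTx<uint64_t>([c] () { return (uint64_t)c->val; }); // read-only tx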