include/ofi_atomic_queue: properly align atomic values
The atomic queue implementation uses atomic operations to allow
threads/processes to claim and commit entries into the queue.
Because of the use of atomics, padding is essential to give each
atomic value a dedicated cache line (atomics lock the cache line
for the duration of the operation).

The atomic queue already had padding between the atomics to make
sure they were not on the same cache line as each other, but this
did not take into account the other fields that might also be
accessed by other processes, such as the size and size mask. It
could also cause issues with any fields immediately preceding the
queue, as well as with fields within the entrytype.

This patch requires the AQ base address to be cache line aligned
and pads the atomic values so that each is guaranteed its own
cache line.

This patch also changes the shm AQ alignment to 64 to adhere to this
new requirement.

Signed-off-by: Alexia Ingerson <alexia.ingerson@intel.com>
aingerson authored and j-xiong committed Oct 31, 2023
1 parent ef95638 commit fa38bd2
Showing 2 changed files with 17 additions and 7 deletions.
20 changes: 15 additions & 5 deletions include/ofi_atomic_queue.h
@@ -94,26 +94,36 @@ extern "C" {
 
 #define OFI_CACHE_LINE_SIZE (64)
 
+/*
+ * Base address of atomic queue must be cache line aligned to maximize atomic
+ * value performance benefits
+ */
 #define OFI_DECLARE_ATOMIC_Q(entrytype, name)			\
 struct name ## _entry {						\
 	ofi_atomic64_t seq;					\
 	bool noop;						\
 	entrytype buf;						\
-};								\
+} __attribute__((__aligned__(64)));				\
 								\
 struct name {							\
-	int size;						\
-	int size_mask;						\
 	ofi_atomic64_t write_pos;				\
-	char pad0[OFI_CACHE_LINE_SIZE];				\
+	uint8_t pad0[OFI_CACHE_LINE_SIZE -			\
+		     sizeof(ofi_atomic64_t)];			\
 	ofi_atomic64_t read_pos;				\
-	char pad1[OFI_CACHE_LINE_SIZE];				\
+	uint8_t pad1[OFI_CACHE_LINE_SIZE -			\
+		     sizeof(ofi_atomic64_t)];			\
+	int size;						\
+	int size_mask;						\
+	uint8_t pad2[OFI_CACHE_LINE_SIZE -			\
+		     (sizeof(int) * 2)];			\
 	struct name ## _entry entry[];				\
-};								\
+} __attribute__((__aligned__(64)));				\
 								\
 static inline void name ## _init(struct name *aq, size_t size)	\
 {								\
 	size_t i;						\
 	assert(size == roundup_power_of_two(size));		\
+	assert(!((uintptr_t) aq % OFI_CACHE_LINE_SIZE));	\
 	aq->size = size;					\
 	aq->size_mask = aq->size - 1;				\
 	ofi_atomic_initialize64(&aq->write_pos, 0);		\
4 changes: 2 additions & 2 deletions prov/shm/src/smr_util.c
@@ -108,8 +108,8 @@ size_t smr_calculate_size_offsets(size_t tx_count, size_t rx_count,
 	tx_size = roundup_power_of_two(tx_count);
 	rx_size = roundup_power_of_two(rx_count);
 
-	/* Align cmd_queue offset to 128-bit boundary. */
-	cmd_queue_offset = ofi_get_aligned_size(sizeof(struct smr_region), 16);
+	/* Align cmd_queue offset to cache line */
+	cmd_queue_offset = ofi_get_aligned_size(sizeof(struct smr_region), 64);
 	resp_queue_offset = cmd_queue_offset + sizeof(struct smr_cmd_queue) +
 			    sizeof(struct smr_cmd_queue_entry) * rx_size;
 	inject_pool_offset = resp_queue_offset + sizeof(struct smr_resp_queue) +
