amd/device-libs/asanrtl/src/dm.cl (112 changes: 55 additions & 57 deletions)
@@ -9,8 +9,6 @@
#include "asan_util.h"
#include "shadow_mapping.h"

#define OPTNONE __attribute__((optnone))

static const __constant uchar kAsanHeapLeftRedzoneMagic = (uchar)0xfa;
static const __constant uint kAsanHeapLeftRedzoneMagicx4 = 0xfafafafaU;
static const __constant ulong kAsanHeapLeftRedzoneMagicx8 = 0xfafafafafafafafaUL;
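
The byte magic 0xfa is replicated into 4- and 8-byte patterns so redzone shadow can be written with word-wide stores instead of a byte loop. A minimal sketch of that use, with a hypothetical helper name (only the constants above come from the source):

    // Fill n shadow bytes of left-redzone poison, assuming the pointer is
    // 8-byte aligned and n is a multiple of 8.
    static void fill_redzone_shadow(__global ulong *s, ulong n)
    {
        for (ulong i = 0; i != n / 8; ++i)
            s[i] = kAsanHeapLeftRedzoneMagicx8;
    }
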
@@ -28,15 +26,17 @@ extern ulong __ockl_devmem_request(ulong addr, ulong size);
// Magic at beginning of allocation
#define ALLOC_MAGIC 0xfedcba1ee1abcdefUL

#define AS(P,V) __opencl_atomic_store(P, V, memory_order_relaxed, memory_scope_device)
#define AL(P) __opencl_atomic_load(P, memory_order_relaxed, memory_scope_device)
#define AA(P,V) __opencl_atomic_fetch_add(P, V, memory_order_relaxed, memory_scope_device)
#define AN(P,V) __opencl_atomic_fetch_and(P, V, memory_order_relaxed, memory_scope_device)
#define AO(P,V) __opencl_atomic_fetch_or(P, V, memory_order_relaxed, memory_scope_device)
#define ACE(P,E,V) __opencl_atomic_compare_exchange_strong(P, E, V, memory_order_relaxed, memory_order_relaxed, memory_scope_device)
#define MEMORD memory_order_relaxed
#define RF() __builtin_amdgcn_fence(__ATOMIC_RELEASE, "agent", "global")
#define ARF() __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "agent", "global")

#define AS(P,V) __opencl_atomic_store(P, V, MEMORD, memory_scope_device)
#define AL(P) __opencl_atomic_load(P, MEMORD, memory_scope_device)
#define AA(P,V) __opencl_atomic_fetch_add(P, V, MEMORD, memory_scope_device)
#define AN(P,V) __opencl_atomic_fetch_and(P, V, MEMORD, memory_scope_device)
#define AO(P,V) __opencl_atomic_fetch_or(P, V, MEMORD, memory_scope_device)
#define ACE(P,E,V) __opencl_atomic_compare_exchange_strong(P, E, V, MEMORD, MEMORD, memory_scope_device)
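
This change centralizes the memory order in MEMORD (relaxed) and adds explicit fence macros: RF() is an agent-scope release fence and ARF() an acquire-release fence. Ordering therefore comes from fences placed next to the relaxed atomics rather than from the atomics themselves. A sketch of the resulting publication idiom (the function and parameter names are illustrative):

    // Publish an object through a relaxed store: the preceding release
    // fence makes all earlier stores agent-visible first. Mirrors the
    // RF(); AS(...) sequence used in put_free_slab below.
    static void publish(__global atomic_ulong *slot, ulong obj)
    {
        RF();          // flush prior plain stores out to agent scope
        AS(slot, obj); // relaxed store that makes the object reachable
    }
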

// An allocation
#define ALLOC_HEADER_BYTES 32
typedef struct alloc_struct {
@@ -55,6 +55,7 @@ typedef struct alloc_struct {
#define SLAB_BYTES (1UL << 21)
#define SLAB_THRESHOLD (SLAB_BYTES / 64)
#define SLAB_HEADER_BYTES 32
#define SLAB_USEABLE_BYTES (SLAB_BYTES - SLAB_HEADER_BYTES)
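
For concrete numbers: SLAB_BYTES is 1UL << 21 = 2 MiB, making SLAB_THRESHOLD 32 KiB (presumably the cutoff below which requests are served from slabs) and SLAB_USEABLE_BYTES 2097152 - 32 = 2097120 bytes of allocatable space per slab.
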

// Assume SLAB_ALIGN so low 12 bits are already clear
#define SLAB_SHIFT 6
@@ -63,18 +64,21 @@ typedef struct alloc_struct {
#define LINE 128
#define PAD(N,M) ulong pad##N[LINE/8 - M];

#define F_POISON_NEEDED 0x01
#define F_POISON_PENDING 0x02
#define F_UNREADY 0x04
#define F_MASK (F_POISON_NEEDED | F_POISON_PENDING | F_UNREADY)
#define VF_POISON_NEEDED 0x01
#define VF_POISON_PENDING 0x02
#define VF_UNREADY 0x04
#define VF_MASK (VF_POISON_NEEDED | VF_POISON_PENDING | VF_UNREADY)

#define VABSHIFT 32
#define VRBSHIFT 4
#define VRBMASK (SLAB_BYTES - 1UL)

// A slab of memory used to provide malloc returned blocks
typedef struct slab_s {
atomic_ulong next; // link to next slab on queue chain, must be first
atomic_ulong ap; // Pointer to next allocation and flags
atomic_uint rb; // returned bytes
uint pad;
atomic_ulong v; // Allocated bytes, returned bytes, and flags
atomic_ulong sid; // slab ID
ulong pad;
ulong space[(SLAB_BYTES-SLAB_HEADER_BYTES)/8]; // Space for allocations. Must be aligned 16
} slab_t;
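
The old ap/rb pair is folded into the single 64-bit v word so the cursor, the returned-byte count, and the state flags can be snapshotted and updated with one atomic. Reading the shifts and masks above, bits 63..32 hold the allocation cursor, the 21 bits above VRBSHIFT hold the returned-byte count, and the low bits hold the VF_* flags. A sketch of that decoding (the helper names are illustrative, not from the source):

    static ulong v_cursor(ulong v)   { return v >> VABSHIFT; }             // bytes already handed out
    static ulong v_returned(ulong v) { return (v >> VRBSHIFT) & VRBMASK; } // bytes freed back
    static ulong v_flags(ulong v)    { return v & VF_MASK; }               // VF_* state bits
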

@@ -196,7 +200,6 @@ slab_pause(void)


// Intended to be called from only one lane of a wave
OPTNONE
NO_SANITIZE_ADDR
static void
put_free_slab(__global heap_t *hp, __global slab_t *sp)
@@ -206,6 +209,7 @@ put_free_slab(__global heap_t *hp, __global slab_t *sp)
for (;;) {
ulong top = AL(&lp->top);
AS(&sp->next, (ulong)slabptr(top));
RF();
if (ACE(&lp->top, &top, addcnt((ulong)sp, top))) {
return;
}
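
The RF() added before the compare-exchange is the release half of the publication: the store to sp->next (and any earlier writes into the slab) must be agent-visible before the CAS can make the slab reachable from the list head. A sketch of the pairing as this file appears to use it:

    // writer (push):  AS(&sp->next, ...);  RF();  ACE(&lp->top, ...)
    // reader (pop):   AL(&lp->top);  AL(&sp->next);  ACE(&lp->top, ...)
    // With relaxed atomics throughout, the agent-scope release fence is
    // what keeps a popping wave from reading a stale sp->next.
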
@@ -228,26 +232,14 @@ get_free_slab(__global heap_t *hp)
__global slab_t *sp = slabptr(top);
if (sp) {
ulong next = AL(&sp->next);
if (ACE(&lp->top, &top, addcnt(next, top)))
if (ACE(&lp->top, &top, addcnt(next, top))) {
return sp;
}
} else {
return 0;
}
slab_pause();
}

}
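
get_free_slab pops from a Treiber-style lock-free stack. The top word evidently packs a version counter alongside the slab pointer (see slabptr() and addcnt(), defined in a folded section): bumping the counter on every successful exchange prevents the classic ABA hazard, where a slab is popped, recycled, and pushed back while another wave still holds the old top value. A purely hypothetical sketch of such helpers, since the real definitions are not in this hunk:

    // Illustrative only: pack a 48-bit pointer with a 16-bit version counter.
    #define CNT_ONE  (1UL << 48)
    #define PTR_MASK (CNT_ONE - 1UL)
    static __global slab_t *slabptr(ulong top) {
        return (__global slab_t *)(top & PTR_MASK);
    }
    static ulong addcnt(ulong ptr, ulong oldtop) {
        return (ptr & PTR_MASK) | ((oldtop + CNT_ONE) & ~PTR_MASK);
    }
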

NO_SANITIZE_ADDR
static void
ready_slab(__global slab_t *sp)
{
AS(&sp->rb, 0U);
if (!(AL(&sp->ap) & (ulong)(F_POISON_PENDING | F_POISON_NEEDED))) {
AS(&sp->ap, (ulong)sp + SLAB_HEADER_BYTES);
} else {
AN(&sp->ap, ~(ulong)F_UNREADY);
}
}

NO_SANITIZE_ADDR
@@ -258,6 +250,7 @@ unpublish_allocation(__global alloc_t *ap, ulong pc)
__global uchar *s = (__global uchar *)MEM_TO_SHADOW((ulong)ap - arz);
__builtin_memset(s, kAsanHeapFreeMagic, ap->asz / SHADOW_GRANULARITY);
ap->pc = pc;
RF();
}
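
unpublish_allocation retires the block in shadow memory: in the usual ASan mapping each shadow byte covers SHADOW_GRANULARITY (8) application bytes, so an allocation with total footprint asz needs asz / 8 bytes of free-magic, starting at the shadow of the left redzone. The trailing RF() makes the shadow stores and the recorded pc agent-visible before the caller credits the freed bytes. A worked instance under that mapping (the numbers are illustrative):

    // footprint asz = 256, left redzone arz = 32:
    //   s = MEM_TO_SHADOW((ulong)ap - 32)
    //   __builtin_memset(s, kAsanHeapFreeMagic, 256 / 8);  // 32 shadow bytes
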

// Free a slab based allocation
@@ -276,8 +269,8 @@ slab_free(__global alloc_t *ap, ulong pc)
uint sz = __ockl_alisa_u32(ap->asz);
uint aid = __ockl_activelane_u32();
if (aid == 0) {
uint rb = AA(&sp->rb, sz) + sz;
if (rb == SLAB_BYTES - SLAB_HEADER_BYTES) {
ulong v = AA(&sp->v, (ulong)sz << VRBSHIFT) + ((ulong)sz << VRBSHIFT);
if (((v >> VRBSHIFT) & VRBMASK) == SLAB_USEABLE_BYTES) {
put_free_slab(hp, sp);
}
}
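
slab_free folds the wave's freed footprints into one scanned sum, and a single lane credits it to the returned-bytes field of v in one atomic add. When that field reaches SLAB_USEABLE_BYTES, every allocatable byte has come back and that lane recycles the slab. A worked check with illustrative numbers:

    // 65535 frees of 32-byte footprint: 65535 * 32 == 2097120
    //                                             == SLAB_USEABLE_BYTES,
    // so exactly one credit observes the terminal count and calls
    // put_free_slab -- the slab cannot be recycled twice.
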
@@ -404,8 +397,7 @@ try_new_slab(__global heap_t *hp)
__global slab_t *sp = obtain_new_slab(hp);
if (sp) {
AS(&sp->next, 0UL);
AS(&sp->rb, 0U);
AS(&sp->ap, (ulong)sp + (ulong)SLAB_HEADER_BYTES + (ulong)(F_UNREADY | F_POISON_PENDING | F_POISON_NEEDED));
AS(&sp->v, (ulong)(VF_UNREADY | VF_POISON_PENDING | VF_POISON_NEEDED));
#if defined SLAB_IDENTITY
AS(&sp->sid, AA(&hp->num_slab_allocations, 1UL));
#else
@@ -428,7 +420,6 @@ new_slab_wait(__global heap_t *hp)
}

// Called by a single workitem
OPTNONE
NO_SANITIZE_ADDR
static __global slab_t *
get_current_slab(__global heap_t *hp)
@@ -453,17 +444,18 @@ get_current_slab(__global heap_t *hp)
__global slab_t *fs = get_free_slab(hp);
if (fs) {
if (ACE(&hp->cs, &cs, (ulong)fs)) {
ready_slab(fs);
AN(&fs->v, (ulong)(VF_POISON_PENDING | VF_POISON_NEEDED));
return fs;
}
put_free_slab(hp, fs);
continue;
}


__global slab_t *ns = try_new_slab(hp);
if ((ulong)ns > (ulong)SLAB_BUSY) {
if (ACE(&hp->cs, &cs, (ulong)ns)) {
AN(&ns->ap, ~(ulong)F_UNREADY);
AN(&ns->v, (ulong)(VF_POISON_PENDING | VF_POISON_NEEDED));
return ns;
}
put_free_slab(hp, ns);
@@ -488,7 +480,7 @@ poison_slab(__global slab_t *sp, int aid, int na)
RF();

if (!aid)
AN(&sp->ap, ~(ulong)F_POISON_PENDING);
AN(&sp->v, ~(ulong)VF_POISON_PENDING);
}
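
poison_slab is the middle step of a handshake that lets exactly one wave write a new slab's initial shadow while later arrivals wait. Piecing together try_new_slab, get_current_slab, and slab_malloc from this diff, the flag lifecycle appears to be:

    // try_new_slab:     v = VF_UNREADY | VF_POISON_PENDING | VF_POISON_NEEDED
    // get_current_slab: AND with (PENDING | NEEDED) zeroes both counters and
    //                   clears VF_UNREADY; the poison flags survive
    // slab_malloc:      waves race to clear VF_POISON_NEEDED; the winner runs
    //                   poison_slab, the rest spin in slab_pause()
    // poison_slab:      writes redzone shadow, RF(), clears VF_POISON_PENDING
    // flags now zero:   allocation proceeds via the cursor in the high bits
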

NO_SANITIZE_ADDR
@@ -542,41 +534,47 @@ slab_malloc(ulong lsz, ulong pc)

ulong o = (ulong)__ockl_alisa_u32(asz);

ulong ap = 0;
ulong v = 0;
if (!aid)
ap = AL(&cs->ap);
ap = first(ap);
v = AL(&cs->v);
v = first(v);

if (ap & (ulong)F_MASK) {
ulong p = 0;
if (v & (ulong)VF_MASK) {
ulong vv = 0;
if (!aid)
p = AN(&cs->ap, ~(ulong)F_POISON_NEEDED);
p = first(p);
vv = AN(&cs->v, ~(ulong)VF_POISON_NEEDED);
vv = first(vv);

if (p & (ulong)F_POISON_NEEDED)
if (vv & (ulong)VF_POISON_NEEDED)
poison_slab(cs, aid, active_lane_count());
else
slab_pause();
} else {
ulong p = 0;
ulong vv = 0;
if (!aid)
p = AA(&cs->ap, o);
p = first(p);
vv = AA(&cs->v, o << VABSHIFT);
vv = first(vv);

if (!(p & (ulong)F_MASK)) {
if (p + o <= (ulong)cs + SLAB_BYTES) {
ret = publish_allocation(p + o - asz, (ulong)cs, pc, asz, arz, usz);
if (!(vv & (ulong)VF_MASK)) {
ulong b = vv >> VABSHIFT;
if (b + o <= SLAB_USEABLE_BYTES) {
if (b + o == SLAB_USEABLE_BYTES) {
ulong e = (ulong)cs;
ACE(&hp->cs, &e, 0UL);
AO(&cs->v, (ulong)VF_UNREADY);
}
ret = publish_allocation((ulong)cs + SLAB_HEADER_BYTES + b + o - asz, (ulong)cs, pc, asz, arz, usz);
go = 0;
} else {
if (!__ockl_activelane_u32()) {
ulong e = (ulong)cs;
ACE(&hp->cs, &e, 0UL);
AO(&cs->ap, (ulong)F_UNREADY);
AO(&cs->v, (ulong)VF_UNREADY);
}
if (p + o - asz < (ulong)cs + SLAB_BYTES) {
uint unused = (uint)((ulong)cs + SLAB_BYTES - (p + o - asz));
uint rb = AA(&cs->rb, unused) + unused;
if (rb == SLAB_BYTES - SLAB_HEADER_BYTES) {
if (b + o - asz < SLAB_USEABLE_BYTES) {
ulong pad = SLAB_USEABLE_BYTES - (b + o - asz);
ulong vvv = AA(&cs->v, pad << VRBSHIFT) + (pad << VRBSHIFT);
if (((vvv >> VRBSHIFT) & VRBMASK) == SLAB_USEABLE_BYTES) {
put_free_slab(hp, cs);
}
}
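
The overflow path retires a slab without losing accounting. Here b is the pre-reservation cursor and o the lane's scanned reservation: when b + o lands exactly on SLAB_USEABLE_BYTES the slab is detached and marked unready with every byte properly allocated, and when it overshoots, the one lane whose sub-block straddles the end credits the unclaimed tail as returned bytes so later frees can still drive the count to SLAB_USEABLE_BYTES. Worked numbers (illustrative):

    // SLAB_USEABLE_BYTES = 2097120. Suppose a lane's sub-block starts at
    // b + o - asz = 2097100 and ends past the limit. It credits
    //   pad = 2097120 - 2097100 = 20
    // via AA(&cs->v, pad << VRBSHIFT); allocated-then-freed bytes plus pad
    // then sum to exactly SLAB_USEABLE_BYTES, triggering put_free_slab once.
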