Merged
6 changes: 5 additions & 1 deletion base/runtime_internals.jl
@@ -565,6 +565,10 @@ struct DataTypeLayout
# fielddesc_type : 2;
# arrayelem_isboxed : 1;
# arrayelem_isunion : 1;
# arrayelem_isatomic : 1;
# arrayelem_islocked : 1;
# isbitsegal : 1;
# padding : 8;
end

"""
@@ -637,7 +641,7 @@ function datatype_isbitsegal(dt::DataType)
@_foldable_meta
dt.layout == C_NULL && throw(UndefRefError())
flags = unsafe_load(convert(Ptr{DataTypeLayout}, dt.layout)).flags
return (flags & (1<<5)) != 0
return (flags & (1<<7)) != 0
end

"""
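Why the isbitsegal test moves from bit 5 to bit 7: two new flag bits are inserted ahead of it. A sketch of the resulting bit positions, as Julia comments (the haspadding bit is inferred from the field widths summing to 16 bits; it is not visible in the truncated hunk above):

# DataTypeLayout.flags bit positions after this change:
#   bit 0      haspadding           (inferred; not shown in the hunk above)
#   bits 1-2   fielddesc_type
#   bit 3      arrayelem_isboxed
#   bit 4      arrayelem_isunion
#   bit 5      arrayelem_isatomic   (new; isbitsegal previously lived here)
#   bit 6      arrayelem_islocked   (new)
#   bit 7      isbitsegal           (hence the 1<<5 -> 1<<7 change)
#   bits 8-15  padding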
13 changes: 7 additions & 6 deletions src/cgutils.cpp
@@ -3299,7 +3299,7 @@ static Value *emit_genericmemoryelsize(jl_codectx_t &ctx, Value *v, jl_value_t *
if (jl_is_genericmemoryref_type(sty))
sty = (jl_datatype_t*)jl_field_type_concrete(sty, 1);
size_t sz = sty->layout->size;
if (sty->layout->flags.arrayelem_isunion)
if (sty->layout->flags.arrayelem_isunion && add_isunion)
sz++;
auto elsize = ConstantInt::get(ctx.types().T_size, sz);
return elsize;
@@ -4716,8 +4716,8 @@ static jl_cgval_t emit_memoryref_direct(jl_codectx_t &ctx, const jl_cgval_t &mem

} else {
data = emit_genericmemoryptr(ctx, boxmem, layout, 0);
Type *elty = isboxed ? ctx.types().T_prjlvalue : julia_type_to_llvm(ctx, jl_tparam1(typ));
data = ctx.builder.CreateInBoundsGEP(elty, data, idx0);
idx0 = ctx.builder.CreateMul(idx0, emit_genericmemoryelsize(ctx, boxmem, mem.typ, false), "", true, true);
data = ctx.builder.CreatePtrAdd(data, idx0);
}

return _emit_memoryref(ctx, boxmem, data, layout, typ);
@@ -4820,9 +4820,10 @@ static jl_cgval_t emit_memoryref(jl_codectx_t &ctx, const jl_cgval_t &ref, jl_cg
setName(ctx.emission_context, ovflw, "memoryref_ovflw");
}
#endif
Type *elty = isboxed ? ctx.types().T_prjlvalue : julia_type_to_llvm(ctx, jl_tparam1(ref.typ));
gbaraldi (Member) commented on Jul 20, 2025:
Is there no way to implement this? I guess julia_type_to_llvm doesn't make the lock FCA? As I said, this will pessimize vectorization a lot.

vtjnash (Member, Author) commented on Jul 21, 2025:
No, there is not. This is why GEP was such a big mistake that is slowly getting replaced by ptradd upstream. (I realize this reverts #57389, including its copy in #58768)

(Member) commented:
Yeah, the fact that this pessimizes vectorization is just that LLVM models complex GEPs differently from muls/shifts, even though the backend will probably generate similar code (it shouldn't be hard to pattern match).

bbrehm commented on Jul 21, 2025:
If there is a potential problem for the vectorizer, can't we just use the old GEP code when the memory is not atomic?

Performance with locks is abysmal anyway; the remaining special case where the GEP code is invalid is something like NTuple{3,Int8}, which must be represented as 4 bytes for AtomicMemory (see the sketch after this diff).

In view of current experience with LLVM refusing to optimize atomics, I view it as unlikely that the autovectorizer dares to touch them, and we can revisit this in the future if anybody complains.

(Member) commented:
Yeah, that sounds like a good idea for now.

newdata = ctx.builder.CreateGEP(elty, data, offset);
setName(ctx.emission_context, newdata, "memoryref_data_offset");
boffset = ctx.builder.CreateMul(offset, elsz);
setName(ctx.emission_context, boffset, "memoryref_byteoffset");
newdata = ctx.builder.CreateGEP(getInt8Ty(ctx.builder.getContext()), data, boffset);
setName(ctx.emission_context, newdata, "memoryref_data_byteoffset");
(void)boffset; // LLVM is very bad at handling GEP with types different from the load
if (bc) {
BasicBlock *failBB, *endBB;
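The NTuple{3,Int8} case from the review thread, restated as a minimal Julia sketch (illustrative only; AtomicMemory is assumed available, and the 4-byte stride is taken from bbrehm's comment above rather than queried through any API):

sizeof(NTuple{3,Int8})   # 3: the value itself occupies 3 bytes
# AtomicMemory{NTuple{3,Int8}} stores each element with a 4-byte stride so a
# single hardware atomic can cover it. A typed LLVM GEP over the plain 3-byte
# element type would then step to the wrong address for element i; scaling i
# by the layout's element size and adding raw bytes (mul + ptradd, as in the
# diff above) is always correct.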
19 changes: 9 additions & 10 deletions src/codegen.cpp
@@ -3862,8 +3862,8 @@ static bool emit_f_opmemory(jl_codectx_t &ctx, jl_cgval_t *ret, jl_value_t *f,
const jl_datatype_layout_t *layout = ((jl_datatype_t*)mty_dt)->layout;
bool isboxed = layout->flags.arrayelem_isboxed;
bool isunion = layout->flags.arrayelem_isunion;
bool isatomic = kind == (jl_value_t*)jl_atomic_sym;
bool needlock = isatomic && layout->size > MAX_ATOMIC_SIZE;
bool isatomic = layout->flags.arrayelem_isatomic || layout->flags.arrayelem_islocked;
bool needlock = layout->flags.arrayelem_islocked;
size_t elsz = layout->size;
size_t al = layout->alignment;
if (al > JL_HEAP_ALIGNMENT)
@@ -4231,7 +4231,7 @@ static bool emit_builtin_call(jl_codectx_t &ctx, jl_cgval_t *ret, jl_value_t *f,
size_t al = layout->alignment;
if (al > JL_HEAP_ALIGNMENT)
al = JL_HEAP_ALIGNMENT;
bool needlock = isatomic && !isboxed && elsz > MAX_ATOMIC_SIZE;
bool needlock = layout->flags.arrayelem_islocked;
AtomicOrdering Order = (needlock || order <= jl_memory_order_notatomic)
? (isboxed ? AtomicOrdering::Unordered : AtomicOrdering::NotAtomic)
: get_llvm_atomic_order(order);
@@ -4317,7 +4317,8 @@ static bool emit_builtin_call(jl_codectx_t &ctx, jl_cgval_t *ret, jl_value_t *f,
*ret = jl_cgval_t(); // unreachable
return true;
}
bool isatomic = kind == (jl_value_t*)jl_atomic_sym;
const jl_datatype_layout_t *layout = ((jl_datatype_t*)mty_dt)->layout;
bool isatomic = layout->flags.arrayelem_isatomic || layout->flags.arrayelem_islocked;
if (!isatomic && order != jl_memory_order_notatomic && order != jl_memory_order_unspecified) {
emit_atomic_error(ctx, "memoryref_isassigned: non-atomic memory cannot be accessed atomically");
*ret = jl_cgval_t(); // unreachable
@@ -4333,13 +4334,12 @@ static bool emit_builtin_call(jl_codectx_t &ctx, jl_cgval_t *ret, jl_value_t *f,
}
jl_value_t *boundscheck = argv[3].constant;
emit_typecheck(ctx, argv[3], (jl_value_t*)jl_bool_type, fname);
const jl_datatype_layout_t *layout = ((jl_datatype_t*)mty_dt)->layout;
Value *mem = emit_memoryref_mem(ctx, ref, layout);
Value *mlen = emit_genericmemorylen(ctx, mem, ref.typ);
Value *oob = bounds_check_enabled(ctx, boundscheck) ? ctx.builder.CreateIsNull(mlen) : nullptr;
bool isboxed = layout->flags.arrayelem_isboxed;
if (isboxed || layout->first_ptr >= 0) {
bool needlock = isatomic && !isboxed && layout->size > MAX_ATOMIC_SIZE;
bool needlock = layout->flags.arrayelem_islocked;
AtomicOrdering Order = (needlock || order <= jl_memory_order_notatomic)
? (isboxed ? AtomicOrdering::Unordered : AtomicOrdering::NotAtomic)
: get_llvm_atomic_order(order);
@@ -4359,13 +4359,12 @@ static bool emit_builtin_call(jl_codectx_t &ctx, jl_cgval_t *ret, jl_value_t *f,
ctx.builder.SetInsertPoint(passBB);
}
Value *elem = emit_memoryref_ptr(ctx, ref, layout);
if (needlock) {
if (!isboxed)
elem = emit_ptrgep(ctx, elem, layout->first_ptr * sizeof(void*));
else if (needlock)
// n.b. no actual lock acquire needed, as the check itself only needs to load a single pointer and check for null
// elem += sizeof(lock);
elem = emit_ptrgep(ctx, elem, LLT_ALIGN(sizeof(jl_mutex_t), JL_SMALL_BYTE_ALIGNMENT));
}
if (!isboxed)
elem = emit_ptrgep(ctx, elem, layout->first_ptr * sizeof(void*));
// emit this using the same type as BUILTIN(memoryrefget)
// so that LLVM may be able to load-load forward them and fold the result
auto tbaa = isboxed ? ctx.tbaa().tbaa_ptrarraybuf : ctx.tbaa().tbaa_arraybuf;
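The recurring change in this file: instead of recomputing needlock from the element size at every call site, codegen now reads the two layout flags computed once in datatype.c. A sketch of the inferred mapping, as Julia comments (flag names from src/julia.h; the MAX_ATOMIC_SIZE threshold itself now lives only in the layout code):

# For the elements of an AtomicMemory (unboxed case):
#   elsz <= MAX_ATOMIC_SIZE  ->  arrayelem_isatomic = 1, arrayelem_islocked = 0
#                                (lock-free hardware atomics)
#   elsz >  MAX_ATOMIC_SIZE  ->  arrayelem_isatomic = 1, arrayelem_islocked = 1
#                                (each element guarded by a jl_mutex_t)
# For non-atomic memory both flags are 0. Codegen then derives:
#   isatomic = arrayelem_isatomic || arrayelem_islocked
#   needlock = arrayelem_islocked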
59 changes: 40 additions & 19 deletions src/datatype.c
@@ -239,8 +239,10 @@ static jl_datatype_layout_t *jl_get_layout(uint32_t sz,
flddesc->flags.haspadding = haspadding;
flddesc->flags.isbitsegal = isbitsegal;
flddesc->flags.fielddesc_type = fielddesc_type;
flddesc->flags.arrayelem_isboxed = arrayelem == 1;
flddesc->flags.arrayelem_isunion = arrayelem == 2;
flddesc->flags.arrayelem_isboxed = (arrayelem & 1) != 0;
flddesc->flags.arrayelem_isunion = (arrayelem & 2) != 0;
flddesc->flags.arrayelem_isatomic = (arrayelem & 4) != 0;
flddesc->flags.arrayelem_islocked = (arrayelem & 8) != 0;
flddesc->flags.padding = 0;
flddesc->npointers = npointers;
flddesc->first_ptr = first_ptr;
@@ -537,6 +539,7 @@ void jl_get_genericmemory_layout(jl_datatype_t *st)
uint32_t *pointers = &first_ptr;
int needlock = 0;

const jl_datatype_layout_t *el_layout = NULL;
if (isunboxed) {
elsz = LLT_ALIGN(elsz, al);
if (kind == (jl_value_t*)jl_atomic_sym) {
@@ -551,12 +554,12 @@ void jl_get_genericmemory_layout(jl_datatype_t *st)
else {
assert(jl_is_datatype(eltype));
zi = ((jl_datatype_t*)eltype)->zeroinit;
const jl_datatype_layout_t *layout = ((jl_datatype_t*)eltype)->layout;
if (layout->first_ptr >= 0) {
first_ptr = layout->first_ptr;
npointers = layout->npointers;
if (layout->flags.fielddesc_type == 2) {
pointers = (uint32_t*)jl_dt_layout_ptrs(layout);
el_layout = ((jl_datatype_t*)eltype)->layout;
if (el_layout->first_ptr >= 0) {
first_ptr = el_layout->first_ptr;
npointers = el_layout->npointers;
if (el_layout->flags.fielddesc_type == 2 && !needlock) {
pointers = (uint32_t*)jl_dt_layout_ptrs(el_layout);
}
else {
pointers = (uint32_t*)alloca(npointers * sizeof(uint32_t));
@@ -568,10 +571,22 @@ void jl_get_genericmemory_layout(jl_datatype_t *st)
}
if (needlock) {
assert(al <= JL_SMALL_BYTE_ALIGNMENT);
size_t offset = LLT_ALIGN(sizeof(jl_mutex_t), JL_SMALL_BYTE_ALIGNMENT);
elsz += offset;
size_t lock_offset = LLT_ALIGN(sizeof(jl_mutex_t), JL_SMALL_BYTE_ALIGNMENT);
elsz += lock_offset;
if (al < sizeof(void*)) {
al = sizeof(void*);
elsz = LLT_ALIGN(elsz, al);
}
haspadding = 1;
zi = 1;
// Adjust pointer offsets to account for the lock at the beginning
if (first_ptr != -1) {
uint32_t lock_offset_words = lock_offset / sizeof(void*);
first_ptr += lock_offset_words;
for (int j = 0; j < npointers; j++) {
pointers[j] += lock_offset_words;
}
}
}
}
else {
@@ -580,13 +595,17 @@
zi = 1;
}

int arrayelem;
// arrayelem is a bitfield: 1=isboxed, 2=isunion, 4=isatomic, 8=islocked
int arrayelem = 0;
if (!isunboxed)
arrayelem = 1;
else if (isunion)
arrayelem = 2;
else
arrayelem = 0;
arrayelem |= 1; // arrayelem_isboxed
if (isunion)
arrayelem |= 2; // arrayelem_isunion
if (kind == (jl_value_t*)jl_atomic_sym) {
arrayelem |= 4; // arrayelem_isatomic
if (needlock)
arrayelem |= 8; // arrayelem_islocked
}
assert(!st->layout);
st->layout = jl_get_layout(elsz, nfields, npointers, al, haspadding, isbitsegal, arrayelem, NULL, pointers);
st->zeroinit = zi;
@@ -647,17 +666,17 @@ void jl_compute_field_offsets(jl_datatype_t *st)
// if we have no fields, we can trivially skip the rest
if (st == jl_symbol_type || st == jl_string_type) {
// opaque layout - heap-allocated blob
static const jl_datatype_layout_t opaque_byte_layout = {0, 0, 1, -1, 1, { .haspadding = 0, .fielddesc_type=0, .isbitsegal=1, .arrayelem_isboxed=0, .arrayelem_isunion=0 }};
static const jl_datatype_layout_t opaque_byte_layout = {0, 0, 1, -1, 1, { .isbitsegal=1 }};
st->layout = &opaque_byte_layout;
return;
}
else if (st == jl_simplevector_type || st == jl_module_type) {
static const jl_datatype_layout_t opaque_ptr_layout = {0, 0, 1, -1, sizeof(void*), { .haspadding = 0, .fielddesc_type=0, .isbitsegal=1, .arrayelem_isboxed=0, .arrayelem_isunion=0 }};
static const jl_datatype_layout_t opaque_ptr_layout = {0, 0, 1, -1, sizeof(void*), { .isbitsegal=1 }};
st->layout = &opaque_ptr_layout;
return;
}
else {
static const jl_datatype_layout_t singleton_layout = {0, 0, 0, -1, 1, { .haspadding = 0, .fielddesc_type=0, .isbitsegal=1, .arrayelem_isboxed=0, .arrayelem_isunion=0 }};
static const jl_datatype_layout_t singleton_layout = {0, 0, 0, -1, 1, { .isbitsegal=1 }};
st->layout = &singleton_layout;
}
}
Expand Down Expand Up @@ -1001,6 +1020,8 @@ JL_DLLEXPORT jl_datatype_t * jl_new_foreign_type(jl_sym_t *name,
layout->flags.padding = 0;
layout->flags.arrayelem_isboxed = 0;
layout->flags.arrayelem_isunion = 0;
layout->flags.arrayelem_isatomic = 0;
layout->flags.arrayelem_islocked = 0;
jl_fielddescdyn_t * desc =
(jl_fielddescdyn_t *) ((char *)layout + sizeof(*layout));
desc->markfunc = markfunc;
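For reference, the arrayelem encoding handed to jl_get_layout, restated as a small Julia sketch (the constant names are illustrative, not an API; the values come from the bitfield comment in the diff above):

ARRAYELEM_ISBOXED  = 1 << 0   # 1
ARRAYELEM_ISUNION  = 1 << 1   # 2
ARRAYELEM_ISATOMIC = 1 << 2   # 4
ARRAYELEM_ISLOCKED = 1 << 3   # 8
# e.g. an AtomicMemory whose element exceeds MAX_ATOMIC_SIZE sets both bits:
arrayelem = ARRAYELEM_ISATOMIC | ARRAYELEM_ISLOCKED   # == 12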
15 changes: 6 additions & 9 deletions src/genericmemory.c
@@ -262,8 +262,8 @@ JL_DLLEXPORT void jl_genericmemory_copyto(jl_genericmemory_t *dest, char* destda

JL_DLLEXPORT jl_value_t *jl_genericmemoryref(jl_genericmemory_t *mem, size_t i)
{
int isatomic = (jl_tparam0(jl_typetagof(mem)) == (jl_value_t*)jl_atomic_sym);
const jl_datatype_layout_t *layout = ((jl_datatype_t*)jl_typetagof(mem))->layout;
int isatomic = layout->flags.arrayelem_isatomic || layout->flags.arrayelem_islocked;
jl_genericmemoryref_t m;
m.mem = mem;
m.ptr_or_offset = (layout->flags.arrayelem_isunion || layout->size == 0) ? (void*)i : (void*)((char*)mem->ptr + layout->size * i);
@@ -342,8 +342,8 @@ static jl_value_t *jl_ptrmemrefget(jl_genericmemoryref_t m JL_PROPAGATES_ROOT, i

JL_DLLEXPORT jl_value_t *jl_memoryrefget(jl_genericmemoryref_t m, int isatomic)
{
assert(isatomic == (jl_tparam0(jl_typetagof(m.mem)) == (jl_value_t*)jl_atomic_sym));
const jl_datatype_layout_t *layout = ((jl_datatype_t*)jl_typetagof(m.mem))->layout;
assert(isatomic == (layout->flags.arrayelem_isatomic || layout->flags.arrayelem_islocked));
if (layout->flags.arrayelem_isboxed)
return jl_ptrmemrefget(m, isatomic);
jl_value_t *eltype = jl_tparam1(jl_typetagof(m.mem));
@@ -365,7 +365,7 @@ JL_DLLEXPORT jl_value_t *jl_memoryrefget(jl_genericmemoryref_t m, int isatomic)
assert(data - (char*)m.mem->ptr < layout->size * m.mem->length);
jl_value_t *r;
size_t fsz = jl_datatype_size(eltype);
int needlock = isatomic && fsz > MAX_ATOMIC_SIZE;
int needlock = layout->flags.arrayelem_islocked;
if (isatomic && !needlock) {
r = jl_atomic_new_bits(eltype, data);
}
@@ -393,9 +393,6 @@ static int _jl_memoryref_isassigned(jl_genericmemoryref_t m, int isatomic)
if (layout->flags.arrayelem_isboxed) {
}
else if (layout->first_ptr >= 0) {
int needlock = isatomic && layout->size > MAX_ATOMIC_SIZE;
if (needlock)
elem = elem + LLT_ALIGN(sizeof(jl_mutex_t), JL_SMALL_BYTE_ALIGNMENT) / sizeof(jl_value_t*);
elem = &elem[layout->first_ptr];
}
else {
@@ -411,15 +408,15 @@ JL_DLLEXPORT jl_value_t *jl_memoryref_isassigned(jl_genericmemoryref_t m, int is

JL_DLLEXPORT void jl_memoryrefset(jl_genericmemoryref_t m JL_ROOTING_ARGUMENT, jl_value_t *rhs JL_ROOTED_ARGUMENT JL_MAYBE_UNROOTED, int isatomic)
{
assert(isatomic == (jl_tparam0(jl_typetagof(m.mem)) == (jl_value_t*)jl_atomic_sym));
const jl_datatype_layout_t *layout = ((jl_datatype_t*)jl_typetagof(m.mem))->layout;
assert(isatomic == (layout->flags.arrayelem_isatomic || layout->flags.arrayelem_islocked));
jl_value_t *eltype = jl_tparam1(jl_typetagof(m.mem));
if (eltype != (jl_value_t*)jl_any_type && !jl_typeis(rhs, eltype)) {
JL_GC_PUSH1(&rhs);
if (!jl_isa(rhs, eltype))
jl_type_error("memoryrefset!", eltype, rhs);
JL_GC_POP();
}
const jl_datatype_layout_t *layout = ((jl_datatype_t*)jl_typetagof(m.mem))->layout;
if (layout->flags.arrayelem_isboxed) {
assert((char*)m.ptr_or_offset - (char*)m.mem->ptr < sizeof(jl_value_t*) * m.mem->length);
if (isatomic)
@@ -449,7 +446,7 @@ JL_DLLEXPORT void jl_memoryrefset(jl_genericmemoryref_t m JL_ROOTING_ARGUMENT, j
}
if (layout->size != 0) {
assert(data - (char*)m.mem->ptr < layout->size * m.mem->length);
int needlock = isatomic && layout->size > MAX_ATOMIC_SIZE;
int needlock = layout->flags.arrayelem_islocked;
size_t fsz = jl_datatype_size((jl_datatype_t*)jl_typeof(rhs)); // need to shrink-wrap the final copy
if (isatomic && !needlock) {
jl_atomic_store_bits(data, rhs, fsz);
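A sketch of the per-element layout these offsets imply, as Julia comments (the picture follows the LLT_ALIGN expressions used in the diffs above):

# One element of a locked AtomicMemory, layout->size bytes in total:
#
#   | jl_mutex_t, padded to JL_SMALL_BYTE_ALIGNMENT | element payload |
#   ^ element start            ^ start + LLT_ALIGN(sizeof(jl_mutex_t), JL_SMALL_BYTE_ALIGNMENT)
#
# Since datatype.c now pre-shifts first_ptr and the pointers[] table by the
# lock size, consumers such as _jl_memoryref_isassigned no longer add the
# skip themselves, which is why that branch was deleted above.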
6 changes: 5 additions & 1 deletion src/julia.h
@@ -575,10 +575,12 @@ typedef struct {
// metadata bit only for GenericMemory eltype layout
uint16_t arrayelem_isboxed : 1;
uint16_t arrayelem_isunion : 1;
uint16_t arrayelem_isatomic : 1;
uint16_t arrayelem_islocked : 1;
// If set, this type's egality can be determined entirely by comparing
// the non-padding bits of this datatype.
uint16_t isbitsegal : 1;
uint16_t padding : 10;
uint16_t padding : 8;
} flags;
// union {
// jl_fielddesc8_t field8[nfields];
@@ -1668,6 +1670,8 @@ static inline int jl_field_isconst(jl_datatype_t *st, int i) JL_NOTSAFEPOINT
#define jl_is_addrspacecore(v) jl_typetagis(v,jl_addrspacecore_type)
#define jl_is_abioverride(v) jl_typetagis(v,jl_abioverride_type)
#define jl_genericmemory_isbitsunion(a) (((jl_datatype_t*)jl_typetagof(a))->layout->flags.arrayelem_isunion)
#define jl_genericmemory_isatomic(a) (((jl_datatype_t*)jl_typetagof(a))->layout->flags.arrayelem_isatomic)
#define jl_genericmemory_islocked(a) (((jl_datatype_t*)jl_typetagof(a))->layout->flags.arrayelem_islocked)
#define jl_is_array_any(v) jl_typetagis(v,jl_array_any_type)

JL_DLLEXPORT int jl_subtype(jl_value_t *a, jl_value_t *b);
5 changes: 5 additions & 0 deletions src/rtutils.c
@@ -1294,6 +1294,11 @@ static size_t jl_static_show_x_(JL_STREAM *out, jl_value_t *v, jl_datatype_t *vt
}
else {
char *ptr = ((char*)m->ptr) + j * layout->size;
if (layout->flags.arrayelem_islocked) {
// Skip the lock at the beginning for locked arrays
size_t lock_size = sizeof(jl_mutex_t);
ptr += lock_size;
}
n += jl_static_show_x_(out, (jl_value_t*)ptr,
(jl_datatype_t*)(typetagdata ? jl_nth_union_component(el_type, typetagdata[j]) : el_type),
depth, ctx);